diff --git a/.travis.yml b/.travis.yml
index 78fc7f9d1..141313905 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -33,9 +33,6 @@ matrix:
           sources: ubuntu-toolchain-r-test
           packages: ninja-build g++-5
       env: DXC_BUILD_TYPE=Release
-  allow_failures:
-    - os: linux
-    - os: osx
 
 cache:
   apt: true
@@ -53,7 +50,12 @@ addons:
     packages: ninja-build libstdc++-5-dev
 
 before_install:
-  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install ninja; fi
+  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
+      wget -q https://github.com/ninja-build/ninja/releases/download/v1.7.2/ninja-mac.zip;
+      unzip -q ninja-mac.zip;
+      chmod +x ninja;
+      export PATH="$PWD:$PATH";
+      fi
 
 before_script:
   - git submodule update --init
diff --git a/cmake/modules/FindD3D12.cmake b/cmake/modules/FindD3D12.cmake
index cfee95ca3..01e7a3ae9 100644
--- a/cmake/modules/FindD3D12.cmake
+++ b/cmake/modules/FindD3D12.cmake
@@ -1,13 +1,17 @@
 # Find the win10 SDK path.
-get_filename_component(WIN10_SDK_PATH "[HKEY_LOCAL_MACHINE\\SOFTWARE\\WOW6432Node\\Microsoft\\Microsoft SDKs\\Windows\\v10.0;InstallationFolder]" ABSOLUTE CACHE)
-get_filename_component(TEMP_WIN10_SDK_VERSION "[HKEY_LOCAL_MACHINE\\SOFTWARE\\WOW6432Node\\Microsoft\\Microsoft SDKs\\Windows\\v10.0;ProductVersion]" ABSOLUTE CACHE)
-
-get_filename_component(WIN10_SDK_VERSION ${TEMP_WIN10_SDK_VERSION} NAME)
+if ("$ENV{WIN10_SDK_PATH}$ENV{WIN10_SDK_VERSION}" STREQUAL "")  # neither override is set: query the registry
+  get_filename_component(WIN10_SDK_PATH "[HKEY_LOCAL_MACHINE\\SOFTWARE\\WOW6432Node\\Microsoft\\Microsoft SDKs\\Windows\\v10.0;InstallationFolder]" ABSOLUTE CACHE)
+  get_filename_component(TEMP_WIN10_SDK_VERSION "[HKEY_LOCAL_MACHINE\\SOFTWARE\\WOW6432Node\\Microsoft\\Microsoft SDKs\\Windows\\v10.0;ProductVersion]" ABSOLUTE CACHE)
+  get_filename_component(WIN10_SDK_VERSION "${TEMP_WIN10_SDK_VERSION}" NAME)  # quoted so an empty cache value is still one argument
+else()  # taken when either environment variable is non-empty
+  set (WIN10_SDK_PATH $ENV{WIN10_SDK_PATH})
+  set (WIN10_SDK_VERSION $ENV{WIN10_SDK_VERSION})
+endif()
 
 # WIN10_SDK_PATH will be something like C:\Program Files (x86)\Windows Kits\10
-
 # WIN10_SDK_VERSION will be something like 10.0.14393 or 10.0.14393.0; we need the
 # one that matches the directory name.
+
 if (IS_DIRECTORY "${WIN10_SDK_PATH}/Include/${WIN10_SDK_VERSION}.0")
   set(WIN10_SDK_VERSION "${WIN10_SDK_VERSION}.0")
 endif (IS_DIRECTORY "${WIN10_SDK_PATH}/Include/${WIN10_SDK_VERSION}.0")
diff --git a/cmake/modules/FindDiaSDK.cmake b/cmake/modules/FindDiaSDK.cmake
index 21cd75e5c..f11e5117f 100644
--- a/cmake/modules/FindDiaSDK.cmake
+++ b/cmake/modules/FindDiaSDK.cmake
@@ -4,14 +4,12 @@ get_filename_component(VS_PATH64 "[HKEY_LOCAL_MACHINE\\SOFTWARE\\WOW6432Node\\Mi
 # VS_PATH32 will be something like C:/Program Files (x86)/Microsoft Visual Studio 14.0/Common7/IDE
 
 # Also look for in vs15 install.
-# TODO: update this to be a non-hardcoded path. Registry keys were removed
-# in vs15 in favor of COM server dlls.
-# https://blogs.msdn.microsoft.com/heaths/2016/09/15/changes-to-visual-studio-15-setup/
-get_filename_component(VS15_C_PATH32 "C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/Common7/IDE" ABSOLUTE CACHE)
-get_filename_component(VS15_P_PATH32 "C:/Program Files (x86)/Microsoft Visual Studio/2017/Professional/Common7/IDE" ABSOLUTE CACHE)
-get_filename_component(VS15_E_PATH32 "C:/Program Files (x86)/Microsoft Visual Studio/2017/Enterprise/Common7/IDE" ABSOLUTE CACHE)
+set(PROGRAMFILES_X86 "ProgramFiles(x86)")
+get_filename_component(VS15_C_PATH32 "$ENV{${PROGRAMFILES_X86}}/Microsoft Visual Studio/2017/Community/Common7/IDE" ABSOLUTE CACHE)
+get_filename_component(VS15_P_PATH32 "$ENV{${PROGRAMFILES_X86}}/Microsoft Visual Studio/2017/Professional/Common7/IDE" ABSOLUTE CACHE)
+get_filename_component(VS15_E_PATH32 "$ENV{${PROGRAMFILES_X86}}/Microsoft Visual Studio/2017/Enterprise/Common7/IDE" ABSOLUTE CACHE)
 
-# Find the TAEF path, it will typically look something like this.
+# Find the DIA SDK path, it will typically look something like this.
 # C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\DIA SDK\include
 # C:\Program Files (x86)\Microsoft Visual Studio 14.0\DIA SDK\include\dia2.h
 find_path(DIASDK_INCLUDE_DIR    # Set variable DIASDK_INCLUDE_DIR
diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index d5b2e1255..17da9f823 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit d5b2e1255f706ce1f88812217e9a554f299848af
+Subproject commit 17da9f8231f78cf519b4958c2229463a63ead9e2
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index 6e85d1a6f..248debf55 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit 6e85d1a6fc75c4d37ccf7772fbb05e11b4fd69b0
+Subproject commit 248debf55ad6de4a80f6d4a128ef195b6ed05a30
diff --git a/include/dxc/DXIL/DxilUtil.h b/include/dxc/DXIL/DxilUtil.h
index 995a624dc..c17762d08 100644
--- a/include/dxc/DXIL/DxilUtil.h
+++ b/include/dxc/DXIL/DxilUtil.h
@@ -14,6 +14,7 @@
 #include <string>
 #include <memory>
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/IR/Constants.h"
 
 namespace llvm {
@@ -30,6 +31,7 @@ class BasicBlock;
 class raw_ostream;
 class ModulePass;
 class PassRegistry;
+class DebugLoc;
 
 ModulePass *createDxilLoadMetadataPass();
 void initializeDxilLoadMetadataPass(llvm::PassRegistry&);
@@ -52,12 +54,13 @@ namespace dxilutil {
   bool HasDynamicIndexing(llvm::Value *V);
 
   // Find alloca insertion point, given instruction
-  llvm::Instruction *FindAllocaInsertionPt(llvm::Instruction* I);
+  llvm::Instruction *FindAllocaInsertionPt(llvm::Instruction* I); // Considers entire parent function
+  llvm::Instruction *FindAllocaInsertionPt(llvm::BasicBlock* BB); // Only considers provided block
   llvm::Instruction *FindAllocaInsertionPt(llvm::Function* F);
   llvm::Instruction *SkipAllocas(llvm::Instruction *I);
   // Get first non-alloca insertion point, to avoid inserting non-allocas before alloca
-  llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Instruction* I);
-  llvm::Instruction *FirstNonAllocaInsertionPt(llvm::BasicBlock* BB);
+  llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Instruction* I); // Considers entire parent function
+  llvm::Instruction *FirstNonAllocaInsertionPt(llvm::BasicBlock* BB); // Only considers provided block
   llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Function* F);
 
   bool IsStaticGlobal(llvm::GlobalVariable *GV);
@@ -66,6 +69,8 @@ namespace dxilutil {
                              llvm::Function *PatchConstantFunc, bool IsLib);
   void EmitErrorOnInstruction(llvm::Instruction *I, llvm::StringRef Msg);
   void EmitResMappingError(llvm::Instruction *Res);
+  std::string FormatMessageAtLocation(const llvm::DebugLoc &DL, const llvm::Twine& Msg);
+  llvm::Twine FormatMessageWithoutLocation(const llvm::Twine& Msg);
   // Simple demangle just support case "\01?name@" pattern.
   llvm::StringRef DemangleFunctionName(llvm::StringRef name);
   // ReplaceFunctionName replaces the undecorated portion of originalName with undecorated newName
@@ -91,6 +96,7 @@ namespace dxilutil {
   llvm::Value *MergeSelectOnSameValue(llvm::Instruction *SelInst,
                                       unsigned startOpIdx,
                                       unsigned numOperands);
+  bool SimplifyTrivialPHIs(llvm::BasicBlock *BB);
   std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::StringRef BC,
     llvm::LLVMContext &Ctx, std::string &DiagStr);
   std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::MemoryBuffer *MB,
@@ -99,6 +105,7 @@ namespace dxilutil {
   // Returns true if type contains HLSL Object type (resource)
   bool ContainsHLSLObjectType(llvm::Type *Ty);
   bool IsHLSLObjectType(llvm::Type *Ty);
+  bool IsHLSLMatrixType(llvm::Type *Ty);
   bool IsSplat(llvm::ConstantDataVector *cdv);
 }
 
diff --git a/include/dxc/DxilPIXPasses/DxilPIXPasses.h b/include/dxc/DxilPIXPasses/DxilPIXPasses.h
index 2bbd5eadb..60198d1f2 100644
--- a/include/dxc/DxilPIXPasses/DxilPIXPasses.h
+++ b/include/dxc/DxilPIXPasses/DxilPIXPasses.h
@@ -16,6 +16,7 @@ class ModulePass;
 class PassRegistry;
 
 ModulePass *createDxilAddPixelHitInstrumentationPass();
+ModulePass *createDxilAnnotateWithVirtualRegisterPass();
 ModulePass *createDxilOutputColorBecomesConstantPass();
 ModulePass *createDxilRemoveDiscardsPass();
 ModulePass *createDxilReduceMSAAToSingleSamplePass();
@@ -24,6 +25,7 @@ ModulePass *createDxilDebugInstrumentationPass();
 ModulePass *createDxilShaderAccessTrackingPass();
 
 void initializeDxilAddPixelHitInstrumentationPass(llvm::PassRegistry&);
+void initializeDxilAnnotateWithVirtualRegisterPass(llvm::PassRegistry&);
 void initializeDxilOutputColorBecomesConstantPass(llvm::PassRegistry&);
 void initializeDxilRemoveDiscardsPass(llvm::PassRegistry&);
 void initializeDxilReduceMSAAToSingleSamplePass(llvm::PassRegistry&);
diff --git a/include/dxc/HLSL/DxilGenerationPass.h b/include/dxc/HLSL/DxilGenerationPass.h
index 65605cc62..22e480c09 100644
--- a/include/dxc/HLSL/DxilGenerationPass.h
+++ b/include/dxc/HLSL/DxilGenerationPass.h
@@ -66,6 +66,7 @@ ModulePass *createDxilPromoteStaticResources();
 ModulePass *createDxilLegalizeResources();
 ModulePass *createDxilLegalizeEvalOperationsPass();
 FunctionPass *createDxilLegalizeSampleOffsetPass();
+FunctionPass *createDxilSimpleGVNHoistPass();
 ModulePass *createFailUndefResourcePass();
 FunctionPass *createSimplifyInstPass();
 ModulePass *createDxilTranslateRawBuffer();
@@ -96,6 +97,7 @@ void initializeDxilPromoteStaticResourcesPass(llvm::PassRegistry&);
 void initializeDxilLegalizeResourcesPass(llvm::PassRegistry&);
 void initializeDxilLegalizeEvalOperationsPass(llvm::PassRegistry&);
 void initializeDxilLegalizeSampleOffsetPassPass(llvm::PassRegistry&);
+void initializeDxilSimpleGVNHoistPass(llvm::PassRegistry&);
 void initializeFailUndefResourcePass(llvm::PassRegistry&);
 void initializeSimplifyInstPass(llvm::PassRegistry&);
 void initializeDxilTranslateRawBufferPass(llvm::PassRegistry&);
diff --git a/include/dxc/HLSL/HLMatrixLowerHelper.h b/include/dxc/HLSL/HLMatrixLowerHelper.h
index 39a432319..0b7ad6257 100644
--- a/include/dxc/HLSL/HLMatrixLowerHelper.h
+++ b/include/dxc/HLSL/HLMatrixLowerHelper.h
@@ -27,21 +27,30 @@ class DxilTypeSystem;
 
 namespace HLMatrixLower {
 // TODO: use type annotation.
-bool IsMatrixType(llvm::Type *Ty);
 DxilFieldAnnotation *FindAnnotationFromMatUser(llvm::Value *Mat,
                                                DxilTypeSystem &typeSys);
 // Translate matrix type to vector type.
-llvm::Type *LowerMatrixType(llvm::Type *Ty);
+llvm::Type *LowerMatrixType(llvm::Type *Ty, bool forMem = false);
 // TODO: use type annotation.
 llvm::Type *GetMatrixInfo(llvm::Type *Ty, unsigned &col, unsigned &row);
 // TODO: use type annotation.
 bool IsMatrixArrayPointer(llvm::Type *Ty);
 // Translate matrix array pointer type to vector array pointer type.
-llvm::Type *LowerMatrixArrayPointer(llvm::Type *Ty);
+llvm::Type *LowerMatrixArrayPointer(llvm::Type *Ty, bool forMem = false);
 
 llvm::Value *BuildVector(llvm::Type *EltTy, unsigned size,
                          llvm::ArrayRef<llvm::Value *> elts,
                          llvm::IRBuilder<> &Builder);
+
+llvm::Value *VecMatrixMemToReg(llvm::Value *VecVal, llvm::Type *MatType,
+                               llvm::IRBuilder<> &Builder);
+llvm::Value *VecMatrixRegToMem(llvm::Value* VecVal, llvm::Type *MatType,
+                               llvm::IRBuilder<> &Builder);
+llvm::Instruction *CreateVecMatrixLoad(llvm::Value *VecPtr,
+                                       llvm::Type *MatType, llvm::IRBuilder<> &Builder);
+llvm::Instruction *CreateVecMatrixStore(llvm::Value* VecVal, llvm::Value *VecPtr,
+                                        llvm::Type *MatType, llvm::IRBuilder<> &Builder);
+
 // For case like mat[i][j].
 // IdxList is [i][0], [i][1], [i][2],[i][3].
 // Idx is j.
diff --git a/include/dxc/HLSL/HLModule.h b/include/dxc/HLSL/HLModule.h
index 10ff5a8ec..4d42bd96f 100644
--- a/include/dxc/HLSL/HLModule.h
+++ b/include/dxc/HLSL/HLModule.h
@@ -196,8 +196,9 @@ public:
                                           llvm::ArrayRef<llvm::Value *> paramList,
                                           llvm::Module &M);
 
-  static unsigned FindCastOp(bool fromUnsigned, bool toUnsigned,
-                             llvm::Type *SrcTy, llvm::Type *DstTy);
+  // Caller must handle conversions to bool and no-ops
+  static unsigned GetNumericCastOp(
+    llvm::Type *SrcTy, bool SrcIsUnsigned, llvm::Type *DstTy, bool DstIsUnsigned);
 
   // Precise attribute.
   // Note: Precise will be marked on alloca inst with metadata in code gen.
@@ -319,3 +320,4 @@ public:
 };
 
 } // namespace hlsl
+
diff --git a/include/dxc/HlslIntrinsicOp.h b/include/dxc/HlslIntrinsicOp.h
index 4706f2926..28a930087 100644
--- a/include/dxc/HlslIntrinsicOp.h
+++ b/include/dxc/HlslIntrinsicOp.h
@@ -265,12 +265,14 @@ import hctdb_instrhelp
   IOP_WaveActiveUSum,
   IOP_WavePrefixUProduct,
   IOP_WavePrefixUSum,
+  IOP_uabs,
   IOP_uclamp,
   IOP_ufirstbithigh,
   IOP_umad,
   IOP_umax,
   IOP_umin,
   IOP_umul,
+  IOP_usign,
   MOP_InterlockedUMax,
   MOP_InterlockedUMin,
   Num_Intrinsics,
@@ -293,12 +295,14 @@ import hctdb_instrhelp
   case IntrinsicOp::IOP_WaveActiveSum:
   case IntrinsicOp::IOP_WavePrefixProduct:
   case IntrinsicOp::IOP_WavePrefixSum:
+  case IntrinsicOp::IOP_abs:
   case IntrinsicOp::IOP_clamp:
   case IntrinsicOp::IOP_firstbithigh:
   case IntrinsicOp::IOP_mad:
   case IntrinsicOp::IOP_max:
   case IntrinsicOp::IOP_min:
   case IntrinsicOp::IOP_mul:
+  case IntrinsicOp::IOP_sign:
   case IntrinsicOp::MOP_InterlockedMax:
   case IntrinsicOp::MOP_InterlockedMin:
 // HLSL-HAS-UNSIGNED-INTRINSICS:END
@@ -332,6 +336,8 @@ import hctdb_instrhelp
     return static_cast<unsigned>(IntrinsicOp::IOP_WavePrefixUProduct);
   case IntrinsicOp::IOP_WavePrefixSum:
     return static_cast<unsigned>(IntrinsicOp::IOP_WavePrefixUSum);
+  case IntrinsicOp::IOP_abs:
+    return static_cast<unsigned>(IntrinsicOp::IOP_uabs);
   case IntrinsicOp::IOP_clamp:
     return static_cast<unsigned>(IntrinsicOp::IOP_uclamp);
   case IntrinsicOp::IOP_firstbithigh:
@@ -344,6 +350,8 @@ import hctdb_instrhelp
     return static_cast<unsigned>(IntrinsicOp::IOP_umin);
   case IntrinsicOp::IOP_mul:
     return static_cast<unsigned>(IntrinsicOp::IOP_umul);
+  case IntrinsicOp::IOP_sign:
+    return static_cast<unsigned>(IntrinsicOp::IOP_usign);
   case IntrinsicOp::MOP_InterlockedMax:
     return static_cast<unsigned>(IntrinsicOp::MOP_InterlockedUMax);
   case IntrinsicOp::MOP_InterlockedMin:
diff --git a/include/dxc/Support/HLSLOptions.h b/include/dxc/Support/HLSLOptions.h
index 5207e4ccd..47e1f74e8 100644
--- a/include/dxc/Support/HLSLOptions.h
+++ b/include/dxc/Support/HLSLOptions.h
@@ -162,6 +162,7 @@ public:
   bool LegacyResourceReservation = false; // OPT_flegacy_resource_reservation
   unsigned long AutoBindingSpace = UINT_MAX; // OPT_auto_binding_space
   bool ExportShadersOnly = false; // OPT_export_shaders_only
+  bool ResMayAlias = false; // OPT_res_may_alias
 
   bool IsRootSignatureProfile();
   bool IsLibraryProfile();
diff --git a/include/dxc/Support/HLSLOptions.td b/include/dxc/Support/HLSLOptions.td
index 27f805cfe..12b1e0743 100644
--- a/include/dxc/Support/HLSLOptions.td
+++ b/include/dxc/Support/HLSLOptions.td
@@ -365,11 +365,11 @@ def mergeUAVs : JoinedOrSeparate<["-", "/"], "mergeUAVs">, MetaVarName<"<file>">
   HelpText<"Merge UAV slots of template shader and current shader">;
 def matchUAVs : JoinedOrSeparate<["-", "/"], "matchUAVs">, MetaVarName<"<file>">, Group<hlslcomp_Group>,
   HelpText<"Match template shader UAV slots in current shader">;
-def res_may_alias : Flag<["-", "/"], "res_may_alias">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
-  HelpText<"Assume that UAVs/SRVs may alias">;
 def enable_unbounded_descriptor_tables : Flag<["-", "/"], "enable_unbounded_descriptor_tables">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
   HelpText<"Enables unbounded descriptor tables">;
 */
+def res_may_alias : Flag<["-", "/"], "res_may_alias">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
+  HelpText<"Assume that UAVs/SRVs may alias">;
 def all_resources_bound : Flag<["-", "/"], "all_resources_bound">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
   HelpText<"Enables agressive flattening">;
 
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index ee847ce9e..4406a4e90 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -259,6 +259,7 @@ void initializeMultiDimArrayToOneDimArrayPass(PassRegistry&);
 void initializeResourceToHandlePass(PassRegistry&);
 void initializeSROA_SSAUp_HLSLPass(PassRegistry&);
 void initializeHoistConstantArrayPass(PassRegistry&);
+void initializeDxilLoopUnrollPass(PassRegistry&);
 // HLSL Change Ends
 void initializeScalarEvolutionAliasAnalysisPass(PassRegistry&);
 void initializeScalarEvolutionPass(PassRegistry&);
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 9867f7753..fc044b57b 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -128,6 +128,7 @@ public:
   bool PrepareForLTO;
   bool HLSLHighLevel = false; // HLSL Change
   hlsl::HLSLExtensionsCodegenHelper *HLSLExtensionsCodeGen = nullptr; // HLSL Change
+  bool HLSLResMayAlias = false; // HLSL Change
 
 private:
   /// ExtensionList - This is list of all of the extensions that are registered.
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 5318706b8..d1ab85a0b 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -94,7 +94,8 @@ FunctionPass *createBitTrackingDCEPass();
 //
 // SROA - Replace aggregates or pieces of aggregates with scalar SSA values.
 //
-FunctionPass *createSROAPass(bool RequiresDomTree = true);
+FunctionPass *createSROAPass(bool RequiresDomTree = true,
+                             bool SkipHLSLMat = true);
 
 //===----------------------------------------------------------------------===//
 //
@@ -122,6 +123,9 @@ void initializeSROA_DT_HLSLPass(PassRegistry&);
 //
 ModulePass *createSROA_Parameter_HLSL();
 void initializeSROA_Parameter_HLSLPass(PassRegistry&);
+
+Pass *createDxilLoopUnrollPass(unsigned MaxIterationAttempt);
+void initializeDxilLoopUnrollPass(PassRegistry&);
 //===----------------------------------------------------------------------===//
 //
 // LowerStaticGlobalIntoAlloca. Replace static globals with alloca if only used
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index cba9bdbcc..d446c5b20 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -24,6 +24,9 @@ add_subdirectory(DxcSupport) # HLSL Change
 add_subdirectory(HLSL) # HLSL Change
 add_subdirectory(DXIL) # HLSL Change
 add_subdirectory(DxilContainer) # HLSL Change
+if(WIN32) # HLSL Change
+  add_subdirectory(DxilDia) # HLSL Change
+endif(WIN32) # HLSL Change
 add_subdirectory(DxilPIXPasses) # HLSL Change
 add_subdirectory(DxilRootSignature) # HLSL Change
 add_subdirectory(DxrFallback) # HLSL Change
diff --git a/lib/DXIL/DxilSubobject.cpp b/lib/DXIL/DxilSubobject.cpp
index a1aa62a0e..b0bec8b72 100644
--- a/lib/DXIL/DxilSubobject.cpp
+++ b/lib/DXIL/DxilSubobject.cpp
@@ -76,6 +76,9 @@ void DxilSubobject::CopyUnionedContents(const DxilSubobject &other) {
     HitGroup.ClosestHit = other.HitGroup.ClosestHit;
     HitGroup.Intersection = other.HitGroup.Intersection;
     break;
+  default:
+    DXASSERT(0, "invalid kind");
+    break;
   }
 }
 
diff --git a/lib/DXIL/DxilUtil.cpp b/lib/DXIL/DxilUtil.cpp
index 2a13f1d40..f36f4db5e 100644
--- a/lib/DXIL/DxilUtil.cpp
+++ b/lib/DXIL/DxilUtil.cpp
@@ -192,9 +192,8 @@ std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::MemoryBuffer *MB,
   llvm::LLVMContext &Ctx,
   std::string &DiagStr) {
   // Note: the DiagStr is not used.
-  ErrorOr<std::unique_ptr<llvm::Module>> pModule(
-    llvm::parseBitcodeFile(MB->getMemBufferRef(), Ctx));
-  if (std::error_code ec = pModule.getError()) {
+  auto pModule = llvm::parseBitcodeFile(MB->getMemBufferRef(), Ctx);
+  if (!pModule) {
     return nullptr;
   }
   return std::unique_ptr<llvm::Module>(pModule.get().release());
@@ -229,20 +228,29 @@ static bool EmitErrorOnInstructionFollowPhiSelect(
   return false;
 }
 
+std::string FormatMessageAtLocation(const DebugLoc &DL, const Twine& Msg) {
+  std::string Text; // Backing storage for the composed message.
+  raw_string_ostream Stream(Text);
+  DL.print(Stream);      // Printed form of the debug location comes first...
+  Stream << ": " << Msg; // ...followed by the separator and the message.
+  return Stream.str();   // Result is whatever was written to Stream, in order.
+}
+
+Twine FormatMessageWithoutLocation(const Twine& Msg) { // Appends the "/Zi" hint to Msg.
+  return Msg + " Use /Zi for source location."; // NOTE(review): the returned Twine presumably still refers to Msg; consume it before Msg goes out of scope - confirm.
+}
+
 void EmitErrorOnInstruction(Instruction *I, StringRef Msg) {
   const DebugLoc &DL = I->getDebugLoc();
   if (DL.get()) {
-    std::string locString;
-    raw_string_ostream os(locString);
-    DL.print(os);
-    I->getContext().emitError(os.str() + ": " + Twine(Msg));
+    I->getContext().emitError(FormatMessageAtLocation(DL, Msg));
     return;
   } else if (isa<PHINode>(I) || isa<SelectInst>(I)) {
     if (EmitErrorOnInstructionFollowPhiSelect(I, Msg))
       return;
   }
 
-  I->getContext().emitError(Twine(Msg) + " Use /Zi for source location.");
+  I->getContext().emitError(FormatMessageWithoutLocation(Msg));
 }
 
 const StringRef kResourceMapErrorMsg =
@@ -296,6 +304,28 @@ Value *MergeSelectOnSameValue(Instruction *SelInst, unsigned startOpIdx,
   return op0;
 }
 
+// Replaces every PHI in BB that has exactly one incoming value with that
+// value and erases the PHI. Returns true if at least one PHI was removed.
+bool SimplifyTrivialPHIs(BasicBlock *BB) {
+  SmallVector<PHINode *, 16> Trivial;
+  // PHI nodes are grouped at the top of a block, so stop at the first non-PHI.
+  for (Instruction &I : *BB) {
+    PHINode *PN = dyn_cast<PHINode>(&I);
+    if (!PN)
+      break;
+    if (PN->getNumIncomingValues() == 1)
+      Trivial.push_back(PN);
+  }
+
+  // Mutate only after the scan so the instruction list is stable while iterating.
+  for (PHINode *PN : Trivial) {
+    PN->replaceAllUsesWith(PN->getIncomingValue(0));
+    PN->eraseFromParent();
+  }
+
+  return !Trivial.empty();
+}
+
 Value *SelectOnOperation(llvm::Instruction *Inst, unsigned operandIdx) {
   Instruction *prototype = Inst;
   for (unsigned i = 0; i < prototype->getNumOperands(); i++) {
@@ -343,26 +373,27 @@ llvm::Instruction *SkipAllocas(llvm::Instruction *I) {
     I = I->getNextNode();
   return I;
 }
+llvm::Instruction *FindAllocaInsertionPt(llvm::BasicBlock* BB) {
+  return &*BB->getFirstInsertionPt();
+}
+llvm::Instruction *FindAllocaInsertionPt(llvm::Function* F) {
+  return FindAllocaInsertionPt(&F->getEntryBlock());
+}
 llvm::Instruction *FindAllocaInsertionPt(llvm::Instruction* I) {
   Function *F = I->getParent()->getParent();
   if (F)
-    return &*F->getEntryBlock().getFirstInsertionPt();
+    return FindAllocaInsertionPt(F);
   else // BB with no parent function
-    return &*I->getParent()->getFirstInsertionPt();
-}
-llvm::Instruction *FindAllocaInsertionPt(llvm::Function* F) {
-  return &*F->getEntryBlock().getFirstInsertionPt();
+    return FindAllocaInsertionPt(I->getParent());
 }
 llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Instruction* I) {
   return SkipAllocas(FindAllocaInsertionPt(I));
 }
 llvm::Instruction *FirstNonAllocaInsertionPt(llvm::BasicBlock* BB) {
-  return SkipAllocas(
-    &*BB->getFirstInsertionPt());
+  return SkipAllocas(FindAllocaInsertionPt(BB));
 }
 llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Function* F) {
-  return SkipAllocas(
-    &*F->getEntryBlock().getFirstInsertionPt());
+  return SkipAllocas(FindAllocaInsertionPt(F));
 }
 
 bool IsHLSLObjectType(llvm::Type *Ty) {
@@ -432,6 +463,20 @@ bool IsHLSLObjectType(llvm::Type *Ty) {
   return false;
 }
 
+// Returns true when Ty is a named struct whose name starts with "class.matrix"
+// and whose first field is an array of at most four vectors.
+bool IsHLSLMatrixType(Type *Ty) {
+  StructType *ST = dyn_cast<StructType>(Ty);
+  // Literal structs have no name; opaque/empty structs have no element 0.
+  if (!ST || !ST->hasName() || ST->getNumElements() == 0)
+    return false;
+  if (!ST->getName().startswith("class.matrix"))
+    return false;
+  Type *EltTy = ST->getElementType(0);
+  return EltTy->isArrayTy() && EltTy->getArrayElementType()->isVectorTy() &&
+         EltTy->getArrayNumElements() <= 4;
+}
+
 bool ContainsHLSLObjectType(llvm::Type *Ty) {
   // Unwrap pointer/array
   while (llvm::isa<llvm::PointerType>(Ty))
diff --git a/lib/DxcSupport/HLSLOptions.cpp b/lib/DxcSupport/HLSLOptions.cpp
index 31110a8af..63451c026 100644
--- a/lib/DxcSupport/HLSLOptions.cpp
+++ b/lib/DxcSupport/HLSLOptions.cpp
@@ -544,6 +544,7 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
   opts.LegacyMacroExpansion = Args.hasFlag(OPT_flegacy_macro_expansion, OPT_INVALID, false);
   opts.LegacyResourceReservation = Args.hasFlag(OPT_flegacy_resource_reservation, OPT_INVALID, false);
   opts.ExportShadersOnly = Args.hasFlag(OPT_export_shaders_only, OPT_INVALID, false);
+  opts.ResMayAlias = Args.hasFlag(OPT_res_may_alias, OPT_INVALID, false);
 
   if (opts.DefaultColMajor && opts.DefaultRowMajor) {
     errors << "Cannot specify /Zpr and /Zpc together, use /? to get usage information";
diff --git a/lib/DxilDia/CMakeLists.txt b/lib/DxilDia/CMakeLists.txt
new file mode 100644
index 000000000..085aab546
--- /dev/null
+++ b/lib/DxilDia/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Copyright (C) Microsoft Corporation. All rights reserved.
+# This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
+
+# The DIA SDK provides constants and declarations; it is looked up on Windows only.
+if(WIN32)
+  find_package(DiaSDK REQUIRED)
+endif()
+
+add_llvm_library(LLVMDxilDia
+  DxilDia.cpp
+  DxilDiaDataSource.cpp
+  DxilDiaEnumTables.cpp
+  DxilDiaSession.cpp
+  DxilDiaTable.cpp
+  DxilDiaTableFrameData.cpp
+  DxilDiaTableInjectedSources.cpp
+  DxilDiaTableInputAssemblyFile.cpp
+  DxilDiaTableLineNumbers.cpp
+  DxilDiaTableSections.cpp
+  DxilDiaTableSegmentMap.cpp
+  DxilDiaTableSourceFiles.cpp
+  DxilDiaTableSymbols.cpp
+
+  ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/IR
+)
+
+if(WIN32)
+  target_link_libraries(LLVMDxilDia PRIVATE ${LIBRARIES} ${DIASDK_LIBRARIES})
+  include_directories(AFTER ${LLVM_INCLUDE_DIR}/dxc/Tracing ${DIASDK_INCLUDE_DIRS})
+endif()
diff --git a/lib/DxilDia/DxilDia.cpp b/lib/DxilDia/DxilDia.cpp
new file mode 100644
index 000000000..1b9aab543
--- /dev/null
+++ b/lib/DxilDia/DxilDia.cpp
@@ -0,0 +1,33 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDia.cpp                                                               //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDia.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/Unicode.h"
+
+HRESULT dxil_dia::StringRefToBSTR(llvm::StringRef value, BSTR *pRetVal) { // UTF-8 text -> BSTR stored in *pRetVal
+  try {
+    wchar_t *wide = nullptr;
+    size_t wideSize = 0;
+    if (!Unicode::UTF8BufferToUTF16Buffer(value.data(), value.size(), &wide,
+                                          &wideSize))
+      return E_FAIL;
+    *pRetVal = SysAllocString(wide); // The BSTR is a separate allocation; the temporary buffer is released below.
+    const bool allocFailed = wide != nullptr && *pRetVal == nullptr; // null BSTR from non-null text => allocation failed
+    delete[] wide;
+    if (allocFailed) return E_OUTOFMEMORY; // previously reported as S_OK
+  }
+  CATCH_CPP_RETURN_HRESULT();
+  return S_OK;
+}
+
+HRESULT dxil_dia::ENotImpl() { return E_NOTIMPL; } // Always yields E_NOTIMPL.
\ No newline at end of file
diff --git a/lib/DxilDia/DxilDia.h b/lib/DxilDia/DxilDia.h
new file mode 100644
index 000000000..94f810b61
--- /dev/null
+++ b/lib/DxilDia/DxilDia.h
@@ -0,0 +1,30 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDia.h                                                                 //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+#include "dxc/Support/WinIncludes.h"
+
+#include "llvm/ADT/StringRef.h"
+
+namespace dxil_dia {
+// Single program, single compiland allows for some simplifications.
+static constexpr DWORD HlslProgramId = 1;
+static constexpr DWORD HlslCompilandId = 2;
+static constexpr DWORD HlslCompilandDetailsId = 3;
+static constexpr DWORD HlslCompilandEnvFlagsId = 4;
+static constexpr DWORD HlslCompilandEnvTargetId = 5;
+static constexpr DWORD HlslCompilandEnvEntryId = 6;
+static constexpr DWORD HlslCompilandEnvDefinesId = 7;
+static constexpr DWORD HlslCompilandEnvArgumentsId = 8;
+
+HRESULT ENotImpl();
+HRESULT StringRefToBSTR(llvm::StringRef value, BSTR *pRetVal);
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaDataSource.cpp b/lib/DxilDia/DxilDiaDataSource.cpp
new file mode 100644
index 000000000..be336b76c
--- /dev/null
+++ b/lib/DxilDia/DxilDiaDataSource.cpp
@@ -0,0 +1,145 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaDataSource.cpp                                                     //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaDataSource.h"
+
+#include "dxc/DxilContainer/DxilContainer.h"
+#include "dxc/DXIL/DxilUtil.h"
+#include "dxc/Support/FileIOHelper.h"
+#include "dxc/Support/dxcapi.impl.h"
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+
+#include "DxilDiaSession.h"
+
+dxil_dia::DataSource::DataSource(IMalloc *pMalloc) : m_pMalloc(pMalloc) { // Only records the allocator; nothing is loaded yet.
+}
+
+dxil_dia::DataSource::~DataSource() {
+  // These are cross-referenced, so let's be explicit about the release order.
+  m_finder.reset();  // Debug-info finder first,
+  m_module.reset();  // then the module,
+  m_context.reset(); // and the LLVM context last. Each reset only drops this object's share.
+}
+
+STDMETHODIMP dxil_dia::DataSource::get_lastError(BSTR *pRetVal) { // No error text is recorded by this class.
+  *pRetVal = nullptr; // Always reports an empty (null) message.
+  return S_OK;
+}
+
+namespace dxil_dia
+{
+std::unique_ptr<llvm::MemoryBuffer> getMemBufferFromBlob(_In_ IDxcBlob *pBlob,
+                                                         const llvm::Twine &BufferName) {
+  llvm::StringRef Data((LPSTR)pBlob->GetBufferPointer(), pBlob->GetBufferSize());
+  return llvm::MemoryBuffer::getMemBufferCopy(Data, BufferName); // Private copy of the blob's bytes.
+}
+
+// Reads the stream into a new buffer; failures are thrown through IFT, not returned.
+std::unique_ptr<llvm::MemoryBuffer> getMemBufferFromStream(_In_ IStream *pStream,
+                                                           const llvm::Twine &BufferName) {
+  CComPtr<IDxcBlob> pBlob; // Streams that also expose IDxcBlob are copied directly.
+  if (SUCCEEDED(pStream->QueryInterface(&pBlob)))
+    return getMemBufferFromBlob(pBlob, BufferName);
+  STATSTG statstg;
+  IFT(pStream->Stat(&statstg, STATFLAG_NONAME));
+  IFT(statstg.cbSize.HighPart == 0 ? S_OK : E_INVALIDARG); // Read() takes a 32-bit byte count.
+  const ULONG size = statstg.cbSize.LowPart;
+  auto result = llvm::MemoryBuffer::getNewUninitMemBuffer(size, BufferName);
+  IFT(result ? S_OK : E_OUTOFMEMORY); // A failed allocation yields a null buffer.
+  ULONG read = 0;
+  IFT(pStream->Read(const_cast<char *>(result->getBufferStart()), size, &read));
+  IFT(read == size ? S_OK : E_FAIL); // A short read would leave uninitialized bytes behind.
+  return result;
+}
+}  // namespace dxil_dia
+
+// Loads a module from LLVM bitcode or from a DXIL program part; E_FAIL once a module is loaded.
+STDMETHODIMP dxil_dia::DataSource::loadDataFromIStream(_In_ IStream *pIStream) {
+  DxcThreadMalloc TM(m_pMalloc);
+  if (m_module.get() != nullptr) {
+    return E_FAIL;
+  }
+  m_context.reset();
+  m_finder.reset();
+  try {
+    m_context = std::make_shared<llvm::LLVMContext>();
+    llvm::MemoryBuffer *pBitcodeBuffer;
+    std::unique_ptr<llvm::MemoryBuffer> pEmbeddedBuffer;
+    std::unique_ptr<llvm::MemoryBuffer> pBuffer =
+      getMemBufferFromStream(pIStream, "data");
+    size_t bufferSize = pBuffer->getBufferSize();
+
+    // The buffer can hold LLVM bitcode for a module, or the ILDB
+    // part from a container.
+    if (bufferSize < sizeof(UINT32)) {
+      return DXC_E_MALFORMED_CONTAINER;
+    }
+    const UINT32 BC_C0DE = ((INT32)(INT8)'B' | (INT32)(INT8)'C' << 8 | (INT32)0xDEC0 << 16); // bytes 'B','C',0xC0,0xDE in memory order
+    if (BC_C0DE == *(const UINT32*)pBuffer->getBufferStart()) {
+      pBitcodeBuffer = pBuffer.get();
+    } else {
+      if (bufferSize <= sizeof(hlsl::DxilProgramHeader)) {
+        return DXC_E_MALFORMED_CONTAINER;
+      }
+      hlsl::DxilProgramHeader *pDxilProgramHeader = (hlsl::DxilProgramHeader *)pBuffer->getBufferStart();
+      if (pDxilProgramHeader->BitcodeHeader.DxilMagic != hlsl::DxilMagicValue) {
+        return DXC_E_MALFORMED_CONTAINER;
+      }
+      UINT32 BlobSize;
+      const char *pBitcode = nullptr;
+      hlsl::GetDxilProgramBitcode(pDxilProgramHeader, &pBitcode, &BlobSize);
+      UINT32 offset = (UINT32)(pBitcode - (const char *)pDxilProgramHeader);
+      // The offset is derived from the input; past the end it would wrap "bufferSize - offset".
+      if (offset > bufferSize)
+        return DXC_E_MALFORMED_CONTAINER;
+      pEmbeddedBuffer = llvm::MemoryBuffer::getMemBuffer(
+        llvm::StringRef(pBitcode, bufferSize - offset), "data");
+      pBitcodeBuffer = pEmbeddedBuffer.get();
+    }
+    std::string DiagStr;
+    std::unique_ptr<llvm::Module> pModule = hlsl::dxilutil::LoadModuleFromBitcode(
+      pBitcodeBuffer, *m_context.get(), DiagStr);
+    if (!pModule.get())
+      return E_FAIL;
+    m_finder = std::make_shared<llvm::DebugInfoFinder>();
+    m_finder->processModule(*pModule.get());
+    m_module.reset(pModule.release());
+  }
+  CATCH_CPP_RETURN_HRESULT();
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::DataSource::openSession(_COM_Outptr_ IDiaSession **ppSession) {
+  DxcThreadMalloc TM(m_pMalloc);
+  *ppSession = nullptr; // Stays null on every failure path.
+  if (m_module.get() == nullptr) return E_FAIL; // Nothing has been loaded yet.
+  try { // Session::Init allocates and parses metadata, so it can throw.
+    CComPtr<Session> pSession = Session::Alloc(DxcGetThreadMallocNoRef());
+    IFROOM(pSession.p);
+    pSession->Init(m_context, m_module, m_finder); // The session shares the loaded state.
+    *ppSession = pSession.Detach();
+  } CATCH_CPP_RETURN_HRESULT();
+  return S_OK;
+}
+
+HRESULT CreateDxcDiaDataSource(_In_ REFIID riid, _Out_ LPVOID* ppv) {
+  // The smart pointer releases its reference on return, whatever QueryInterface reports.
+  CComPtr<dxil_dia::DataSource> pDataSource = CreateOnMalloc<dxil_dia::DataSource>(DxcGetThreadMallocNoRef());
+  if (pDataSource != nullptr)
+    return pDataSource.p->QueryInterface(riid, ppv);
+  // Allocation failed: clear the out parameter and report it.
+  *ppv = nullptr;
+  return E_OUTOFMEMORY;
+}
\ No newline at end of file
diff --git a/lib/DxilDia/DxilDiaDataSource.h b/lib/DxilDia/DxilDiaDataSource.h
new file mode 100644
index 000000000..7bd9af986
--- /dev/null
+++ b/lib/DxilDia/DxilDiaDataSource.h
@@ -0,0 +1,84 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaDataSource.h                                                       //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include <memory>
+
+#include "dia2.h"
+
+#include "dxc/DXIL/DxilModule.h"
+#include "dxc/Support/Global.h"
+
+#include "DxilDia.h"
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+class Session;
+
+class DataSource : public IDiaDataSource {
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+  std::shared_ptr<llvm::Module> m_module; // Null until loadDataFromIStream succeeds.
+  std::shared_ptr<llvm::LLVMContext> m_context; // Context the module is loaded into.
+  std::shared_ptr<llvm::DebugInfoFinder> m_finder; // Filled from the loaded module.
+
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+
+  STDMETHODIMP QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDiaDataSource>(this, iid, ppvObject);
+  }
+
+  DataSource(IMalloc *pMalloc);
+
+  ~DataSource();
+
+  STDMETHODIMP get_lastError(BSTR *pRetVal) override; // Always yields a null string.
+
+  STDMETHODIMP loadDataFromPdb(_In_ LPCOLESTR pdbPath) override { return ENotImpl(); }
+
+  STDMETHODIMP loadAndValidateDataFromPdb(
+    _In_ LPCOLESTR pdbPath,
+    _In_ GUID *pcsig70,
+    _In_ DWORD sig,
+    _In_ DWORD age) override { return ENotImpl(); }
+
+  STDMETHODIMP loadDataForExe(
+    _In_ LPCOLESTR executable,
+    _In_ LPCOLESTR searchPath,
+    _In_ IUnknown *pCallback) override { return ENotImpl(); }
+
+  STDMETHODIMP loadDataFromIStream(_In_ IStream *pIStream) override; // The only implemented way to load data.
+
+  STDMETHODIMP openSession(_COM_Outptr_ IDiaSession **ppSession) override; // Fails with E_FAIL until data is loaded.
+
+  HRESULT STDMETHODCALLTYPE loadDataFromCodeViewInfo(
+    _In_ LPCOLESTR executable,
+    _In_ LPCOLESTR searchPath,
+    _In_ DWORD cbCvInfo,
+    _In_ BYTE *pbCvInfo,
+    _In_ IUnknown *pCallback) override { return ENotImpl(); }
+
+  HRESULT STDMETHODCALLTYPE loadDataFromMiscInfo(
+    _In_ LPCOLESTR executable,
+    _In_ LPCOLESTR searchPath,
+    _In_ DWORD timeStampExe,
+    _In_ DWORD timeStampDbg,
+    _In_ DWORD sizeOfExe,
+    _In_ DWORD cbMiscInfo,
+    _In_ BYTE *pbMiscInfo,
+    _In_ IUnknown *pCallback) override { return ENotImpl(); }
+};
+
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaEnumTables.cpp b/lib/DxilDia/DxilDiaEnumTables.cpp
new file mode 100644
index 000000000..8e72e4059
--- /dev/null
+++ b/lib/DxilDia/DxilDiaEnumTables.cpp
@@ -0,0 +1,88 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaEnumTables.cpp                                                     //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaEnumTables.h"
+
+#include "DxilDia.h"
+#include "DxilDiaSession.h"
+#include "DxilDiaTable.h"
+
+STDMETHODIMP dxil_dia::EnumTables::get_Count(_Out_ LONG *pRetVal) {
+  *pRetVal = ((unsigned)Table::LastKind - (unsigned)Table::FirstKind) + 1; // Both ends of the kind range are inclusive.
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::EnumTables::Item(
+    /* [in] */ VARIANT index,
+    /* [retval][out] */ IDiaTable **table) {
+  // Avoid pulling in additional variant support (could have used VariantChangeType instead).
+  DWORD indexVal;
+  switch (index.vt) {
+  case VT_UI4:
+    indexVal = index.uintVal;
+    break;
+  case VT_I4:
+    IFR(IntToDWord(index.intVal, &indexVal)); // Fails for values that do not fit a DWORD.
+    break;
+  default:
+    return E_INVALIDARG;
+  }
+  if (indexVal > (unsigned)Table::LastKind)
+    return E_INVALIDARG;
+  if (!m_tables[indexVal]) { // Tables are created on first use and cached.
+    DxcThreadMalloc TM(m_pMalloc);
+    IFR(Table::Create(m_pSession, (Table::Kind)indexVal, &m_tables[indexVal]));
+    if (!m_tables[indexVal]) return E_FAIL; // Never AddRef a table that was not created.
+  }
+  // The caller gets its own reference; the cache keeps the original one.
+  m_tables[indexVal].p->AddRef();
+  *table = m_tables[indexVal];
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::EnumTables::Next(
+    ULONG celt,
+    IDiaTable **rgelt,
+    ULONG *pceltFetched) {
+  DxcThreadMalloc TM(m_pMalloc);
+  ULONG fetched = 0;
+  HRESULT hr = S_OK;
+  for (; fetched < celt && m_next <= (unsigned)Table::LastKind; ++m_next, ++fetched) {
+    if (!m_tables[m_next]) // Tables are created on first use and cached.
+      hr = Table::Create(m_pSession, (Table::Kind)m_next, &m_tables[m_next]);
+    if (FAILED(hr) || !m_tables[m_next]) {
+      // Undo this call: release what was already handed out and rewind the cursor.
+      while (fetched > 0) { rgelt[--fetched]->Release(); rgelt[fetched] = nullptr; --m_next; }
+      hr = FAILED(hr) ? hr : E_FAIL;
+      break;
+    }
+    m_tables[m_next].p->AddRef(); // One reference per returned slot, owned by the caller.
+    rgelt[fetched] = m_tables[m_next];
+  }
+  if (pceltFetched != nullptr)
+    *pceltFetched = fetched;
+  return FAILED(hr) ? hr : (fetched == celt) ? S_OK : S_FALSE;
+}
+
+STDMETHODIMP dxil_dia::EnumTables::Reset() {
+  m_next = 0; // Next() starts again from kind index 0; cached tables are kept.
+  return S_OK;
+}
+
+HRESULT dxil_dia::EnumTables::Create(
+    /* [in] */ dxil_dia::Session *pSession,
+    /* [out] */ IDiaEnumTables **ppEnumTables) {
+  IDiaEnumTables *pEnum = CreateOnMalloc<EnumTables>(pSession->GetMallocNoRef(), pSession);
+  *ppEnumTables = pEnum; // Null when the allocation failed.
+  if (pEnum == nullptr) return E_OUTOFMEMORY;
+  pEnum->AddRef(); // The constructor starts the count at 0; the caller gets the first reference.
+  return S_OK;
+}
diff --git a/lib/DxilDia/DxilDiaEnumTables.h b/lib/DxilDia/DxilDiaEnumTables.h
new file mode 100644
index 000000000..3758ceae4
--- /dev/null
+++ b/lib/DxilDia/DxilDiaEnumTables.h
@@ -0,0 +1,76 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaEnumTables.h                                                       //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include <array>
+
+#include "dia2.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDia.h"
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+
+class Session;
+
+class EnumTables : public IDiaEnumTables {
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+protected:
+  CComPtr<Session> m_pSession; // Strong reference to the session the tables are created from.
+  unsigned m_next; // Kind index of the next table Next() hands out; Reset() sets it to 0.
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+
+  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDiaEnumTables>(this, iid, ppvObject);
+  }
+
+  EnumTables(IMalloc *pMalloc, Session *pSession)
+      : m_pMalloc(pMalloc), m_pSession(pSession), m_dwRef(0), m_next(0) {
+    m_tables.fill(nullptr); // No table exists until it is first requested.
+  }
+
+  STDMETHODIMP get__NewEnum(
+    /* [retval][out] */ IUnknown **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_Count(_Out_ LONG *pRetVal) override;
+
+  STDMETHODIMP Item(
+    /* [in] */ VARIANT index,
+    /* [retval][out] */ IDiaTable **table) override;
+
+  STDMETHODIMP Next(
+    ULONG celt,
+    IDiaTable **rgelt,
+    ULONG *pceltFetched) override;
+
+  STDMETHODIMP Skip(
+    /* [in] */ ULONG celt) override { return ENotImpl(); }
+
+  STDMETHODIMP Reset(void) override;
+
+  STDMETHODIMP Clone(
+    /* [out] */ IDiaEnumTables **ppenum) override { return ENotImpl(); }
+
+  static HRESULT Create(Session *pSession,
+                        IDiaEnumTables **ppEnumTables); // Returns the enumerator with one reference.
+private:
+  std::array<CComPtr<IDiaTable>, (int)Table::LastKind+1> m_tables; // Cache with one slot per kind index.
+};
+
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaSession.cpp b/lib/DxilDia/DxilDiaSession.cpp
new file mode 100644
index 000000000..5dcf7cbbe
--- /dev/null
+++ b/lib/DxilDia/DxilDiaSession.cpp
@@ -0,0 +1,205 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaSession.cpp                                                        //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaSession.h"
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+
+#include "DxilDia.h"
+#include "DxilDiaEnumTables.h"
+#include "DxilDiaTable.h"
+#include "DxilDiaTableInjectedSources.h"
+#include "DxilDiaTableLineNumbers.h"
+
+void dxil_dia::Session::Init(
+    std::shared_ptr<llvm::LLVMContext> context,
+    std::shared_ptr<llvm::Module> module,
+    std::shared_ptr<llvm::DebugInfoFinder> finder) {
+  // Start without a cached table enumerator and share ownership of the
+  // module, its context and the debug-info finder.
+  m_pEnumTables = nullptr;
+  m_module = module;
+  m_context = context;
+  m_finder = finder;
+
+  // Extract HLSL metadata.
+  m_dxilModule = std::make_unique<hlsl::DxilModule>(module.get());
+  m_dxilModule->LoadDxilMetadata();
+
+  // Each source-level record is looked up under its DXIL metadata name
+  // first, and under the matching "llvm.dbg.*" name when that is absent.
+  const auto findNamedMD = [this](const llvm::Twine &dxilName,
+                                  const llvm::Twine &fallbackName) {
+    llvm::NamedMDNode *pNode = m_module->getNamedMetadata(dxilName);
+    return pNode ? pNode : m_module->getNamedMetadata(fallbackName);
+  };
+  // Get file contents.
+  m_contents = findNamedMD(
+    hlsl::DxilMDHelper::kDxilSourceContentsMDName, "llvm.dbg.contents");
+  // Defines.
+  m_defines = findNamedMD(
+    hlsl::DxilMDHelper::kDxilSourceDefinesMDName, "llvm.dbg.defines");
+  // Main file name.
+  m_mainFileName = findNamedMD(
+    hlsl::DxilMDHelper::kDxilSourceMainFileNameMDName, "llvm.dbg.mainFileName");
+  // Arguments.
+  m_arguments = findNamedMD(
+    hlsl::DxilMDHelper::kDxilSourceArgsMDName, "llvm.dbg.args");
+
+  // Build up a linear list of instructions. The index will be used as the
+  // RVA. Debug instructions are omitted from this enumeration.
+  for (const llvm::Function &fn : m_module->functions()) {
+    for (llvm::const_inst_iterator cur = inst_begin(fn), last = inst_end(fn); cur != last; ++cur) {
+      const llvm::Instruction *pInst = &*cur;
+      // A call whose callee name starts with "llvm.dbg." is a debug instruction.
+      const auto *pCall = llvm::dyn_cast<const llvm::CallInst>(pInst);
+      const llvm::Function *pCallee = pCall ? pCall->getCalledFunction() : nullptr;
+      if (pCallee != nullptr && pCallee->getName().startswith("llvm.dbg."))
+        continue;
+
+      // The RVA is the position the instruction is about to take in the list.
+      m_rvaMap.insert({ pInst, static_cast<RVA>(m_instructions.size()) });
+      m_instructions.push_back(pInst);
+      if (pInst->getDebugLoc())
+        m_instructionLines.push_back(pInst);
+    }
+  }
+
+  // Sanity check to make sure rva map is same as instruction index.
+  for (size_t i = 0, e = m_instructions.size(); i < e; ++i) {
+    DXASSERT(m_rvaMap.find(m_instructions[i]) != m_rvaMap.end(), "instruction not mapped to rva");
+    DXASSERT(m_rvaMap[m_instructions[i]] == i, "instruction mapped to wrong rva");
+  }
+}
+
+// Finds fileName among the contents entries; *pRetVal is its index, or 0 together with S_FALSE.
+HRESULT dxil_dia::Session::getSourceFileIdByName(
+    llvm::StringRef fileName, DWORD *pRetVal) {
+  if (llvm::NamedMDNode *pContents = Contents()) {
+    for (unsigned i = 0, e = pContents->getNumOperands(); i < e; ++i) {
+      llvm::MDNode *pEntry = pContents->getOperand(i);
+      if (pEntry->getNumOperands() == 0) continue; // Malformed entry without a name operand.
+      auto *pName = llvm::dyn_cast_or_null<llvm::MDString>(pEntry->getOperand(0).get()); // Null if not a string.
+      if (pName && pName->getString().equals(fileName)) {
+        *pRetVal = i;
+        return S_OK;
+      }
+    }
+  }
+  *pRetVal = 0;
+  return S_FALSE;
+}
+
+STDMETHODIMP dxil_dia::Session::get_loadAddress(
+    /* [retval][out] */ ULONGLONG *pRetVal) {
+  *pRetVal = 0; // The load address is always reported as 0.
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::Session::getEnumTables(
+    /* [out] */ _COM_Outptr_ IDiaEnumTables **ppEnumTables) {
+  *ppEnumTables = nullptr; // _COM_Outptr_: must be null whenever the call fails.
+  DxcThreadMalloc TM(m_pMalloc);
+  if (!m_pEnumTables) // Created on the first call, then shared by later calls.
+    IFR(EnumTables::Create(this, &m_pEnumTables));
+  m_pEnumTables.p->AddRef(); // This reference belongs to the caller.
+  *ppEnumTables = m_pEnumTables;
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::Session::findFileById(
+    /* [in] */ DWORD uniqueId,
+    /* [out] */ IDiaSourceFile **ppResult) {
+  if (!m_pEnumTables) // The enumerator only exists once getEnumTables() has run.
+    return E_INVALIDARG;
+  // Ask the enumerator for the source-files table, then for the requested row.
+  VARIANT tableIndex;
+  tableIndex.vt = VT_UI4;
+  tableIndex.uintVal = (int)Table::Kind::SourceFiles;
+  CComPtr<IDiaTable> pSourceFiles;
+  IFR(m_pEnumTables->Item(tableIndex, &pSourceFiles));
+  CComPtr<IUnknown> pRow;
+  IFR(pSourceFiles->Item(uniqueId, &pRow));
+  return pRow->QueryInterface(ppResult);
+}
+
+namespace dxil_dia {
+// Returns an enumerator over the instructions in [rva, rva + length) that
+// carry a debug location; E_INVALIDARG if the range leaves the instruction list.
+static HRESULT DxcDiaFindLineNumbersByRVA(
+  Session *pSession,
+  DWORD rva,
+  DWORD length,
+  IDiaEnumLineNumbers **ppResult)
+{
+  if (!ppResult)
+    return E_POINTER;
+  *ppResult = nullptr;
+  const std::vector<const llvm::Instruction*> &allInstructions = pSession->InstructionsRef();
+  // Validate the whole range up front, without computing "rva + length", so a
+  // DWORD wrap-around cannot turn an invalid range into an empty result.
+  if (length != 0 && (rva >= allInstructions.size() || length > allInstructions.size() - rva))
+    return E_INVALIDARG;
+  // Gather the list of instructions that map to the given rva range.
+  std::vector<const llvm::Instruction*> instructions;
+  for (DWORD i = 0; i < length; ++i) {
+    // Only include the instruction if it has debug info for line mappings.
+    const llvm::Instruction *inst = allInstructions[rva + i];
+    if (inst->getDebugLoc())
+      instructions.push_back(inst);
+  }
+  // Create line number table from explicit instruction list.
+  *ppResult = CreateOnMalloc<LineNumbersTable>(
+    pSession->GetMallocNoRef(), pSession, std::move(instructions));
+  if (*ppResult == nullptr)
+    return E_OUTOFMEMORY;
+  (*ppResult)->AddRef();
+  return S_OK;
+}
+}  // namespace dxil_dia
+
+STDMETHODIMP dxil_dia::Session::findLinesByAddr(
+  /* [in] */ DWORD seg, // Not used by this implementation.
+  /* [in] */ DWORD offset, // Passed on as the RVA.
+  /* [in] */ DWORD length,
+  /* [out] */ IDiaEnumLineNumbers **ppResult) {
+  DxcThreadMalloc TM(m_pMalloc);
+  return DxcDiaFindLineNumbersByRVA(this, offset, length, ppResult);
+}
+
+STDMETHODIMP dxil_dia::Session::findLinesByRVA(
+  /* [in] */ DWORD rva, // Index into the session's instruction list.
+  /* [in] */ DWORD length, // Number of consecutive instructions to consider.
+  /* [out] */ IDiaEnumLineNumbers **ppResult) {
+  DxcThreadMalloc TM(m_pMalloc);
+  return DxcDiaFindLineNumbersByRVA(this, rva, length, ppResult);
+}
+
+STDMETHODIMP dxil_dia::Session::findInjectedSource(
+  /* [in] */ LPCOLESTR srcFile,
+  /* [out] */ IDiaEnumInjectedSources **ppResult) {
+  *ppResult = nullptr; // Stays null unless a table is returned below.
+  if (Contents() != nullptr) {
+    CW2A pUtf8FileName(srcFile, CP_UTF8); // Without a code page CW2A converts to ANSI, not UTF-8.
+    DxcThreadMalloc TM(m_pMalloc);
+    IDiaTable *pTable;
+    IFR(Table::Create(this, Table::Kind::InjectedSource, &pTable)); // Return the failure; do not throw out of a COM method.
+    auto *pInjectedSource = reinterpret_cast<InjectedSourcesTable *>(pTable);
+    pInjectedSource->Init(pUtf8FileName.m_psz);
+    *ppResult = pInjectedSource;
+    return S_OK;
+  }
+  return S_FALSE; // No source contents are recorded in this module.
+}
diff --git a/lib/DxilDia/DxilDiaSession.h b/lib/DxilDia/DxilDiaSession.h
new file mode 100644
index 000000000..d90cefeb4
--- /dev/null
+++ b/lib/DxilDia/DxilDiaSession.h
@@ -0,0 +1,379 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaSession.h                                                          //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "dia2.h"
+
+#include "dxc/DXIL/DxilModule.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDia.h"
+
+namespace dxil_dia {
+
+class Session : public IDiaSession {
+public:
+  using RVA = unsigned; // Position of an instruction in the list built by Init().
+
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+  DXC_MICROCOM_TM_CTOR(Session)
+
+  IMalloc *GetMallocNoRef() { return m_pMalloc.p; } // No AddRef: the caller must not Release it.
+
+  void Init(std::shared_ptr<llvm::LLVMContext> context,
+            std::shared_ptr<llvm::Module> module,
+            std::shared_ptr<llvm::DebugInfoFinder> finder);
+
+  llvm::NamedMDNode *Contents() { return m_contents; } // May be null when the module has no such metadata.
+  llvm::NamedMDNode *Defines() { return m_defines; }
+  llvm::NamedMDNode *MainFileName() { return m_mainFileName; }
+  llvm::NamedMDNode *Arguments() { return m_arguments; }
+  hlsl::DxilModule &DxilModuleRef() { return *m_dxilModule.get(); }
+  llvm::Module &ModuleRef() { return *m_module.get(); }
+  llvm::DebugInfoFinder &InfoRef() { return *m_finder.get(); }
+  std::vector<const llvm::Instruction *> &InstructionsRef() { return m_instructions; }
+  std::vector<const llvm::Instruction *> &InstructionLinesRef() { return m_instructionLines; }
+  std::unordered_map<const llvm::Instruction *, RVA> &RvaMapRef() { return m_rvaMap; }
+
+  HRESULT getSourceFileIdByName(llvm::StringRef fileName, DWORD *pRetVal);
+
+  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDiaSession>(this, iid, ppvObject);
+  }
+
+  STDMETHODIMP get_loadAddress(
+    /* [retval][out] */ ULONGLONG *pRetVal) override;
+
+  STDMETHODIMP put_loadAddress(
+    /* [in] */ ULONGLONG NewVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_globalScope(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP getEnumTables(
+    _COM_Outptr_ IDiaEnumTables **ppEnumTables) override;
+
+  STDMETHODIMP getSymbolsByAddr(
+    /* [out] */ IDiaEnumSymbolsByAddr **ppEnumbyAddr) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildren(
+    /* [in] */ IDiaSymbol *parent,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildrenEx(
+    /* [in] */ IDiaSymbol *parent,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildrenExByAddr(
+    /* [in] */ IDiaSymbol *parent,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [in] */ DWORD isect,
+    /* [in] */ DWORD offset,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildrenExByVA(
+    /* [in] */ IDiaSymbol *parent,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [in] */ ULONGLONG va,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildrenExByRVA(
+    /* [in] */ IDiaSymbol *parent,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [in] */ DWORD rva,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolByAddr(
+    /* [in] */ DWORD isect,
+    /* [in] */ DWORD offset,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [out] */ IDiaSymbol **ppSymbol) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolByRVA(
+    /* [in] */ DWORD rva,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [out] */ IDiaSymbol **ppSymbol) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolByVA(
+    /* [in] */ ULONGLONG va,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [out] */ IDiaSymbol **ppSymbol) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolByToken(
+    /* [in] */ ULONG token,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [out] */ IDiaSymbol **ppSymbol) override { return ENotImpl(); }
+
+  STDMETHODIMP symsAreEquiv(
+    /* [in] */ IDiaSymbol *symbolA,
+    /* [in] */ IDiaSymbol *symbolB) override { return ENotImpl(); }
+
+  STDMETHODIMP symbolById(
+    /* [in] */ DWORD id,
+    /* [out] */ IDiaSymbol **ppSymbol) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolByRVAEx(
+    /* [in] */ DWORD rva,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [out] */ IDiaSymbol **ppSymbol,
+    /* [out] */ long *displacement) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolByVAEx(
+    /* [in] */ ULONGLONG va,
+  /* [in] */ enum SymTagEnum symtag,
+    /* [out] */ IDiaSymbol **ppSymbol,
+    /* [out] */ long *displacement) override { return ENotImpl(); }
+
+  STDMETHODIMP findFile(
+    /* [in] */ IDiaSymbol *pCompiland,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [out] */ IDiaEnumSourceFiles **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findFileById(
+    /* [in] */ DWORD uniqueId,
+    /* [out] */ IDiaSourceFile **ppResult) override;
+
+  STDMETHODIMP findLines(
+    /* [in] */ IDiaSymbol *compiland,
+    /* [in] */ IDiaSourceFile *file,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findLinesByAddr(
+    /* [in] */ DWORD seg,
+    /* [in] */ DWORD offset,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override;
+
+  STDMETHODIMP findLinesByRVA(
+    /* [in] */ DWORD rva,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override;
+
+  STDMETHODIMP findLinesByVA(
+    /* [in] */ ULONGLONG va,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findLinesByLinenum(
+    /* [in] */ IDiaSymbol *compiland,
+    /* [in] */ IDiaSourceFile *file,
+    /* [in] */ DWORD linenum,
+    /* [in] */ DWORD column,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInjectedSource(
+      /* [in] */ LPCOLESTR srcFile,
+      /* [out] */ IDiaEnumInjectedSources **ppResult) override;
+
+  STDMETHODIMP getEnumDebugStreams(
+    /* [out] */ IDiaEnumDebugStreams **ppEnumDebugStreams) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineFramesByAddr(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ DWORD isect,
+    /* [in] */ DWORD offset,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineFramesByRVA(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ DWORD rva,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineFramesByVA(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ ULONGLONG va,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLines(
+    /* [in] */ IDiaSymbol *parent,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLinesByAddr(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ DWORD isect,
+    /* [in] */ DWORD offset,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLinesByRVA(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ DWORD rva,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLinesByVA(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ ULONGLONG va,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLinesByLinenum(
+    /* [in] */ IDiaSymbol *compiland,
+    /* [in] */ IDiaSourceFile *file,
+    /* [in] */ DWORD linenum,
+    /* [in] */ DWORD column,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineesByName(
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD option,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findAcceleratorInlineeLinesByLinenum(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ IDiaSourceFile *file,
+    /* [in] */ DWORD linenum,
+    /* [in] */ DWORD column,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolsForAcceleratorPointerTag(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ DWORD tagValue,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolsByRVAForAcceleratorPointerTag(
+    /* [in] */ IDiaSymbol *parent,
+    /* [in] */ DWORD tagValue,
+    /* [in] */ DWORD rva,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findAcceleratorInlineesByName(
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD option,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP addressForVA(
+    /* [in] */ ULONGLONG va,
+    /* [out] */ DWORD *pISect,
+    /* [out] */ DWORD *pOffset) override { return ENotImpl(); }
+
+  STDMETHODIMP addressForRVA(
+    /* [in] */ DWORD rva,
+    /* [out] */ DWORD *pISect,
+    /* [out] */ DWORD *pOffset) override { return ENotImpl(); }
+
+  STDMETHODIMP findILOffsetsByAddr(
+    /* [in] */ DWORD isect,
+    /* [in] */ DWORD offset,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findILOffsetsByRVA(
+    /* [in] */ DWORD rva,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findILOffsetsByVA(
+    /* [in] */ ULONGLONG va,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInputAssemblyFiles(
+    /* [out] */ IDiaEnumInputAssemblyFiles **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInputAssembly(
+    /* [in] */ DWORD index,
+    /* [out] */ IDiaInputAssemblyFile **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInputAssemblyById(
+    /* [in] */ DWORD uniqueId,
+    /* [out] */ IDiaInputAssemblyFile **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP getFuncMDTokenMapSize(
+    /* [out] */ DWORD *pcb) override { return ENotImpl(); }
+
+  STDMETHODIMP getFuncMDTokenMap(
+    /* [in] */ DWORD cb,
+    /* [out] */ DWORD *pcb,
+    /* [size_is][out] */ BYTE *pb) override { return ENotImpl(); }
+
+  STDMETHODIMP getTypeMDTokenMapSize(
+    /* [out] */ DWORD *pcb) override { return ENotImpl(); }
+
+  STDMETHODIMP getTypeMDTokenMap(
+    /* [in] */ DWORD cb,
+    /* [out] */ DWORD *pcb,
+    /* [size_is][out] */ BYTE *pb) override { return ENotImpl(); }
+
+  STDMETHODIMP getNumberOfFunctionFragments_VA(
+    /* [in] */ ULONGLONG vaFunc,
+    /* [in] */ DWORD cbFunc,
+    /* [out] */ DWORD *pNumFragments) override { return ENotImpl(); }
+
+  STDMETHODIMP getNumberOfFunctionFragments_RVA(
+    /* [in] */ DWORD rvaFunc,
+    /* [in] */ DWORD cbFunc,
+    /* [out] */ DWORD *pNumFragments) override { return ENotImpl(); }
+
+  STDMETHODIMP getFunctionFragments_VA(
+    /* [in] */ ULONGLONG vaFunc,
+    /* [in] */ DWORD cbFunc,
+    /* [in] */ DWORD cFragments,
+    /* [size_is][out] */ ULONGLONG *pVaFragment,
+    /* [size_is][out] */ DWORD *pLenFragment) override { return ENotImpl(); }
+
+  STDMETHODIMP getFunctionFragments_RVA(
+    /* [in] */ DWORD rvaFunc,
+    /* [in] */ DWORD cbFunc,
+    /* [in] */ DWORD cFragments,
+    /* [size_is][out] */ DWORD *pRvaFragment,
+    /* [size_is][out] */ DWORD *pLenFragment) override { return ENotImpl(); }
+
+  STDMETHODIMP getExports(
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP getHeapAllocationSites(
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInputAssemblyFile(
+    /* [in] */ IDiaSymbol *pSymbol,
+    /* [out] */ IDiaInputAssemblyFile **ppResult) override { return ENotImpl(); }
+
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+  std::shared_ptr<llvm::LLVMContext> m_context;
+  std::shared_ptr<llvm::Module> m_module;
+  std::shared_ptr<llvm::DebugInfoFinder> m_finder;
+  std::unique_ptr<hlsl::DxilModule> m_dxilModule;
+  llvm::NamedMDNode *m_contents; // The four metadata nodes below are looked up once in Init().
+  llvm::NamedMDNode *m_defines;
+  llvm::NamedMDNode *m_mainFileName;
+  llvm::NamedMDNode *m_arguments;
+  std::vector<const llvm::Instruction *> m_instructions; // Non-debug instructions; the index is the RVA.
+  std::vector<const llvm::Instruction *> m_instructionLines; // Instructions with line info.
+  std::unordered_map<const llvm::Instruction *, RVA> m_rvaMap; // Map instruction to its RVA.
+
+private:
+  CComPtr<IDiaEnumTables> m_pEnumTables; // Created by the first getEnumTables() call. NOTE(review): the enumerator holds a CComPtr<Session> back to this object; looks like a reference cycle, confirm teardown.
+};
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaTable.cpp b/lib/DxilDia/DxilDiaTable.cpp
new file mode 100644
index 000000000..e704f99ae
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTable.cpp
@@ -0,0 +1,46 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTable.cpp                                                          //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTable.h"
+
+#include "DxilDiaSession.h"
+#include "DxilDiaTableFrameData.h"
+#include "DxilDiaTableInjectedSources.h"
+#include "DxilDiaTableInputAssemblyFile.h"
+#include "DxilDiaTableLineNumbers.h"
+#include "DxilDiaTableSections.h"
+#include "DxilDiaTableSegmentMap.h"
+#include "DxilDiaTableSourceFiles.h"
+#include "DxilDiaTableSymbols.h"
+
+// Creates the table for `kind` on the session's allocator; on success *ppTable
+// holds one reference. Unknown kinds give E_FAIL, a null object E_OUTOFMEMORY.
+HRESULT dxil_dia::Table::Create(
+    /* [in] */ Session *pSession,
+    /* [in] */ Table::Kind kind,
+    /* [out] */ IDiaTable **ppTable) {
+  *ppTable = nullptr;
+  IMalloc *pMalloc = pSession->GetMallocNoRef();
+  switch (kind) {
+  case Table::Kind::Symbols: *ppTable = CreateOnMalloc<SymbolsTable>(pMalloc, pSession); break;
+  case Table::Kind::SourceFiles: *ppTable = CreateOnMalloc<SourceFilesTable>(pMalloc, pSession); break;
+  case Table::Kind::LineNumbers: *ppTable = CreateOnMalloc<LineNumbersTable>(pMalloc, pSession); break;
+  case Table::Kind::Sections: *ppTable = CreateOnMalloc<SectionsTable>(pMalloc, pSession); break;
+  case Table::Kind::SegmentMap: *ppTable = CreateOnMalloc<SegmentMapTable>(pMalloc, pSession); break;
+  case Table::Kind::InjectedSource: *ppTable = CreateOnMalloc<InjectedSourcesTable>(pMalloc, pSession); break;
+  case Table::Kind::FrameData: *ppTable = CreateOnMalloc<FrameDataTable>(pMalloc, pSession); break;
+  case Table::Kind::InputAssemblyFile: *ppTable = CreateOnMalloc<InputAssemblyFilesTable>(pMalloc, pSession); break;
+  default: return E_FAIL;
+  }
+  if (*ppTable == nullptr) return E_OUTOFMEMORY;
+  (*ppTable)->AddRef();
+  return S_OK;
+}
diff --git a/lib/DxilDia/DxilDiaTable.h b/lib/DxilDia/DxilDiaTable.h
new file mode 100644
index 000000000..8a7e51399
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTable.h
@@ -0,0 +1,173 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTable.h                                                            //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include "dia2.h"
+
+#include "dxc/Support/microcom.h"
+
+#include "DxilDia.h"
+
+namespace dxil_dia {
+
+class Session;
+
+namespace Table {
+enum class Kind {  // Used as an index into impl::TableBase::TableNames; keep both in the same order.
+  Symbols,
+  SourceFiles,
+  LineNumbers,
+  Sections,
+  SegmentMap,
+  InjectedSource,
+  FrameData,
+  InputAssemblyFile
+};
+static constexpr Kind FirstKind = Kind::Symbols;  // First enumerator of Kind.
+static constexpr Kind LastKind = Kind::InputAssemblyFile;  // Last enumerator of Kind.
+
+HRESULT Create(  // Builds the table matching `kind` and returns it, AddRef'ed, through ppTable.
+    /* [in] */ Session *pSession,
+    /* [in] */ Kind kind,
+    /* [out] */ IDiaTable **ppTable);
+}  // namespace Table
+
+namespace impl {
+
+// Common IDiaTable + enumerator (T, yielding TItem) implementation. Derived
+// tables set m_count and implement GetItem(); m_next is the enumeration cursor.
+template<typename T, typename TItem>
+class TableBase : public IDiaTable, public T {
+protected:
+  // Indexed by Table::Kind (see get_name), so the order must match that enum.
+  static constexpr LPCWSTR TableNames[] = {
+    L"Symbols", L"SourceFiles", L"LineNumbers", L"Sections",
+    L"SegmentMap", L"InjectedSource", L"FrameData", L"InputAssemblyFiles"
+  };
+
+  DXC_MICROCOM_TM_REF_FIELDS()
+  CComPtr<Session> m_pSession;
+  unsigned m_next;
+  unsigned m_count;
+  Table::Kind m_kind;
+
+  // Rolls back a partially completed Next(): releases and clears the items
+  // already written to rgelt, rewinds the cursor and reports zero fetched.
+  template <typename TElem>
+  void UndoFetch(TElem **rgelt, ULONG fetched, ULONG *pceltFetched) {
+    m_next -= fetched;
+    while (fetched-- > 0) {
+      rgelt[fetched]->Release();
+      rgelt[fetched] = nullptr;
+    }
+    if (pceltFetched != nullptr)
+      *pceltFetched = 0;
+  }
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+
+  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDiaTable, T, IEnumUnknown>(this, iid, ppvObject);
+  }
+
+  TableBase(IMalloc *pMalloc, Session *pSession, Table::Kind kind)
+    : m_pMalloc(pMalloc), m_pSession(pSession), m_next(0), m_count(0), m_kind(kind) {}
+
+  // IEnumUnknown implementation.
+  STDMETHODIMP Next(
+    _In_  ULONG celt,
+    _Out_writes_to_(celt, *pceltFetched)  IUnknown **rgelt,
+    _Out_opt_  ULONG *pceltFetched) override {
+    DxcThreadMalloc TM(m_pMalloc);
+    ULONG fetched = 0;
+    while (fetched < celt && m_next < m_count) {
+      HRESULT hr = Item(m_next, &rgelt[fetched]);
+      if (FAILED(hr)) {
+        UndoFetch(rgelt, fetched, pceltFetched);  // Do not leak earlier items.
+        return hr;
+      }
+      ++m_next, ++fetched;
+    }
+    if (pceltFetched != nullptr)
+      *pceltFetched = fetched;
+    return (fetched == celt) ? S_OK : S_FALSE;
+  }
+
+  STDMETHODIMP Skip(ULONG celt) override {
+    if ((ULONGLONG)celt + m_next <= m_count) {  // 64-bit sum: a huge celt cannot wrap.
+      m_next += celt;
+      return S_OK;
+    }
+    return S_FALSE;
+  }
+
+  STDMETHODIMP Reset(void) override { m_next = 0; return S_OK; }
+
+  STDMETHODIMP Clone(IEnumUnknown **ppenum) override { return ENotImpl(); }
+
+  // IDiaTable implementation.
+  STDMETHODIMP get__NewEnum(IUnknown **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_name(BSTR *pRetVal) override {
+    *pRetVal = SysAllocString(TableNames[(unsigned)m_kind]);
+    return (*pRetVal) ? S_OK : E_OUTOFMEMORY;
+  }
+
+  STDMETHODIMP get_Count(_Out_ LONG *pRetVal) override {
+    *pRetVal = m_count;
+    return S_OK;
+  }
+
+  STDMETHODIMP Item(DWORD index, _COM_Outptr_ IUnknown **table) override {
+    if (index >= m_count)
+      return E_INVALIDARG;
+    return GetItem(index, (TItem **)table);
+  }
+
+  // T implementation (partial).
+  STDMETHODIMP Clone(_COM_Outptr_ T **ppenum) override {
+    *ppenum = nullptr;
+    return ENotImpl();
+  }
+  STDMETHODIMP Next(
+    /* [in] */ ULONG celt,
+    /* [out] */ TItem **rgelt,
+    /* [out] */ ULONG *pceltFetched) override {
+    DxcThreadMalloc TM(m_pMalloc);
+    ULONG fetched = 0;
+    while (fetched < celt && m_next < m_count) {
+      HRESULT hr = GetItem(m_next, &rgelt[fetched]);
+      if (FAILED(hr)) {
+        UndoFetch(rgelt, fetched, pceltFetched);  // Do not leak earlier items.
+        return hr;
+      }
+      ++m_next, ++fetched;
+    }
+    if (pceltFetched != nullptr)
+      *pceltFetched = fetched;
+    return (fetched == celt) ? S_OK : S_FALSE;
+  }
+  STDMETHODIMP Item(
+    /* [in] */ DWORD index,
+    /* [retval][out] */ TItem **ppItem) override {
+    DxcThreadMalloc TM(m_pMalloc);
+    if (index >= m_count)
+      return E_INVALIDARG;
+    return GetItem(index, ppItem);
+  }
+
+  virtual HRESULT GetItem(DWORD index, TItem **ppItem) = 0;
+};
+}  // namespace impl
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaTableFrameData.cpp b/lib/DxilDia/DxilDiaTableFrameData.cpp
new file mode 100644
index 000000000..bdb882a15
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableFrameData.cpp
@@ -0,0 +1,23 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableFrameData.cpp                                                 //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTableFrameData.h"
+
+#include "DxilDiaSession.h"
+
+dxil_dia::FrameDataTable::FrameDataTable(IMalloc *pMalloc, Session *pSession)
+  : impl::TableBase<IDiaEnumFrameData, IDiaFrameData>(pMalloc, pSession, Table::Kind::FrameData) {
+}  // m_count keeps the 0 assigned by TableBase, so this table has no rows.
+
+HRESULT dxil_dia::FrameDataTable::GetItem(DWORD index, IDiaFrameData **ppItem) {
+  *ppItem = nullptr;  // Clear the out-parameter before reporting failure.
+  return E_FAIL;  // No row is produced for any index.
+}
\ No newline at end of file
diff --git a/lib/DxilDia/DxilDiaTableFrameData.h b/lib/DxilDia/DxilDiaTableFrameData.h
new file mode 100644
index 000000000..65631201e
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableFrameData.h
@@ -0,0 +1,43 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableFrameData.h                                                   //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include "dia2.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDia.h"
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+class Session;
+
+class FrameDataTable : public impl::TableBase<IDiaEnumFrameData, IDiaFrameData> {
+public:
+  FrameDataTable(IMalloc *pMalloc, Session *pSession);
+
+  // HLSL inlines functions for a program, so no data to return: both lookups below return ENotImpl().
+  STDMETHODIMP frameByRVA(
+    /* [in] */ DWORD relativeVirtualAddress,
+    /* [retval][out] */ IDiaFrameData **frame) override { return ENotImpl(); }
+
+  STDMETHODIMP frameByVA(
+    /* [in] */ ULONGLONG virtualAddress,
+    /* [retval][out] */ IDiaFrameData **frame) override { return ENotImpl(); }
+
+  HRESULT GetItem(DWORD index, IDiaFrameData **ppItem) override;  // Row accessor required by TableBase.
+};
+
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaTableInjectedSources.cpp b/lib/DxilDia/DxilDiaTableInjectedSources.cpp
new file mode 100644
index 000000000..bfcd3d435
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableInjectedSources.cpp
@@ -0,0 +1,102 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableInjectedSources.cpp                                           //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTableInjectedSources.h"
+
+#include "DxilDia.h"
+#include "DxilDiaSession.h"
+#include "DxilDiaTable.h"
+
+llvm::MDTuple *dxil_dia::InjectedSource::NameContent() {  // Tuple at operand m_index of the session's contents node.
+  return llvm::cast<llvm::MDTuple>(m_pSession->Contents()->getOperand(m_index));
+}
+
+llvm::StringRef dxil_dia::InjectedSource::Name() {  // File name: operand 0 of the tuple.
+  return llvm::cast<llvm::MDString>(NameContent()->getOperand(0))->getString();  // cast<> asserts the type.
+}
+
+llvm::StringRef dxil_dia::InjectedSource::Content() {  // Source text: operand 1 of the tuple.
+  return llvm::cast<llvm::MDString>(NameContent()->getOperand(1))->getString();  // cast<> asserts the type.
+}
+
+STDMETHODIMP dxil_dia::InjectedSource::get_length(_Out_ ULONGLONG *pRetVal) {
+  *pRetVal = Content().size();  // Size of the string returned by Content().
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::InjectedSource::get_filename(BSTR *pRetVal) {
+  DxcThreadMalloc TM(m_pMalloc);
+  return StringRefToBSTR(Name(), pRetVal);  // Passes Name() to StringRefToBSTR, which fills *pRetVal.
+}
+
+STDMETHODIMP dxil_dia::InjectedSource::get_objectFilename(BSTR *pRetVal) {
+  *pRetVal = nullptr;  // No object file name is reported: a null BSTR together with S_OK.
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::InjectedSource::get_virtualFilename(BSTR *pRetVal) {
+  return get_filename(pRetVal);  // Reported as the same value as get_filename.
+}
+
+STDMETHODIMP dxil_dia::InjectedSource::get_source(
+  /* [in] */ DWORD cbData,
+  /* [out] */ DWORD *pcbData,
+  /* [size_is][out] */ BYTE *pbData) {
+  // Without a buffer this is a size query reporting the full length. With a
+  // buffer, as many bytes as fit in cbData are copied and that count is
+  // reported instead. pcbData is optional in both cases.
+  const llvm::StringRef text = Content();
+  DWORD cbResult = text.size();
+  if (pbData != nullptr) {
+    cbResult = std::min((DWORD)text.size(), cbData);
+    memcpy(pbData, text.begin(), cbResult);
+  }
+
+  if (pcbData != nullptr)
+    *pcbData = cbResult;
+  return S_OK;
+}
+
+dxil_dia::InjectedSourcesTable::InjectedSourcesTable(
+  IMalloc *pMalloc,
+  Session *pSession)
+  : impl::TableBase<IDiaEnumInjectedSources,
+                    IDiaInjectedSource>(pMalloc, pSession, Table::Kind::InjectedSource) {
+  // One row per operand of the session's contents metadata; the table is
+  // empty when the session has no contents node.
+  m_count =
+    (m_pSession->Contents() == nullptr) ? 0 : m_pSession->Contents()->getNumOperands();
+}
+
+HRESULT dxil_dia::InjectedSourcesTable::GetItem(DWORD index, IDiaInjectedSource **ppItem) {
+  if (index >= m_count)
+    return E_INVALIDARG;
+  // When the sizes agree (as after Init) m_indexList maps rows to operand indices.
+  const bool isFiltered = (m_indexList.size() == m_count);
+  const unsigned operandIndex = isFiltered ? m_indexList[index] : index;
+  *ppItem = CreateOnMalloc<InjectedSource>(m_pMalloc, m_pSession, operandIndex);
+  if (*ppItem == nullptr)
+    return E_OUTOFMEMORY;
+  (*ppItem)->AddRef();
+  return S_OK;
+}
+
+void dxil_dia::InjectedSourcesTable::Init(llvm::StringRef filename) {
+  // Keep only entries named `filename`; rebuilt from scratch, and empty without a contents node.
+  m_indexList.clear();
+  auto *pContents = m_pSession->Contents();
+  for (unsigned i = 0; pContents != nullptr && i < pContents->getNumOperands(); ++i) {
+    auto *pName = llvm::cast<llvm::MDString>(pContents->getOperand(i)->getOperand(0));
+    if (pName->getString().equals(filename))
+      m_indexList.emplace_back(i);
+  }
+  m_count = m_indexList.size();
+}
\ No newline at end of file
diff --git a/lib/DxilDia/DxilDiaTableInjectedSources.h b/lib/DxilDia/DxilDiaTableInjectedSources.h
new file mode 100644
index 000000000..091d53e08
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableInjectedSources.h
@@ -0,0 +1,82 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableInjectedSources.h                                             //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include <vector>
+
+#include "dia2.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Metadata.h"
+
+#include "DxilDia.h"
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+
+class InjectedSource : public IDiaInjectedSource {  // One (name, content) entry of the session's contents metadata.
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+  CComPtr<Session> m_pSession;  // Session whose Contents() node is read.
+  DWORD m_index;  // Operand index of this entry within Contents().
+
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDiaInjectedSource>(this, iid, ppvObject);
+  }
+
+  InjectedSource(IMalloc *pMalloc, Session *pSession, DWORD index)
+    : m_pMalloc(pMalloc), m_pSession(pSession), m_index(index) {}
+
+  llvm::MDTuple *NameContent();  // The tuple at operand m_index.
+  llvm::StringRef Name();  // Operand 0 of the tuple.
+  llvm::StringRef Content();  // Operand 1 of the tuple.
+
+  STDMETHODIMP get_crc(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_length(_Out_ ULONGLONG *pRetVal) override;  // Size of Content().
+
+  STDMETHODIMP get_filename(BSTR *pRetVal) override;  // Name() as a BSTR.
+
+  STDMETHODIMP get_objectFilename(BSTR *pRetVal) override;  // Always a null BSTR with S_OK.
+
+  STDMETHODIMP get_virtualFilename(BSTR *pRetVal) override;  // Forwards to get_filename.
+
+  STDMETHODIMP get_sourceCompression(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_source(  // Size query when pbData is null, otherwise a bounded copy of Content().
+    /* [in] */ DWORD cbData,
+    /* [out] */ DWORD *pcbData,
+    /* [size_is][out] */ BYTE *pbData) override;
+};
+
+class InjectedSourcesTable : public impl::TableBase<IDiaEnumInjectedSources,
+                                                    IDiaInjectedSource> {
+public:
+  InjectedSourcesTable(IMalloc *pMalloc, Session *pSession);  // Starts with one row per contents operand.
+
+  HRESULT GetItem(DWORD index, IDiaInjectedSource **ppItem) override;  // Creates the InjectedSource for a row.
+
+  void Init(llvm::StringRef filename);  // Narrows the table to entries whose name equals filename.
+
+private:
+  std::vector<unsigned> m_indexList;  // Filled by Init: row -> operand index of the matching entries.
+};
+
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaTableInputAssemblyFile.cpp b/lib/DxilDia/DxilDiaTableInputAssemblyFile.cpp
new file mode 100644
index 000000000..1323d760b
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableInputAssemblyFile.cpp
@@ -0,0 +1,24 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableInputAssemblyFile.cpp                                         //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTableInputAssemblyFile.h"
+
+#include "DxilDiaSession.h"
+
+dxil_dia::InputAssemblyFilesTable::InputAssemblyFilesTable(IMalloc *pMalloc, Session *pSession)
+  : impl::TableBase<IDiaEnumInputAssemblyFiles, IDiaInputAssemblyFile>(pMalloc, pSession, Table::Kind::InputAssemblyFile) {
+  // m_count keeps the 0 assigned by TableBase, so this table has no rows.
+}
+
+HRESULT dxil_dia::InputAssemblyFilesTable::GetItem(DWORD index, IDiaInputAssemblyFile **ppItem) {
+  *ppItem = nullptr;  // Clear the out-parameter before reporting failure.
+  return E_FAIL;  // No row is produced for any index.
+}
\ No newline at end of file
diff --git a/lib/DxilDia/DxilDiaTableInputAssemblyFile.h b/lib/DxilDia/DxilDiaTableInputAssemblyFile.h
new file mode 100644
index 000000000..0a3acbf1b
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableInputAssemblyFile.h
@@ -0,0 +1,32 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableInputAssemblyFile.h                                           //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include "dia2.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+class Session;
+
+class InputAssemblyFilesTable  // Table for Table::Kind::InputAssemblyFile.
+  : public impl::TableBase<IDiaEnumInputAssemblyFiles, IDiaInputAssemblyFile> {
+public:
+  InputAssemblyFilesTable(IMalloc *pMalloc, Session *pSession);
+  HRESULT GetItem(DWORD index, IDiaInputAssemblyFile **ppItem) override;  // Row accessor required by TableBase.
+};
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaTableLineNumbers.cpp b/lib/DxilDia/DxilDiaTableLineNumbers.cpp
new file mode 100644
index 000000000..807a30052
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableLineNumbers.cpp
@@ -0,0 +1,119 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableLineNumbers.cpp                                               //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTableLineNumbers.h"
+
+#include <utility>
+
+#include "llvm/IR/DebugInfoMetadata.h"
+
+#include "DxilDiaSession.h"
+
+dxil_dia::LineNumber::LineNumber(
+  /* [in] */ IMalloc *pMalloc,
+  /* [in] */ Session *pSession,
+  /* [in] */ const llvm::Instruction * inst)
+  : m_pMalloc(pMalloc),
+    m_pSession(pSession),
+    m_inst(inst) {  // Only the pointer is stored; every getter reads through it.
+}
+
+const llvm::DebugLoc &dxil_dia::LineNumber::DL() const {  // Debug location of m_inst.
+  DXASSERT(bool(m_inst->getDebugLoc()), "Trying to read line info from invalid debug location");
+  return m_inst->getDebugLoc();
+}
+
+STDMETHODIMP dxil_dia::LineNumber::get_sourceFile(
+  /* [retval][out] */ IDiaSourceFile **pRetVal) {
+  *pRetVal = nullptr;  // Defined result even when no file id can be determined.
+  DWORD id;
+  HRESULT hr = get_sourceFileId(&id);
+  if (hr != S_OK) return hr;  // Includes S_FALSE, i.e. "no file for this location".
+  return m_pSession->findFileById(id, pRetVal);
+}
+
+STDMETHODIMP dxil_dia::LineNumber::get_lineNumber(
+  /* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = DL().getLine();  // Line of the instruction's debug location.
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::LineNumber::get_lineNumberEnd(
+  /* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = DL().getLine();  // Same value as get_lineNumber: the range is a single line.
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::LineNumber::get_columnNumber(
+  /* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = DL().getCol();  // Column of the instruction's debug location.
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::LineNumber::get_columnNumberEnd(
+  /* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = DL().getCol();  // Same value as get_columnNumber: the range is a single column.
+  return S_OK;
+}
+
+STDMETHODIMP dxil_dia::LineNumber::get_relativeVirtualAddress(
+  /* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = m_pSession->RvaMapRef()[m_inst];  // Value stored for m_inst in the session's RVA map.
+  return S_OK;  // NOTE(review): if RvaMapRef() is a std map, operator[] inserts for an unknown key - confirm m_inst is always mapped.
+}
+
+STDMETHODIMP dxil_dia::LineNumber::get_sourceFileId(
+  /* [retval][out] */ DWORD *pRetVal) {
+  // The file is taken from the debug location's scope; only lexical blocks
+  // and subprograms are recognised, tried in that order.
+  llvm::MDNode *pScope = DL().getScope();
+  if (auto *pLexBlock = llvm::dyn_cast_or_null<llvm::DILexicalBlock>(pScope))
+    return m_pSession->getSourceFileIdByName(pLexBlock->getFile()->getFilename(), pRetVal);
+  if (auto *pFunction = llvm::dyn_cast_or_null<llvm::DISubprogram>(pScope))
+    return m_pSession->getSourceFileIdByName(pFunction->getFile()->getFilename(), pRetVal);
+
+  // Any other scope (or none) has no file: report id 0 with S_FALSE.
+  *pRetVal = 0;
+  return S_FALSE;
+}
+
+STDMETHODIMP dxil_dia::LineNumber::get_compilandId(
+  /* [retval][out] */ DWORD *pRetVal) {
+  // There is a single compiland for now, so every line reports the same id.
+  *pRetVal = HlslCompilandId;
+  return S_OK;
+}
+
+dxil_dia::LineNumbersTable::LineNumbersTable(IMalloc *pMalloc, Session *pSession)
+  : impl::TableBase<IDiaEnumLineNumbers, IDiaLineNumber>(pMalloc, pSession, Table::Kind::LineNumbers)
+  , m_instructions(pSession->InstructionLinesRef())  // Binds to the session's list (assumes a reference is returned - TODO confirm).
+{
+  m_count = m_instructions.size();  // One row per instruction in that list.
+}
+
+dxil_dia::LineNumbersTable::LineNumbersTable(IMalloc *pMalloc, Session *pSession, std::vector<const llvm::Instruction*> &&instructions)
+  : impl::TableBase<IDiaEnumLineNumbers, IDiaLineNumber>(pMalloc, pSession, Table::Kind::LineNumbers)
+  , m_instructions(m_instructionsStorage)  // Bound to this table's own storage; it is only read in the body below.
+  , m_instructionsStorage(std::move(instructions))  // Takes ownership of the caller's subset.
+{
+  m_count = m_instructions.size();  // One row per instruction in the subset.
+}
+
+
+HRESULT dxil_dia::LineNumbersTable::GetItem(DWORD index, IDiaLineNumber **ppItem) {
+  // Rows map one-to-one onto the instructions this table refers to.
+  if (!(index < m_instructions.size())) return E_INVALIDARG;
+  const llvm::Instruction *pInst = m_instructions[index];
+  *ppItem = CreateOnMalloc<LineNumber>(m_pMalloc, m_pSession, pInst);
+  if (*ppItem == nullptr) return E_OUTOFMEMORY;
+  (*ppItem)->AddRef();
+  return S_OK;
+}
\ No newline at end of file
diff --git a/lib/DxilDia/DxilDiaTableLineNumbers.h b/lib/DxilDia/DxilDiaTableLineNumbers.h
new file mode 100644
index 000000000..25b49fb9f
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableLineNumbers.h
@@ -0,0 +1,118 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableLineNumbers.h                                                 //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include <vector>
+
+#include "dia2.h"
+
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Instruction.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDia.h"
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+class Session;
+
+class LineNumber : public IDiaLineNumber {  // Line information for one instruction, read from its debug location.
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+  CComPtr<Session> m_pSession;  // Used for file lookups and the RVA map.
+  const llvm::Instruction *m_inst;  // Instruction whose debug location is reported.
+
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+  STDMETHODIMP QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDiaLineNumber>(this, iid, ppvObject);
+  }
+
+  LineNumber(
+    /* [in] */ IMalloc *pMalloc,
+    /* [in] */ Session *pSession,
+    /* [in] */ const llvm::Instruction * inst);
+
+  const llvm::DebugLoc &DL() const;  // Debug location of m_inst; asserts that it is valid.
+
+  STDMETHODIMP get_compiland(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_sourceFile(  // Resolves get_sourceFileId through the session.
+    /* [retval][out] */ IDiaSourceFile **pRetVal) override;
+
+  STDMETHODIMP get_lineNumber(
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_lineNumberEnd(  // Reports the same line as get_lineNumber.
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_columnNumber(
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_columnNumberEnd(  // Reports the same column as get_columnNumber.
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_addressSection(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_addressOffset(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_relativeVirtualAddress(  // Looked up in the session's instruction-to-RVA map.
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_virtualAddress(
+    /* [retval][out] */ ULONGLONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_length(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_sourceFileId(  // S_FALSE with id 0 when the scope is neither a lexical block nor a subprogram.
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_statement(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_compilandId(  // Always HlslCompilandId.
+    /* [retval][out] */ DWORD *pRetVal) override;
+};
+
+class LineNumbersTable : public impl::TableBase<IDiaEnumLineNumbers, IDiaLineNumber> {
+public:
+  LineNumbersTable(  // Table over the session's InstructionLinesRef() list.
+    /* [in] */ IMalloc *pMalloc,
+    /* [in] */ Session *pSession);
+
+  LineNumbersTable(  // Table over a caller-supplied subset, which is moved into the table.
+    /* [in] */ IMalloc *pMalloc,
+    /* [in] */ Session *pSession,
+    /* [in] */ std::vector<const llvm::Instruction*> &&instructions);
+
+  HRESULT GetItem(  // Creates the LineNumber for row `index`.
+    /* [in] */ DWORD index, 
+    /* [out] */ IDiaLineNumber **ppItem) override;
+
+private:
+  // Keep a reference to the instructions that contain the line numbers.
+  const std::vector<const llvm::Instruction *> &m_instructions;
+
+  // Provide storage space for instructions for when the table contains
+  // a subset of all instructions. Declared after m_instructions, so it is constructed after it.
+  std::vector<const llvm::Instruction *> m_instructionsStorage;
+};
+
+}  // namespace dxil_dia
\ No newline at end of file
diff --git a/lib/DxilDia/DxilDiaTableSections.cpp b/lib/DxilDia/DxilDiaTableSections.cpp
new file mode 100644
index 000000000..66e2c8fda
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableSections.cpp
@@ -0,0 +1,23 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableSections.cpp                                                  //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTableSections.h"
+
+#include "DxilDiaSession.h"
+
+dxil_dia::SectionsTable::SectionsTable(IMalloc *pMalloc, Session *pSession)
+  : impl::TableBase<IDiaEnumSectionContribs, IDiaSectionContrib>(pMalloc, pSession, Table::Kind::Sections) {
+}  // m_count keeps the 0 assigned by TableBase, so this table has no rows.
+
+HRESULT dxil_dia::SectionsTable::GetItem(DWORD index, IDiaSectionContrib **ppItem) {
+  *ppItem = nullptr;  // Clear the out-parameter before reporting failure.
+  return E_FAIL;  // No row is produced for any index.
+}
diff --git a/lib/DxilDia/DxilDiaTableSections.h b/lib/DxilDia/DxilDiaTableSections.h
new file mode 100644
index 000000000..169ddd041
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableSections.h
@@ -0,0 +1,33 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableSections.h                                                    //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include "dia2.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDia.h"
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+class Session;
+
+class SectionsTable : public impl::TableBase<IDiaEnumSectionContribs, IDiaSectionContrib> {
+public:
+  SectionsTable(IMalloc *pMalloc, Session *pSession);  // Table for Table::Kind::Sections.
+  HRESULT GetItem(DWORD index, IDiaSectionContrib **ppItem) override;  // Row accessor required by TableBase.
+};
+
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaTableSegmentMap.cpp b/lib/DxilDia/DxilDiaTableSegmentMap.cpp
new file mode 100644
index 000000000..3c447dbc1
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableSegmentMap.cpp
@@ -0,0 +1,23 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableSegmentMap.cpp                                                //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTableSegmentMap.h"
+
+#include "DxilDiaSession.h"
+
+// Forwards the allocator and session to the table base, passing Table::Kind::SegmentMap as the kind.
+dxil_dia::SegmentMapTable::SegmentMapTable(IMalloc *pMalloc, Session *pSession)
+  : impl::TableBase<IDiaEnumSegments, IDiaSegment>(pMalloc, pSession, Table::Kind::SegmentMap) {}
+
+HRESULT dxil_dia::SegmentMapTable::GetItem(DWORD /*index*/, IDiaSegment **ppItem) {
+  *ppItem = nullptr;  // Clear the out-param so the caller never reads a stale pointer.
+  return E_FAIL;      // Stub: no index resolves to an item, so every lookup fails.
+}
\ No newline at end of file
diff --git a/lib/DxilDia/DxilDiaTableSegmentMap.h b/lib/DxilDia/DxilDiaTableSegmentMap.h
new file mode 100644
index 000000000..18101fdb9
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableSegmentMap.h
@@ -0,0 +1,33 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableSegmentMap.h                                                  //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include "dia2.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+class Session;
+
+class SegmentMapTable : public impl::TableBase<IDiaEnumSegments, IDiaSegment> {  // Table whose items are IDiaSegment.
+public:
+  SegmentMapTable(IMalloc *pMalloc, Session *pSession);  // Defined out of line.
+
+  HRESULT GetItem(DWORD index, IDiaSegment **ppItem) override;  // Per-index item lookup required by the base.
+};
+
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaTableSourceFiles.cpp b/lib/DxilDia/DxilDiaTableSourceFiles.cpp
new file mode 100644
index 000000000..8181e9967
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableSourceFiles.cpp
@@ -0,0 +1,57 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableSourceFiles.cpp                                               //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTableSourceFiles.h"
+
+#include "DxilDiaSession.h"
+
+dxil_dia::SourceFile::SourceFile(IMalloc *pMalloc, Session *pSession, DWORD index)  // index: operand slot later read by NameContent().
+  : m_pMalloc(pMalloc), m_pSession(pSession), m_index(index) {}  // Only stores its arguments; no other work.
+
+llvm::MDTuple *dxil_dia::SourceFile::NameContent() const {  // Metadata tuple belonging to this source file.
+  return llvm::cast<llvm::MDTuple>(m_pSession->Contents()->getOperand(m_index));  // Operand m_index of the session's Contents().
+}
+llvm::StringRef dxil_dia::SourceFile::Name() const {  // The name is the string in operand 0 of NameContent().
+  return llvm::cast<llvm::MDString>(NameContent()->getOperand(0))->getString();  // cast<>, not dyn_cast<>: the result is used unchecked.
+}
+
+// IDiaSourceFile::get_uniqueId: the id reported is the index this object was constructed with.
+STDMETHODIMP dxil_dia::SourceFile::get_uniqueId(/* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = m_index;
+  return S_OK;
+}
+
+// IDiaSourceFile::get_fileName: converts Name() to a BSTR while a DxcThreadMalloc scope on m_pMalloc is active.
+STDMETHODIMP dxil_dia::SourceFile::get_fileName(/* [retval][out] */ BSTR *pRetVal) {
+  DxcThreadMalloc TM(m_pMalloc);
+  return StringRefToBSTR(Name(), pRetVal);
+}
+
+// Sizes the item cache: one empty slot per operand of Contents(), none when Contents() is null.
+dxil_dia::SourceFilesTable::SourceFilesTable(IMalloc *pMalloc, Session *pSession)
+  : impl::TableBase<IDiaEnumSourceFiles, IDiaSourceFile>(pMalloc, pSession, Table::Kind::SourceFiles) {
+  m_count = 0;
+  if (m_pSession->Contents() != nullptr)
+    m_count = m_pSession->Contents()->getNumOperands();
+  m_items.assign(m_count, nullptr);
+}
+
+  HRESULT dxil_dia::SourceFilesTable::GetItem(DWORD index, IDiaSourceFile **ppItem) {
+    if (index >= m_items.size()) return E_INVALIDARG;  // Never index past the cache sized in the constructor.
+    if (!m_items[index]) {  // First request for this index: create the item and cache it.
+      m_items[index] = CreateOnMalloc<SourceFile>(m_pMalloc, m_pSession, index);
+      if (!m_items[index]) return E_OUTOFMEMORY;
+    }
+    // The cache slot (a CComPtr) holds its own reference, so hand the caller exactly one more.
+    // The extra AddRef that used to precede this leaked one reference on every call.
+    *ppItem = m_items[index];
+    (*ppItem)->AddRef();
+    return S_OK;
+  }
diff --git a/lib/DxilDia/DxilDiaTableSourceFiles.h b/lib/DxilDia/DxilDiaTableSourceFiles.h
new file mode 100644
index 000000000..c2d4fd944
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableSourceFiles.h
@@ -0,0 +1,79 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableSourceFiles.h                                                 //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include <vector>
+
+#include "dia2.h"
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Metadata.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDia.h"
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+
+class Session;
+
+class SourceFile : public IDiaSourceFile {  // IDiaSourceFile backed by one operand of the session's Contents().
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+  CComPtr<Session> m_pSession;  // Session whose Contents() is read by NameContent().
+  DWORD m_index;  // Operand index into Contents(); also the value returned by get_uniqueId.
+
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDiaSourceFile>(this, iid, ppvObject);  // Delegates the lookup, naming IDiaSourceFile.
+  }
+
+  SourceFile(IMalloc *pMalloc, Session *pSession, DWORD index);  // Stores all three arguments.
+
+  llvm::MDTuple *NameContent() const;  // Operand m_index of the session's Contents(), as an MDTuple.
+
+  llvm::StringRef Name() const;  // String held in operand 0 of NameContent().
+
+  STDMETHODIMP get_uniqueId(
+    /* [retval][out] */ DWORD *pRetVal) override;  // Writes m_index.
+
+  STDMETHODIMP get_fileName(
+    /* [retval][out] */ BSTR *pRetVal) override;  // Writes Name() converted to a BSTR.
+
+  STDMETHODIMP get_checksumType(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }  // Unsupported: forwards to ENotImpl().
+
+  STDMETHODIMP get_compilands(
+    /* [retval][out] */ IDiaEnumSymbols **pRetVal) override { return ENotImpl(); }  // Unsupported: forwards to ENotImpl().
+
+  STDMETHODIMP get_checksum(
+    /* [in] */ DWORD cbData,
+    /* [out] */ DWORD *pcbData,
+    /* [size_is][out] */ BYTE *pbData) override { return ENotImpl(); }  // Unsupported: forwards to ENotImpl().
+};
+
+class SourceFilesTable : public impl::TableBase<IDiaEnumSourceFiles, IDiaSourceFile> {  // Table whose items are IDiaSourceFile.
+public:
+  SourceFilesTable(IMalloc *pMalloc, Session *pSession);  // Sizes m_items from the session's Contents().
+
+  HRESULT GetItem(DWORD index, IDiaSourceFile **ppItem) override;  // Creates the item on first request, then reuses it.
+
+private:
+  std::vector<CComPtr<IDiaSourceFile>> m_items;  // Per-index cache; a null slot means "not created yet".
+};
+
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/DxilDiaTableSymbols.cpp b/lib/DxilDia/DxilDiaTableSymbols.cpp
new file mode 100644
index 000000000..2b471ef3d
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableSymbols.cpp
@@ -0,0 +1,165 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableSymbols.cpp                                                   //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilDiaTableSymbols.h"
+
+#include <comdef.h>
+
+#include "dxc/Support/Unicode.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Metadata.h"
+
+#include "DxilDiaSession.h"
+
+HRESULT dxil_dia::Symbol::Create(IMalloc *pMalloc, Session *pSession, DWORD index, DWORD symTag, Symbol **pSymbol) {
+  Symbol *const created = *pSymbol = Alloc(pMalloc);  // The out-param is written even on failure (as nullptr).
+  if (!created) return E_OUTOFMEMORY;
+  created->AddRef();  // This is the reference the caller receives through *pSymbol.
+  created->Init(pSession, index, symTag);
+  return S_OK;
+}
+
+void dxil_dia::Symbol::Init(Session *pSession, DWORD index, DWORD symTag) {
+  m_symTag = symTag;      // Value reported by get_symTag.
+  m_index = index;        // Value reported by get_symIndexId.
+  m_pSession = pSession;  // CComPtr member, so the symbol keeps a reference to the session.
+}
+
+// IDiaSymbol::get_symIndexId: reports the index stored by Init.
+STDMETHODIMP dxil_dia::Symbol::get_symIndexId(/* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = m_index;
+  return S_OK;
+}
+
+// IDiaSymbol::get_symTag: reports the tag stored by Init.
+STDMETHODIMP dxil_dia::Symbol::get_symTag(/* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = m_symTag;
+  return S_OK;
+}
+
+// IDiaSymbol::get_name: copies the stored name into *pRetVal and returns CopyTo's HRESULT.
+STDMETHODIMP dxil_dia::Symbol::get_name(/* [retval][out] */ BSTR *pRetVal) {
+  return m_name.CopyTo(pRetVal);
+}
+
+// IDiaSymbol::get_dataKind: always writes the stored kind; S_FALSE flags that it is still zero.
+STDMETHODIMP dxil_dia::Symbol::get_dataKind(/* [retval][out] */ DWORD *pRetVal) {
+  *pRetVal = m_dataKind;
+  return (m_dataKind != 0) ? S_OK : S_FALSE;
+}
+
+// IDiaSymbol::get_sourceFileName: gives the caller its own copy of the stored file name.
+// Returns E_INVALIDARG for a null out pointer and E_OUTOFMEMORY when the copy cannot be
+// allocated (that case used to return S_OK with a null BSTR). An unset name yields null + S_OK.
+STDMETHODIMP dxil_dia::Symbol::get_sourceFileName(/* [retval][out] */ BSTR *pRetVal) {
+  if (pRetVal == nullptr)
+    return E_INVALIDARG;
+  return m_sourceFileName.CopyTo(pRetVal);
+}
+
+// IDiaSymbol::get_value: VariantCopy of the stored value into *pRetVal, returning its HRESULT.
+STDMETHODIMP dxil_dia::Symbol::get_value(/* [retval][out] */ VARIANT *pRetVal) {
+  return VariantCopy(pRetVal, &m_value);
+}
+
+dxil_dia::SymbolsTable::SymbolsTable(IMalloc *pMalloc, Session *pSession)
+  : impl::TableBase<IDiaEnumSymbols, IDiaSymbol>(pMalloc, pSession, Table::Kind::Symbols) {
+  // Layout of the table:
+  //   - one symbol for the program;
+  //   - per compilation unit, one Compiland and one CompilandDetails;
+  //   - per compilation unit, five CompilandEnv entries: hlslFlags, hlslTarget,
+  //     hlslEntry, hlslDefines and hlslArguments.
+  // Function/Data symbols for globals and symbols for types are not counted here.
+  const size_t programSymbols = 1;
+  const size_t perCompileUnit = 1 + 1 + 5;
+  m_count = programSymbols + pSession->InfoRef().compile_unit_count() * perCompileUnit;
+}
+
+HRESULT dxil_dia::SymbolsTable::GetItem(DWORD index, IDiaSymbol **ppItem) {
+  // Builds the symbol for zero-based table index `index`; E_FAIL if it maps to no known symbol id.
+  DxcThreadMalloc TM(m_pMalloc);
+
+  // Ids are one-based, so adjust the index.
+  ++index;
+
+  CComPtr<Symbol> item;
+  if (index == HlslProgramId) {
+    // Program symbol.
+    IFR(Symbol::Create(m_pMalloc, m_pSession, index, SymTagExe, &item));
+    item->SetName(L"HLSL");
+  } else if (index == HlslCompilandId) {
+    IFR(Symbol::Create(m_pMalloc, m_pSession, index, SymTagCompiland, &item));
+    item->SetName(L"main");
+    item->SetLexicalParent(HlslProgramId);
+    if (m_pSession->MainFileName()) {
+      llvm::StringRef strRef = llvm::cast<llvm::MDString>(m_pSession->MainFileName()->getOperand(0)->getOperand(0))->getString();
+      std::string str(strRef.begin(), strRef.size()); // To make sure str is null terminated
+      item->SetSourceFileName(_bstr_t(Unicode::UTF8ToUTF16StringOrThrow(str.data()).c_str()));
+    }
+  } else if (index == HlslCompilandDetailsId) {
+    IFR(Symbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandDetails, &item));
+    item->SetLexicalParent(HlslCompilandId);
+    // TODO: complete the rest of the compiland details
+    // platform: 256, language: 16, frontEndMajor: 6, frontEndMinor: 3, value: 0, hasDebugInfo: 1, compilerName: compiler string goes here
+  } else if (index == HlslCompilandEnvFlagsId) {
+    IFR(Symbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
+    item->SetLexicalParent(HlslCompilandId);
+    item->SetName(L"hlslFlags");
+    item->SetValue(m_pSession->DxilModuleRef().GetGlobalFlags());
+  } else if (index == HlslCompilandEnvTargetId) {
+    IFR(Symbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
+    item->SetLexicalParent(HlslCompilandId);
+    item->SetName(L"hlslTarget");
+    item->SetValue(m_pSession->DxilModuleRef().GetShaderModel()->GetName());
+  } else if (index == HlslCompilandEnvEntryId) {
+    IFR(Symbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
+    item->SetLexicalParent(HlslCompilandId);
+    item->SetName(L"hlslEntry");
+    item->SetValue(m_pSession->DxilModuleRef().GetEntryFunctionName().c_str());
+  } else if (index == HlslCompilandEnvDefinesId) {
+    IFR(Symbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
+    item->SetLexicalParent(HlslCompilandId);
+    item->SetName(L"hlslDefines");
+    // Double null terminated string, L"\0" between defines; a null Defines() leaves only the final terminator.
+    CComBSTR pBSTR;
+    if (m_pSession->Defines()) { // Guarded like MainFileName() above instead of dereferencing blindly.
+      llvm::MDNode *definesNode = m_pSession->Defines()->getOperand(0);
+      for (llvm::MDNode::op_iterator it = definesNode->op_begin(); it != definesNode->op_end(); ++it) {
+        std::string str = llvm::cast<llvm::MDString>(*it)->getString().str();
+        CA2W cv(str.c_str(), CP_UTF8); // Treat the text as UTF-8, as the main file name is, not as ANSI.
+        pBSTR.Append(cv);
+        pBSTR.Append(L"\0", 1);
+      }
+    }
+    pBSTR.Append(L"\0", 1);
+    VARIANT Variant; // Only vt and bstrVal are set; SetValue copies the variant into the symbol.
+    Variant.bstrVal = pBSTR;
+    Variant.vt = VARENUM::VT_BSTR;
+    item->SetValue(&Variant);
+  } else if (index == HlslCompilandEnvArgumentsId) {
+    IFR(Symbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
+    item->SetLexicalParent(HlslCompilandId);
+    item->SetName(L"hlslArguments");
+    std::string args; // Space-separated arguments; stays empty when Arguments() is null.
+    if (m_pSession->Arguments()) {
+      auto Arguments = m_pSession->Arguments()->getOperand(0);
+      for (unsigned i = 0, NumArguments = Arguments->getNumOperands(); i < NumArguments; ++i) {
+        if (!args.empty()) args.push_back(' ');
+        args += llvm::cast<llvm::MDString>(Arguments->getOperand(i))->getString().str();
+      }
+    }
+    item->SetValue(args.c_str());
+  }
+
+  // TODO: add support for global data and functions as well as types.
+  *ppItem = item.Detach(); // Stays null when no branch above created a symbol.
+  return (*ppItem == nullptr) ? E_FAIL : S_OK;
+}
diff --git a/lib/DxilDia/DxilDiaTableSymbols.h b/lib/DxilDia/DxilDiaTableSymbols.h
new file mode 100644
index 000000000..498e54340
--- /dev/null
+++ b/lib/DxilDia/DxilDiaTableSymbols.h
@@ -0,0 +1,821 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDiaTableSymbols.h                                                     //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DIA API implementation for DXIL modules.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+
+#include "dia2.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/microcom.h"
+
+#include "DxilDia.h"
+#include "DxilDiaTable.h"
+
+namespace dxil_dia {
+class Session;
+
+class Symbol : public IDiaSymbol {
+  DXC_MICROCOM_TM_REF_FIELDS()
+    CComPtr<Session> m_pSession;
+  DWORD m_index;
+  DWORD m_symTag;
+  DWORD m_lexicalParent = 0;
+  DWORD m_dataKind = 0;
+  CComBSTR m_sourceFileName;
+  CComBSTR m_name;
+  CComVariant m_value;
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+    DXC_MICROCOM_TM_CTOR(Symbol)
+    HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDiaSymbol>(this, iid, ppvObject);
+  }
+
+  static HRESULT Create(IMalloc *pMalloc, Session *pSession, DWORD index, DWORD symTag, Symbol **pSymbol);
+
+  void Init(Session *pSession, DWORD index, DWORD symTag);
+
+  void SetDataKind(DWORD value) { m_dataKind = value; }
+  void SetLexicalParent(DWORD value) { m_lexicalParent = value; }
+  void SetName(LPCWSTR value) { m_name = value; }
+  void SetValue(LPCSTR value) { m_value = value; }
+  void SetValue(VARIANT *pValue) { m_value.Copy(pValue); }
+  void SetValue(unsigned value) { m_value = value; }
+  void SetSourceFileName(BSTR value) { m_sourceFileName = value; }
+
+#pragma region IDiaSymbol implementation.
+  STDMETHODIMP get_symIndexId(
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_symTag(
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_name(
+    /* [retval][out] */ BSTR *pRetVal) override;
+
+  STDMETHODIMP get_lexicalParent(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_classParent(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_type(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_dataKind(
+    /* [retval][out] */ DWORD *pRetVal) override;
+
+  STDMETHODIMP get_locationType(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_addressSection(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_addressOffset(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_relativeVirtualAddress(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtualAddress(
+    /* [retval][out] */ ULONGLONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_registerId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_offset(
+    /* [retval][out] */ LONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_length(
+    /* [retval][out] */ ULONGLONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_slot(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_volatileType(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_constType(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_unalignedType(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_access(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_libraryName(
+    /* [retval][out] */ BSTR *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_platform(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_language(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_editAndContinueEnabled(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_frontEndMajor(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_frontEndMinor(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_frontEndBuild(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_backEndMajor(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_backEndMinor(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_backEndBuild(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_sourceFileName(
+    /* [retval][out] */ BSTR *pRetVal) override;
+
+  STDMETHODIMP get_unused(
+    /* [retval][out] */ BSTR *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_thunkOrdinal(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_thisAdjust(
+    /* [retval][out] */ LONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtualBaseOffset(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtual(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_intro(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_pure(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_callingConvention(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_value(
+    /* [retval][out] */ VARIANT *pRetVal) override;
+
+  STDMETHODIMP get_baseType(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_token(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_timeStamp(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_guid(
+    /* [retval][out] */ GUID *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_symbolsFileName(
+    /* [retval][out] */ BSTR *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_reference(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_count(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_bitPosition(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_arrayIndexType(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_packed(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_constructor(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_overloadedOperator(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_nested(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasNestedTypes(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasAssignmentOperator(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasCastOperator(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_scoped(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtualBaseClass(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_indirectVirtualBaseClass(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtualBasePointerOffset(
+    /* [retval][out] */ LONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtualTableShape(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_lexicalParentId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_classParentId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_typeId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_arrayIndexTypeId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtualTableShapeId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_code(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_function(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_managed(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_msil(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtualBaseDispIndex(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_undecoratedName(
+    /* [retval][out] */ BSTR *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_age(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_signature(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_compilerGenerated(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_addressTaken(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_rank(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_lowerBound(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_upperBound(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_lowerBoundId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_upperBoundId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_dataBytes(
+    /* [in] */ DWORD cbData,
+    /* [out] */ DWORD *pcbData,
+    /* [size_is][out] */ BYTE *pbData) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildren(
+    /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildrenEx(
+    /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildrenExByAddr(
+    /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [in] */ DWORD isect,
+    /* [in] */ DWORD offset,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildrenExByVA(
+    /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [in] */ ULONGLONG va,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findChildrenExByRVA(
+    /* [in] */ enum SymTagEnum symtag,
+    /* [in] */ LPCOLESTR name,
+    /* [in] */ DWORD compareFlags,
+    /* [in] */ DWORD rva,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP get_targetSection(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_targetOffset(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_targetRelativeVirtualAddress(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_targetVirtualAddress(
+    /* [retval][out] */ ULONGLONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_machineType(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_oemId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_oemSymbolId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_types(
+    /* [in] */ DWORD cTypes,
+    /* [out] */ DWORD *pcTypes,
+    /* [size_is][size_is][out] */ IDiaSymbol **pTypes) override { return ENotImpl(); }
+
+  STDMETHODIMP get_typeIds(
+    /* [in] */ DWORD cTypeIds,
+    /* [out] */ DWORD *pcTypeIds,
+    /* [size_is][out] */ DWORD *pdwTypeIds) override { return ENotImpl(); }
+
+  STDMETHODIMP get_objectPointerType(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_udtKind(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_undecoratedNameEx(
+    /* [in] */ DWORD undecorateOptions,
+    /* [out] */ BSTR *name) override { return ENotImpl(); }
+
+  STDMETHODIMP get_noReturn(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_customCallingConvention(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_noInline(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_optimizedCodeDebugInfo(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_notReached(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_interruptReturn(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_farReturn(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isStatic(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasDebugInfo(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isLTCG(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isDataAligned(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasSecurityChecks(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_compilerName(
+    /* [retval][out] */ BSTR *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasAlloca(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasSetJump(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasLongJump(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasInlAsm(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasEH(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasSEH(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasEHa(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isNaked(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isAggregated(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isSplitted(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_container(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_inlSpec(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_noStackOrdering(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_virtualBaseTableType(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasManagedCode(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isHotpatchable(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isCVTCIL(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isMSILNetmodule(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isCTypes(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isStripped(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_frontEndQFE(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_backEndQFE(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_wasInlined(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_strictGSCheck(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isCxxReturnUdt(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isConstructorVirtualBase(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_RValueReference(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_unmodifiedType(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_framePointerPresent(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isSafeBuffers(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_intrinsic(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_sealed(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hfaFloat(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hfaDouble(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_liveRangeStartAddressSection(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_liveRangeStartAddressOffset(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_liveRangeStartRelativeVirtualAddress(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_countLiveRanges(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_liveRangeLength(
+    /* [retval][out] */ ULONGLONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_offsetInUdt(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_paramBasePointerRegisterId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_localBasePointerRegisterId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isLocationControlFlowDependent(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_stride(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_numberOfRows(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_numberOfColumns(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isMatrixRowMajor(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_numericProperties(
+    /* [in] */ DWORD cnt,
+    /* [out] */ DWORD *pcnt,
+    /* [size_is][out] */ DWORD *pProperties) override { return ENotImpl(); }
+
+  STDMETHODIMP get_modifierValues(
+    /* [in] */ DWORD cnt,
+    /* [out] */ DWORD *pcnt,
+    /* [size_is][out] */ WORD *pModifiers) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isReturnValue(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isOptimizedAway(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_builtInKind(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_registerType(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_baseDataSlot(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_baseDataOffset(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_textureSlot(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_samplerSlot(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_uavSlot(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_sizeInUdt(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_memorySpaceKind(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_unmodifiedTypeId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_subTypeId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_subType(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_numberOfModifiers(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_numberOfRegisterIndices(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isHLSLData(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isPointerToDataMember(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isPointerToMemberFunction(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isSingleInheritance(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isMultipleInheritance(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isVirtualInheritance(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_restrictedType(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isPointerBasedOnSymbolValue(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_baseSymbol(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_baseSymbolId(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_objectFileName(
+    /* [retval][out] */ BSTR *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isAcceleratorGroupSharedLocal(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isAcceleratorPointerTagLiveRange(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isAcceleratorStubFunction(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_numberOfAcceleratorPointerTags(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isSdl(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isWinRTPointer(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isRefUdt(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isValueUdt(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isInterfaceUdt(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineFramesByAddr(
+    /* [in] */ DWORD isect,
+    /* [in] */ DWORD offset,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineFramesByRVA(
+    /* [in] */ DWORD rva,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineFramesByVA(
+    /* [in] */ ULONGLONG va,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLines(
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLinesByAddr(
+    /* [in] */ DWORD isect,
+    /* [in] */ DWORD offset,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLinesByRVA(
+    /* [in] */ DWORD rva,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findInlineeLinesByVA(
+    /* [in] */ ULONGLONG va,
+    /* [in] */ DWORD length,
+    /* [out] */ IDiaEnumLineNumbers **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolsForAcceleratorPointerTag(
+    /* [in] */ DWORD tagValue,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP findSymbolsByRVAForAcceleratorPointerTag(
+    /* [in] */ DWORD tagValue,
+    /* [in] */ DWORD rva,
+    /* [out] */ IDiaEnumSymbols **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP get_acceleratorPointerTags(
+    /* [in] */ DWORD cnt,
+    /* [out] */ DWORD *pcnt,
+    /* [size_is][out] */ DWORD *pPointerTags) override { return ENotImpl(); }
+
+  STDMETHODIMP getSrcLineOnTypeDefn(
+    /* [out] */ IDiaLineNumber **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isPGO(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasValidPGOCounts(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_isOptimizedForSpeed(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_PGOEntryCount(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_PGOEdgeCount(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_PGODynamicInstructionCount(
+    /* [retval][out] */ ULONGLONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_staticSize(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_finalLiveStaticSize(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_phaseName(
+    /* [retval][out] */ BSTR *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_hasControlFlowCheck(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_constantExport(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_dataExport(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_privateExport(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_noNameExport(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_exportHasExplicitlyAssignedOrdinal(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_exportIsForwarder(
+    /* [retval][out] */ BOOL *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_ordinal(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_frameSize(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_exceptionHandlerAddressSection(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_exceptionHandlerAddressOffset(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_exceptionHandlerRelativeVirtualAddress(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_exceptionHandlerVirtualAddress(
+    /* [retval][out] */ ULONGLONG *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP findInputAssemblyFile(
+    /* [out] */ IDiaInputAssemblyFile **ppResult) override { return ENotImpl(); }
+
+  STDMETHODIMP get_characteristics(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_coffGroup(
+    /* [retval][out] */ IDiaSymbol **pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_bindID(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_bindSpace(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+  STDMETHODIMP get_bindSlot(
+    /* [retval][out] */ DWORD *pRetVal) override { return ENotImpl(); }
+
+#pragma endregion IDiaSymbol implementation.
+};
+
+class SymbolsTable : public impl::TableBase<IDiaEnumSymbols, IDiaSymbol> { // TableBase specialization whose item type is IDiaSymbol
+public:
+  SymbolsTable(IMalloc *pMalloc, Session *pSession);
+  // Writes the item at `index` to *ppItem; overrides the base-class accessor.
+  HRESULT GetItem(DWORD index, IDiaSymbol **ppItem) override;
+};
+
+}  // namespace dxil_dia
diff --git a/lib/DxilDia/LLVMBuild.txt b/lib/DxilDia/LLVMBuild.txt
new file mode 100644
index 000000000..8565bc5ba
--- /dev/null
+++ b/lib/DxilDia/LLVMBuild.txt
@@ -0,0 +1,16 @@
+; Copyright (C) Microsoft Corporation. All rights reserved.
+; This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DxilDia
+parent = Libraries
+required_libraries = Core DxcSupport Support
diff --git a/lib/DxilPIXPasses/CMakeLists.txt b/lib/DxilPIXPasses/CMakeLists.txt
index 5c8df1cd8..9982dce6f 100644
--- a/lib/DxilPIXPasses/CMakeLists.txt
+++ b/lib/DxilPIXPasses/CMakeLists.txt
@@ -2,6 +2,7 @@
 # This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
 add_llvm_library(LLVMDxilPIXPasses
   DxilAddPixelHitInstrumentation.cpp
+  DxilAnnotateWithVirtualRegister.cpp
   DxilDebugInstrumentation.cpp
   DxilForceEarlyZ.cpp
   DxilOutputColorBecomesConstant.cpp
@@ -9,6 +10,7 @@ add_llvm_library(LLVMDxilPIXPasses
   DxilReduceMSAAToSingleSample.cpp
   DxilShaderAccessTracking.cpp
   DxilPIXPasses.cpp
+  DxilPIXVirtualRegisters.cpp
 
 
   ADDITIONAL_HEADER_DIRS
diff --git a/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp b/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp
new file mode 100644
index 000000000..ac89d1a81
--- /dev/null
+++ b/lib/DxilPIXPasses/DxilAnnotateWithVirtualRegister.cpp
@@ -0,0 +1,211 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilAnnotateWithVirtualRegister.cpp                                       //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Annotates the llvm instructions with a virtual register number to be used //
+// during PIX debugging.                                                     //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/DxilPIXPasses/DxilPIXPasses.h"
+
+#include <memory>
+
+#include "dxc/DXIL/DxilModule.h"
+#include "dxc/Support/Global.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "DxilPIXVirtualRegisters.h"
+
+#define DEBUG_TYPE "dxil-annotate-with-virtual-regs"
+
+namespace {
+using namespace pix_dxil;
+
+class DxilAnnotateWithVirtualRegister : public llvm::ModulePass {
+public:
+  static char ID;
+  DxilAnnotateWithVirtualRegister() : llvm::ModulePass(ID) {}
+  // Annotates the entry function's instructions with PIX metadata.
+  bool runOnModule(llvm::Module &M) override;
+
+private:
+  void AnnotateValues(llvm::Instruction *pI);
+  void AnnotateStore(llvm::Instruction *pI);
+  bool IsAllocaRegisterWrite(llvm::Value *V, llvm::AllocaInst **pAI, llvm::Value **pIdx);
+  void AnnotateAlloca(llvm::AllocaInst *pAlloca);
+  void AnnotateGeneric(llvm::Instruction *pI);
+  void AssignNewDxilRegister(llvm::Instruction *pI);
+  void AssignNewAllocaRegister(llvm::AllocaInst *pAlloca, std::uint32_t C);
+  // Per-run state. Default-initialized so it is never indeterminate before Init().
+  hlsl::DxilModule *m_DM = nullptr;
+  std::uint32_t m_uVReg = 0; // next unassigned virtual register number
+  void Init(llvm::Module &M) {
+    m_DM = &M.GetOrCreateDxilModule();
+    m_uVReg = 0;
+  }
+};
+
+char DxilAnnotateWithVirtualRegister::ID = 0;
+
+bool DxilAnnotateWithVirtualRegister::runOnModule(llvm::Module &M) {
+  // Numbers every instruction of the entry function, then assigns virtual
+  // registers to values/allocas and tags stores that write alloca registers.
+  Init(M);
+  llvm::Function *pEntry = m_DM->GetEntryFunction();
+  if (pEntry == nullptr) { // no entry function: nothing to annotate
+    m_DM = nullptr;
+    return false;
+  }
+  if (OSOverride != nullptr) {
+    *OSOverride << "\nBegin - dxil values to virtual register mapping\n";
+  }
+  std::uint32_t InstNum = 0;
+  for (llvm::Instruction &I : llvm::inst_range(pEntry)) {
+    pix_dxil::PixDxilInstNum::AddMD(M.getContext(), &I, ++InstNum);
+  }
+  for (llvm::Instruction &I : llvm::inst_range(pEntry)) {
+    AnnotateValues(&I);
+  }
+  for (llvm::Instruction &I : llvm::inst_range(pEntry)) {
+    AnnotateStore(&I);
+  }
+  if (OSOverride != nullptr) {
+    *OSOverride << "\nEnd - dxil values to virtual register mapping\n";
+  }
+
+  m_DM = nullptr;
+  // Instruction-number metadata alone already modifies the module.
+  return InstNum > 0;
+}
+
+void DxilAnnotateWithVirtualRegister::AnnotateValues(llvm::Instruction *pI) {
+  // Allocas go to AnnotateAlloca; any other non-void result to AnnotateGeneric.
+  if (llvm::isa<llvm::AllocaInst>(pI))
+    return AnnotateAlloca(llvm::cast<llvm::AllocaInst>(pI));
+  if (!pI->getType()->isVoidTy())
+    AnnotateGeneric(pI);
+}
+
+void DxilAnnotateWithVirtualRegister::AnnotateStore(llvm::Instruction *pI) {
+  // Tags a store that writes an annotated alloca with that alloca's register
+  // metadata and the index being written; every other instruction is ignored.
+  if (!llvm::isa<llvm::StoreInst>(pI)) {
+    return;
+  }
+  auto *pStore = llvm::cast<llvm::StoreInst>(pI);
+  llvm::AllocaInst *pTargetAlloca;
+  llvm::Value *pWriteIndex;
+  const bool IsRegWrite = IsAllocaRegisterWrite(
+      pStore->getPointerOperand(), &pTargetAlloca, &pWriteIndex);
+  if (!IsRegWrite) {
+    return;
+  }
+  // Skip allocas that carry no register metadata.
+  if (llvm::MDNode *pRegMD = pTargetAlloca->getMetadata(PixAllocaReg::MDName)) {
+    PixAllocaRegWrite::AddMD(m_DM->GetCtx(), pStore, pRegMD, pWriteIndex);
+  }
+}
+
+bool DxilAnnotateWithVirtualRegister::IsAllocaRegisterWrite(llvm::Value *V, llvm::AllocaInst **pAI, llvm::Value **pIdx) {
+  // Decides whether pointer V addresses an alloca-backed register slot.
+  //   V     - pointer to classify (callers pass a store's pointer operand).
+  //   pAI   - receives the alloca being addressed, or nullptr on failure.
+  //   pIdx  - receives the element index within that alloca, or nullptr.
+  // Returns true only when both outputs were filled in.
+  *pAI = nullptr;
+  *pIdx = nullptr;
+
+  // Case 1: a two-index GEP whose base pointer is an alloca.
+  if (llvm::isa<llvm::GetElementPtrInst>(V)) {
+    auto *pElementPtr = llvm::cast<llvm::GetElementPtrInst>(V);
+    auto *pBase = llvm::dyn_cast<llvm::AllocaInst>(pElementPtr->getPointerOperand());
+    if (pBase == nullptr || pElementPtr->getNumIndices() != 2) {
+      return false;
+    }
+    // The leading index must be the constant 0.
+    auto Idx = pElementPtr->idx_begin();
+    auto *pFirst = llvm::dyn_cast<llvm::ConstantInt>(Idx[0].get());
+    if (pFirst == nullptr || pFirst->getLimitedValue() != 0) {
+      return false;
+    }
+    *pAI = pBase;
+    *pIdx = Idx[1].get();
+    return true;
+  }
+
+  // Case 2: V is itself a float/integer alloca; the index is the constant 0.
+  auto *pScalar = llvm::dyn_cast<llvm::AllocaInst>(V);
+  if (pScalar == nullptr) {
+    return false;
+  }
+  llvm::Type *pElemTy = pScalar->getType()->getElementType();
+  if (!(pElemTy->isFloatTy() || pElemTy->isIntegerTy())) {
+    return false;
+  }
+  *pAI = pScalar;
+  *pIdx = llvm::IRBuilder<>(m_DM->GetCtx()).getInt32(0);
+  return true;
+}
+
+void DxilAnnotateWithVirtualRegister::AnnotateAlloca(llvm::AllocaInst *pAlloca) {
+  llvm::Type *pElemTy = pAlloca->getType()->getElementType(); // float/int: 1 register; array: one per element
+  if (auto *pArrayTy = llvm::dyn_cast<llvm::ArrayType>(pElemTy)) {
+    AssignNewAllocaRegister(pAlloca, pArrayTy->getNumElements());
+  } else if (pElemTy->isFloatTy() || pElemTy->isIntegerTy()) {
+    AssignNewAllocaRegister(pAlloca, 1);
+  } else {
+    DXASSERT_ARGS(false, "Unhandled alloca kind: %d", pElemTy->getTypeID());
+  }
+}
+
+void DxilAnnotateWithVirtualRegister::AnnotateGeneric(llvm::Instruction *pI) {
+  llvm::Type *pResultTy = pI->getType(); // only float/integer results get a register
+  if (pResultTy->isFloatTy() || pResultTy->isIntegerTy()) {
+    AssignNewDxilRegister(pI);
+  }
+}
+
+void DxilAnnotateWithVirtualRegister::AssignNewDxilRegister(llvm::Instruction *pI) {
+  // Gives pI the next virtual register number and advances the counter by one;
+  // the mapping is also written to OSOverride when that stream is set.
+  const std::uint32_t Reg = m_uVReg++;
+  PixDxilReg::AddMD(m_DM->GetCtx(), pI, Reg);
+  if (OSOverride == nullptr) { return; }
+  pI->printAsOperand(*OSOverride, /*PrintType=*/false, m_DM->GetModule());
+  *OSOverride << " dxil " << Reg << "\n";
+}
+
+void DxilAnnotateWithVirtualRegister::AssignNewAllocaRegister(llvm::AllocaInst *pAlloca, std::uint32_t C) {
+  // Reserves C consecutive register numbers for pAlloca, starting at the counter.
+  const std::uint32_t Base = m_uVReg;
+  m_uVReg += C;
+  PixAllocaReg::AddMD(m_DM->GetCtx(), pAlloca, Base, C);
+  if (OSOverride == nullptr) { return; }
+  pAlloca->printAsOperand(*OSOverride, /*PrintType=*/false, m_DM->GetModule());
+  *OSOverride << " alloca " << Base << " " << C << "\n";
+}
+}
+
+using namespace llvm;
+
+INITIALIZE_PASS(DxilAnnotateWithVirtualRegister, DEBUG_TYPE, "Annotates each instruction in the DXIL module with a virtual register number", false, false)
+
+ModulePass *llvm::createDxilAnnotateWithVirtualRegisterPass() { // factory for the annotation pass
+  return new DxilAnnotateWithVirtualRegister(); // heap-allocated; returned as a raw pointer
+}
diff --git a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
index 8c8e4871a..bdcb362d6 100644
--- a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
+++ b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
@@ -21,6 +21,8 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/ADT/STLExtras.h"
 
+#include "DxilPIXVirtualRegisters.h"
+
 using namespace llvm;
 using namespace hlsl;
 
@@ -125,6 +127,13 @@ struct DebugShaderModifierRecordDXILStepBase {
 template< typename ReturnType >
 struct DebugShaderModifierRecordDXILStep : public DebugShaderModifierRecordDXILStepBase {
   ReturnType ReturnValue;
+  union { // mirrors the dword built in addStepEntryForType: (Base << 16) | (Index & 0xFFFF)
+    struct { // field order assumes bit-fields are allocated from the least significant bit
+      uint32_t ValueOrdinalIndex : 16; // bits 0..15
+      uint32_t ValueOrdinalBase : 16;  // bits 16..31
+    } Details;
+    uint32_t u32ValueOrdinal;
+  } ValueOrdinal;
 };
 
 template< >
@@ -195,8 +204,6 @@ private:
 
   std::map<uint32_t, Value *> m_IncrementInstructionBySize;
 
-  unsigned int m_InstructionIndex = 0;
-
   struct BuilderContext {
     Module &M;
     DxilModule &DM;
@@ -226,10 +233,12 @@ private:
   void addDebugEntryValue(BuilderContext &BC, Value * TheValue);
   void addInvocationStartMarker(BuilderContext &BC);
   void reserveDebugEntrySpace(BuilderContext &BC, uint32_t SpaceInDwords);
+  void addStoreStepDebugEntry(BuilderContext &BC, StoreInst *Inst);
   void addStepDebugEntry(BuilderContext &BC, Instruction *Inst);
+  void addStepDebugEntryValue(BuilderContext &BC, std::uint32_t InstNum, Value *V, std::uint32_t ValueOrdinal, Value *ValueOrdinalIndex);
   uint32_t UAVDumpingGroundOffset();
   template<typename ReturnType>
-  void addStepEntryForType(DebugShaderModifierRecordType RecordType, BuilderContext &BC, Instruction *Inst);
+  void addStepEntryForType(DebugShaderModifierRecordType RecordType, BuilderContext &BC, std::uint32_t InstNum, Value *V, std::uint32_t ValueOrdinal, Value *ValueOrdinalIndex);
 
 };
 
@@ -632,8 +641,7 @@ void DxilDebugInstrumentation::addInvocationStartMarker(BuilderContext &BC) {
 }
 
 template<typename ReturnType>
-void DxilDebugInstrumentation::addStepEntryForType(DebugShaderModifierRecordType RecordType, BuilderContext &BC, Instruction *Inst) {
-
+void DxilDebugInstrumentation::addStepEntryForType(DebugShaderModifierRecordType RecordType, BuilderContext &BC, std::uint32_t InstNum, Value *V, std::uint32_t ValueOrdinal, Value *ValueOrdinalIndex) {
   DebugShaderModifierRecordDXILStep<ReturnType> step = {};
   reserveDebugEntrySpace(BC, sizeof(step));
 
@@ -641,41 +649,83 @@ void DxilDebugInstrumentation::addStepEntryForType(DebugShaderModifierRecordType
   step.Header.Details.Type = static_cast<uint8_t>(RecordType);
   addDebugEntryValue(BC, BC.HlslOP->GetU32Const(step.Header.u32Header));
   addDebugEntryValue(BC, m_InvocationId);
-  addDebugEntryValue(BC, BC.HlslOP->GetU32Const(m_InstructionIndex++));
+  addDebugEntryValue(BC, BC.HlslOP->GetU32Const(InstNum));
 
   if (RecordType != DebugShaderModifierRecordTypeDXILStepVoid) {
-    addDebugEntryValue(BC, Inst);
+    addDebugEntryValue(BC, V);
+
+    IRBuilder<> &B = BC.Builder;
+
+    Value *VO = BC.HlslOP->GetU32Const(ValueOrdinal << 16);
+    Value *VOI = B.CreateAnd(ValueOrdinalIndex, BC.HlslOP->GetU32Const(0xFFFF), "ValueOrdinalIndex");
+    Value *EncodedValueOrdinalAndIndex = BC.Builder.CreateOr(VO, VOI, "ValueOrdinal");
+    addDebugEntryValue(BC, EncodedValueOrdinalAndIndex);
   }
 }
 
+void DxilDebugInstrumentation::addStoreStepDebugEntry(BuilderContext &BC, StoreInst *Inst) {
+  // Emits a step record for a store annotated as an alloca-register write;
+  // stores lacking either PIX annotation produce no record.
+  std::uint32_t RegBase;
+  std::uint32_t RegCountUnused;
+  llvm::Value *RegIndex;
+  const bool HasRegWrite = pix_dxil::PixAllocaRegWrite::FromInst(Inst, &RegBase, &RegCountUnused, &RegIndex);
+
+  std::uint32_t InstNum;
+  if (!HasRegWrite || !pix_dxil::PixDxilInstNum::FromInst(Inst, &InstNum)) {
+    return;
+  }
+
+  addStepDebugEntryValue(BC, InstNum, Inst->getValueOperand(), RegBase, RegIndex);
+}
+
 void DxilDebugInstrumentation::addStepDebugEntry(BuilderContext &BC, Instruction *Inst) {
   if (Inst->getOpcode() == Instruction::OtherOps::PHI) {
     return;
   }
 
-  Type::TypeID ID = Inst->getType()->getTypeID();
+  if (auto *St = llvm::dyn_cast<llvm::StoreInst>(Inst)) { // stores are handled by addStoreStepDebugEntry
+    addStoreStepDebugEntry(BC, St);
+    return;
+  }
+
+  std::uint32_t RegNum;
+  if (!pix_dxil::PixDxilReg::FromInst(Inst, &RegNum)) { // no register metadata read: emit nothing
+    return;
+  }
+
+  std::uint32_t InstNum;
+  if (!pix_dxil::PixDxilInstNum::FromInst(Inst, &InstNum)) { // no instruction-number metadata read: emit nothing
+    return;
+  }
+
+  addStepDebugEntryValue(BC, InstNum, Inst, RegNum, BC.Builder.getInt32(0)); // ordinal index is the constant 0 here
+}
+
+void DxilDebugInstrumentation::addStepDebugEntryValue(BuilderContext &BC, std::uint32_t InstNum, Value *V, std::uint32_t ValueOrdinal, Value *ValueOrdinalIndex) {
+  const Type::TypeID ID = V->getType()->getTypeID();
 
   switch (ID) {
   case Type::TypeID::StructTyID:
   case Type::TypeID::VoidTyID:
-    addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepVoid, BC, Inst);
+    addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepVoid, BC, InstNum, V, ValueOrdinal, ValueOrdinalIndex);
     break;
   case Type::TypeID::FloatTyID:
-    addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat, BC, Inst);
+    addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat, BC, InstNum, V, ValueOrdinal, ValueOrdinalIndex);
     break;
   case Type::TypeID::IntegerTyID:
-    if (Inst->getType()->getIntegerBitWidth() == 64) {
-      addStepEntryForType<uint64_t>(DebugShaderModifierRecordTypeDXILStepUint64, BC, Inst);
+    if (V->getType()->getIntegerBitWidth() == 64) {
+      addStepEntryForType<uint64_t>(DebugShaderModifierRecordTypeDXILStepUint64, BC, InstNum, V, ValueOrdinal, ValueOrdinalIndex);
     }
     else {
-      addStepEntryForType<uint32_t>(DebugShaderModifierRecordTypeDXILStepUint32, BC, Inst);
+      addStepEntryForType<uint32_t>(DebugShaderModifierRecordTypeDXILStepUint32, BC, InstNum, V, ValueOrdinal, ValueOrdinalIndex);
     }
     break;
   case Type::TypeID::DoubleTyID:
-    addStepEntryForType<double>(DebugShaderModifierRecordTypeDXILStepDouble, BC, Inst);
+    addStepEntryForType<double>(DebugShaderModifierRecordTypeDXILStepDouble, BC, InstNum, V, ValueOrdinal, ValueOrdinalIndex);
     break;
   case Type::TypeID::HalfTyID:
-    addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat, BC, Inst);
+    addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat, BC, InstNum, V, ValueOrdinal, ValueOrdinalIndex);
     break;
   case Type::TypeID::PointerTyID:
     // Skip pointer calculation instructions. They aren't particularly meaningful to the user (being a mere
@@ -741,10 +791,9 @@ bool DxilDebugInstrumentation::runOnModule(Module &M) {
 
   // Instrument original instructions:
   for (auto & Inst : AllInstructions) {
-    // Instrumentation goes after the instruction if it has a return value.
-    // Otherwise, the instruction might be a terminator so we HAVE to put the instrumentation before
-    if (Inst->getType()->getTypeID() != Type::TypeID::VoidTyID) {
-      // Has a return type, so can't be a terminator, so start inserting before the next instruction
+    // Instrumentation goes after the instruction if it is not a terminator. Otherwise,
+    // instrumentation goes before the instruction.
+    if (!Inst->isTerminator()) {
       IRBuilder<> Builder(Inst->getNextNode());
       BuilderContext BC2{ BC.M, BC.DM, BC.Ctx, BC.HlslOP, Builder };
       addStepDebugEntry(BC2, Inst);
diff --git a/lib/DxilPIXPasses/DxilPIXPasses.cpp b/lib/DxilPIXPasses/DxilPIXPasses.cpp
index 838e04c01..d0b25396a 100644
--- a/lib/DxilPIXPasses/DxilPIXPasses.cpp
+++ b/lib/DxilPIXPasses/DxilPIXPasses.cpp
@@ -31,6 +31,7 @@ HRESULT SetupRegistryPassForPIX() {
     /* <py::lines('INIT-PASSES')>hctdb_instrhelp.get_init_passes(set(["pix"]))</py>*/
     // INIT-PASSES:BEGIN
     initializeDxilAddPixelHitInstrumentationPass(Registry);
+    initializeDxilAnnotateWithVirtualRegisterPass(Registry);
     initializeDxilDebugInstrumentationPass(Registry);
     initializeDxilForceEarlyZPass(Registry);
     initializeDxilOutputColorBecomesConstantPass(Registry);
diff --git a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
new file mode 100644
index 000000000..839646f94
--- /dev/null
+++ b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
@@ -0,0 +1,169 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilPIXVirtualRegisters.cpp                                               //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Defines functions for dealing with the virtual register annotations in    //
+// DXIL instructions.                                                        //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "DxilPIXVirtualRegisters.h"
+
+#include "dxc/Support/Global.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+
+static llvm::Metadata *MetadataForValue(llvm::Value *V) {
+  if (auto *C = llvm::dyn_cast<llvm::Constant>(V)) {
+    return llvm::ConstantAsMetadata::get(C);
+  }
+  return llvm::ValueAsMetadata::get(V);
+}
+
+void pix_dxil::PixDxilInstNum::AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI, std::uint32_t InstNum) {
+  llvm::IRBuilder<> B(Ctx);
+  pI->setMetadata(
+    llvm::StringRef(MDName),
+    llvm::MDNode::get(Ctx, { llvm::ConstantAsMetadata::get(B.getInt32(ID)),
+                             llvm::ConstantAsMetadata::get(B.getInt32(InstNum)) }));
+}
+
+bool pix_dxil::PixDxilInstNum::FromInst(llvm::Instruction *pI, std::uint32_t *pInstNum) {
+  *pInstNum = 0;
+
+  auto *mdNodes = pI->getMetadata(MDName);
+
+  if (mdNodes == nullptr) {
+    return false;
+  }
+
+  if (mdNodes->getNumOperands() != 2) {
+    return false;
+  }
+
+  auto *mdID = llvm::mdconst::dyn_extract<llvm::ConstantInt>(mdNodes->getOperand(0));
+  if (mdID == nullptr || mdID->getLimitedValue() != ID) {
+    return false;
+  }
+
+  auto *mdInstNum = llvm::mdconst::dyn_extract<llvm::ConstantInt>(mdNodes->getOperand(1));
+  if (mdInstNum == nullptr) {
+    return false;
+  }
+
+  *pInstNum = mdInstNum->getLimitedValue();
+  return true;
+}
+
+void pix_dxil::PixDxilReg::AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI, std::uint32_t RegNum) {
+  llvm::IRBuilder<> B(Ctx);
+  pI->setMetadata(
+      llvm::StringRef(MDName),
+      llvm::MDNode::get(Ctx, { llvm::ConstantAsMetadata::get(B.getInt32(ID)),
+                               llvm::ConstantAsMetadata::get(B.getInt32(RegNum)) }));
+}
+
+bool pix_dxil::PixDxilReg::FromInst(llvm::Instruction *pI, std::uint32_t *pRegNum) {
+  *pRegNum = 0;
+
+  auto *mdNodes = pI->getMetadata(MDName);
+
+  if (mdNodes == nullptr) {
+    return false;
+  }
+
+  if (mdNodes->getNumOperands() != 2) {
+    return false;
+  }
+
+  auto *mdID = llvm::mdconst::dyn_extract<llvm::ConstantInt>(mdNodes->getOperand(0));
+  if (mdID == nullptr || mdID->getLimitedValue() != ID) {
+    return false;
+  }
+
+  auto *mdRegNum = llvm::mdconst::dyn_extract<llvm::ConstantInt>(mdNodes->getOperand(1));
+  if (mdRegNum == nullptr) {
+    return false;
+  }
+
+  *pRegNum = mdRegNum->getLimitedValue();
+  return true;
+}
+
+static bool ParsePixAllocaReg(llvm::MDNode *MD, std::uint32_t *RegNum, std::uint32_t *Count) {
+  if (MD->getNumOperands() != 3) {
+    return false;
+  }
+
+  auto *mdID = llvm::mdconst::dyn_extract<llvm::ConstantInt>(MD->getOperand(0));
+  if (mdID == nullptr || mdID->getLimitedValue() != pix_dxil::PixAllocaReg::ID) {
+    return false;
+  }
+
+  auto *mdRegNum = llvm::mdconst::dyn_extract<llvm::ConstantInt>(MD->getOperand(1));
+  auto *mdCount = llvm::mdconst::dyn_extract<llvm::ConstantInt>(MD->getOperand(2));
+
+  if (mdRegNum == nullptr || mdCount == nullptr) {
+    return false;
+  }
+
+  *RegNum = mdRegNum->getLimitedValue();
+  *Count = mdCount->getLimitedValue();
+  return true;
+}
+
+void pix_dxil::PixAllocaReg::AddMD(llvm::LLVMContext &Ctx, llvm::AllocaInst *pAlloca, std::uint32_t RegNum, std::uint32_t Count) {
+  llvm::IRBuilder<> B(Ctx);
+  pAlloca->setMetadata(
+      llvm::StringRef(MDName),
+      llvm::MDNode::get(Ctx, { llvm::ConstantAsMetadata::get(B.getInt32(ID)),
+                               llvm::ConstantAsMetadata::get(B.getInt32(RegNum)),
+                               llvm::ConstantAsMetadata::get(B.getInt32(Count)) }));
+}
+
+void pix_dxil::PixAllocaRegWrite::AddMD(llvm::LLVMContext &Ctx, llvm::StoreInst *pSt, llvm::MDNode *pAllocaReg, llvm::Value *Index) {
+  llvm::IRBuilder<> B(Ctx);
+  pSt->setMetadata(
+      llvm::StringRef(MDName),
+      llvm::MDNode::get(Ctx, { llvm::ConstantAsMetadata::get(B.getInt32(ID)),
+                               pAllocaReg,
+                               MetadataForValue(Index) }));
+}
+
+bool pix_dxil::PixAllocaRegWrite::FromInst(llvm::StoreInst *pI, std::uint32_t *pRegBase, std::uint32_t *pRegSize, llvm::Value **pIndex) {
+  *pRegBase = 0;
+  *pRegSize = 0;
+  *pIndex = nullptr;
+
+  auto *mdNodes = pI->getMetadata(MDName);
+  if (mdNodes == nullptr || mdNodes->getNumOperands() != 3) {
+    return false;
+  }
+
+  auto *mdID = llvm::mdconst::dyn_extract<llvm::ConstantInt>(mdNodes->getOperand(0));
+  if (mdID == nullptr || mdID->getLimitedValue() != ID) {
+    return false;
+  }
+
+  auto *mdAllocaReg = llvm::dyn_cast<llvm::MDNode>(mdNodes->getOperand(1));
+  if (mdAllocaReg == nullptr || !ParsePixAllocaReg(mdAllocaReg, pRegBase, pRegSize)) {
+    return false;
+  }
+
+  auto *mdIndex = llvm::dyn_cast<llvm::ValueAsMetadata>(mdNodes->getOperand(2));
+  if (mdIndex == nullptr) {
+    return false;
+  }
+  *pIndex = mdIndex->getValue();
+
+  return true;
+}
diff --git a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.h b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.h
new file mode 100644
index 000000000..5db31e961
--- /dev/null
+++ b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.h
@@ -0,0 +1,54 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilPIXVirtualRegisters.h                                                 //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Declares functions for dealing with the virtual register annotations in   //
+// DXIL instructions.                                                        //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+#pragma once
+#include <cstdint>
+
+namespace llvm {
+class AllocaInst;
+class Instruction;
+class LLVMContext;
+class MDNode;
+class StoreInst;
+class Value;
+}  // namespace llvm
+
+namespace pix_dxil {
+namespace PixDxilInstNum {
+static constexpr char MDName[] = "pix-dxil-inst-num";
+static constexpr uint32_t ID = 3;
+
+void AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI, std::uint32_t InstNum);
+bool FromInst(llvm::Instruction *pI, std::uint32_t *pInstNum);
+}  // namespace PixDxilInstNum
+
+namespace PixDxilReg {
+static constexpr char MDName[] = "pix-dxil-reg";
+static constexpr uint32_t ID = 0;
+
+void AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI, std::uint32_t RegNum);
+bool FromInst(llvm::Instruction *pI, std::uint32_t *pRegNum);
+}  // namespace PixDxilReg
+
+namespace PixAllocaReg {
+static constexpr char MDName[] = "pix-alloca-reg";
+static constexpr uint32_t ID = 1;
+
+void AddMD(llvm::LLVMContext &Ctx, llvm::AllocaInst *pAlloca, std::uint32_t RegNum, std::uint32_t Count);
+}  // namespace PixAllocaReg
+
+namespace PixAllocaRegWrite {
+static constexpr char MDName[] = "pix-alloca-reg-write";
+static constexpr uint32_t ID = 2;
+void AddMD(llvm::LLVMContext &Ctx, llvm::StoreInst *pSt, llvm::MDNode *pAllocaReg, llvm::Value *Index);
+bool FromInst(llvm::StoreInst *pI, std::uint32_t *pRegBase, std::uint32_t *pRegSize, llvm::Value **pIndex);
+}  // namespace PixAllocaRegWrite
+}  // namespace pix_dxil
\ No newline at end of file
diff --git a/lib/HLSL/CMakeLists.txt b/lib/HLSL/CMakeLists.txt
index a6afab123..c637f5764 100644
--- a/lib/HLSL/CMakeLists.txt
+++ b/lib/HLSL/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_library(LLVMHLSL
   DxilPackSignatureElement.cpp
   DxilPatchShaderRecordBindings.cpp
   DxilPreserveAllOutputs.cpp
+  DxilSimpleGVNHoist.cpp
   DxilSignatureValidation.cpp
   DxilTargetLowering.cpp
   DxilTargetTransformInfo.cpp
diff --git a/lib/HLSL/DxcOptimizer.cpp b/lib/HLSL/DxcOptimizer.cpp
index 5184abd38..d2c1543b3 100644
--- a/lib/HLSL/DxcOptimizer.cpp
+++ b/lib/HLSL/DxcOptimizer.cpp
@@ -98,11 +98,13 @@ HRESULT SetupRegistryPassForHLSL() {
     initializeDxilLegalizeResourcesPass(Registry);
     initializeDxilLegalizeSampleOffsetPassPass(Registry);
     initializeDxilLoadMetadataPass(Registry);
+    initializeDxilLoopUnrollPass(Registry);
     initializeDxilLowerCreateHandleForLibPass(Registry);
     initializeDxilPrecisePropagatePassPass(Registry);
     initializeDxilPreserveAllOutputsPass(Registry);
     initializeDxilPromoteLocalResourcesPass(Registry);
     initializeDxilPromoteStaticResourcesPass(Registry);
+    initializeDxilSimpleGVNHoistPass(Registry);
     initializeDxilTranslateRawBufferPass(Registry);
     initializeDynamicIndexingVectorToArrayPass(Registry);
     initializeEarlyCSELegacyPassPass(Registry);
@@ -203,7 +205,7 @@ static ArrayRef<LPCSTR> GetPassArgNames(LPCSTR passName) {
   static const LPCSTR LowerExpectIntrinsicArgs[] = { "likely-branch-weight", "unlikely-branch-weight" };
   static const LPCSTR MergeFunctionsArgs[] = { "mergefunc-sanity" };
   static const LPCSTR RewriteSymbolsArgs[] = { "DL", "rewrite-map-file" };
-  static const LPCSTR SROAArgs[] = { "RequiresDomTree", "force-ssa-updater", "sroa-random-shuffle-slices", "sroa-strict-inbounds" };
+  static const LPCSTR SROAArgs[] = { "RequiresDomTree", "SkipHLSLMat", "force-ssa-updater", "sroa-random-shuffle-slices", "sroa-strict-inbounds" };
   static const LPCSTR SROA_DTArgs[] = { "Threshold", "StructMemberThreshold", "ArrayElementThreshold", "ScalarLoadThreshold" };
   static const LPCSTR SROA_SSAUpArgs[] = { "Threshold", "StructMemberThreshold", "ArrayElementThreshold", "ScalarLoadThreshold" };
   static const LPCSTR SampleProfileLoaderArgs[] = { "sample-profile-file", "sample-profile-max-propagate-iterations" };
@@ -276,7 +278,7 @@ static ArrayRef<LPCSTR> GetPassArgDescriptions(LPCSTR passName) {
   static const LPCSTR LowerExpectIntrinsicArgs[] = { "Weight of the branch likely to be taken (default = 64)", "Weight of the branch unlikely to be taken (default = 4)" };
   static const LPCSTR MergeFunctionsArgs[] = { "How many functions in module could be used for MergeFunctions pass sanity check. '0' disables this check. Works only with '-debug' key." };
   static const LPCSTR RewriteSymbolsArgs[] = { "None", "None" };
-  static const LPCSTR SROAArgs[] = { "None", "Force the pass to not use DomTree and mem2reg, insteadforming SSA values through the SSAUpdater infrastructure.", "Enable randomly shuffling the slices to help uncover instability in their order.", "Experiment with completely strict handling of inbounds GEPs." };
+  static const LPCSTR SROAArgs[] = { "None", "None", "Force the pass to not use DomTree and mem2reg, insteadforming SSA values through the SSAUpdater infrastructure.", "Enable randomly shuffling the slices to help uncover instability in their order.", "Experiment with completely strict handling of inbounds GEPs." };
   static const LPCSTR SROA_DTArgs[] = { "None", "None", "None", "None" };
   static const LPCSTR SROA_SSAUpArgs[] = { "None", "None", "None", "None" };
   static const LPCSTR SampleProfileLoaderArgs[] = { "None", "None" };
@@ -341,6 +343,7 @@ static bool IsPassOptionName(StringRef S) {
     ||  S.equals("RequiresDomTree")
     ||  S.equals("Runtime")
     ||  S.equals("ScalarLoadThreshold")
+    ||  S.equals("SkipHLSLMat")
     ||  S.equals("StructMemberThreshold")
     ||  S.equals("TIRA")
     ||  S.equals("TLIImpl")
diff --git a/lib/HLSL/DxilCondenseResources.cpp b/lib/HLSL/DxilCondenseResources.cpp
index d9dd01a1f..c81355cb2 100644
--- a/lib/HLSL/DxilCondenseResources.cpp
+++ b/lib/HLSL/DxilCondenseResources.cpp
@@ -1534,7 +1534,7 @@ Type *UpdateFieldTypeForLegacyLayout(Type *Ty, bool IsCBuf,
       return Ty;
     else
       return ArrayType::get(UpdatedTy, Ty->getArrayNumElements());
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
     DXASSERT(annotation.HasMatrixAnnotation(), "must a matrix");
     unsigned rows, cols;
     Type *EltTy = HLMatrixLower::GetMatrixInfo(Ty, cols, rows);
diff --git a/lib/HLSL/DxilContainerReflection.cpp b/lib/HLSL/DxilContainerReflection.cpp
index 4627102ce..1b284b6bd 100644
--- a/lib/HLSL/DxilContainerReflection.cpp
+++ b/lib/HLSL/DxilContainerReflection.cpp
@@ -765,7 +765,7 @@ HRESULT CShaderReflectionType::Initialize(
     llvm::Type* elementType = type->getArrayElementType();
 
     // Note: At this point an HLSL matrix type may appear as an ordinary
-    // array (not wrapped in a `struct`), so `HLMatrixLower::IsMatrixType()`
+    // array (not wrapped in a `struct`), so `dxilutil::IsHLSLMatrixType()`
     // is not sufficient. Instead we need to check the field annotation.
     //
     // We might have an array of matrices, though, so we only exit if
diff --git a/lib/HLSL/DxilGenerationPass.cpp b/lib/HLSL/DxilGenerationPass.cpp
index 35b91d4db..5a64db19d 100644
--- a/lib/HLSL/DxilGenerationPass.cpp
+++ b/lib/HLSL/DxilGenerationPass.cpp
@@ -1412,8 +1412,17 @@ void DxilTranslateRawBuffer::ReplaceRawBufferLoad64Bit(Function *F, Type *EltTy,
       for (unsigned i = 0; i < size; i++) {
         if (i == 2) {
           // Update offset 4 by 4 bytes.
-          args[DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx] =
+          if (isa<UndefValue>(offset)) {
+            // [RW]ByteAddressBuffer has undef element offset -> update index
+            Value *index = CI->getArgOperand(DXIL::OperandIndex::kRawBufferLoadIndexOpIdx);
+            args[DXIL::OperandIndex::kRawBufferLoadIndexOpIdx] =
+              Builder.CreateAdd(index, Builder.getInt32(4 * 4));
+          }
+          else {
+            // [RW]StructuredBuffer -> update element offset
+            args[DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx] =
               Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
+          }
           args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
               Builder.getInt8(maskHi);
           newLd = Builder.CreateCall(bufLd, args);
@@ -1531,10 +1540,20 @@ void DxilTranslateRawBuffer::ReplaceRawBufferStore64Bit(Function *F, Type *ETy,
       Builder.CreateCall(newFunction, args);
 
       if (maskHi) {
-        Value *offset = args[DXIL::OperandIndex::kBufferStoreCoord1OpIdx];
         // Update offset 4 by 4 bytes.
-        offset = Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
-        args[DXIL::OperandIndex::kRawBufferStoreElementOffsetOpIdx] = offset;
+        Value *offset = args[DXIL::OperandIndex::kBufferStoreCoord1OpIdx];
+        if (isa<UndefValue>(offset)) {
+          // [RW]ByteAddressBuffer has element offset == undef -> update index instead
+          Value *index = args[DXIL::OperandIndex::kBufferStoreCoord0OpIdx];
+          index = Builder.CreateAdd(index, Builder.getInt32(4 * 4));
+          args[DXIL::OperandIndex::kRawBufferStoreIndexOpIdx] = index;
+        }
+        else {
+          // [RW]StructuredBuffer -> update element offset
+          offset = Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
+          args[DXIL::OperandIndex::kRawBufferStoreElementOffsetOpIdx] = offset;
+        }
+        
         args[DXIL::OperandIndex::kRawBufferStoreMaskOpIdx] =
             Builder.getInt8(maskHi);
         args[DXIL::OperandIndex::kRawBufferStoreVal0OpIdx] = vals32[4];
diff --git a/lib/HLSL/DxilLinker.cpp b/lib/HLSL/DxilLinker.cpp
index 078178fd4..3ac0d25f3 100644
--- a/lib/HLSL/DxilLinker.cpp
+++ b/lib/HLSL/DxilLinker.cpp
@@ -1027,7 +1027,7 @@ void DxilLinkJob::RunPreparePass(Module &M) {
   PM.add(createDxilDeadFunctionEliminationPass());
 
   // SROA
-  PM.add(createSROAPass(/*RequiresDomTree*/false));
+  PM.add(createSROAPass(/*RequiresDomTree*/false, /*SkipHLSLMat*/false));
 
   // Remove MultiDimArray from function call arg.
   PM.add(createMultiDimArrayToOneDimArrayPass());
diff --git a/lib/HLSL/DxilSimpleGVNHoist.cpp b/lib/HLSL/DxilSimpleGVNHoist.cpp
new file mode 100644
index 000000000..0c053e87e
--- /dev/null
+++ b/lib/HLSL/DxilSimpleGVNHoist.cpp
@@ -0,0 +1,566 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilSimpleGVNHoist.cpp                                                    //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// A simple version of GVN hoist for DXIL.                                   //
+// Based on GVNHoist in LLVM 6.0.                                            //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/HLSL/DxilGenerationPass.h"
+#include "dxc/DXIL/DxilOperations.h"
+
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/CFG.h"
+
+using namespace llvm;
+using namespace hlsl;
+
+///////////////////////////////////////////////////////////////////////////////
+namespace {
+struct Expression { // Value-numbering key: opcode, type and operand numbers.
+  uint32_t opcode;
+  Type *type = nullptr; // Stays null for the empty/tombstone DenseMap keys.
+  bool commutative = false;
+  SmallVector<uint32_t, 4> varargs; // Operand value numbers (plus indices).
+
+  Expression(uint32_t o = ~2U) : opcode(o) {}
+
+  bool operator==(const Expression &other) const {
+    if (opcode != other.opcode)
+      return false;
+    if (opcode == ~0U || opcode == ~1U) // Empty/tombstone keys have no payload.
+      return true;
+    if (type != other.type)
+      return false;
+    if (varargs != other.varargs)
+      return false;
+    return true;
+  }
+
+  friend hash_code hash_value(const Expression &Value) {
+    return hash_combine(
+        Value.opcode, Value.type,
+        hash_combine_range(Value.varargs.begin(), Value.varargs.end()));
+  }
+};
+
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<Expression> {
+  static inline Expression getEmptyKey() { return ~0U; }
+  static inline Expression getTombstoneKey() { return ~1U; }
+
+  static unsigned getHashValue(const Expression &e) {
+    using llvm::hash_value;
+
+    return static_cast<unsigned>(hash_value(e));
+  }
+
+  static bool isEqual(const Expression &LHS, const Expression &RHS) {
+    return LHS == RHS;
+  }
+};
+} // namespace llvm
+
+namespace {
+// Simple Value table which support DXIL operation.
+class ValueTable {
+  DenseMap<Value *, uint32_t> valueNumbering; // Value -> value number.
+  DenseMap<Expression, uint32_t> expressionNumbering; // Expression -> number.
+
+  // Expressions is the vector of Expression. ExprIdx is the mapping from
+  // value number to the index of Expression in Expressions. We use it
+  // instead of a DenseMap because filling such mapping is faster than
+  // filling a DenseMap and the compile time is a little better.
+  uint32_t nextExprNumber = 0; // Read by assignExpNewValueNum before any clear().
+
+  std::vector<Expression> Expressions;
+  std::vector<uint32_t> ExprIdx;
+
+  DominatorTree *DT = nullptr; // Unset until setDomTree() is called.
+
+  uint32_t nextValueNumber = 1; // 0 is what lookup() reports for "unnumbered".
+
+  Expression createExpr(Instruction *I);
+  Expression createCmpExpr(unsigned Opcode, CmpInst::Predicate Predicate,
+                           Value *LHS, Value *RHS);
+  Expression createExtractvalueExpr(ExtractValueInst *EI);
+  uint32_t lookupOrAddCall(CallInst *C);
+
+  std::pair<uint32_t, bool> assignExpNewValueNum(Expression &exp);
+
+public:
+  ValueTable();
+  ValueTable(const ValueTable &Arg);
+  ValueTable(ValueTable &&Arg);
+  ~ValueTable();
+
+  uint32_t lookupOrAdd(Value *V);
+  uint32_t lookup(Value *V, bool Verify = true) const;
+  uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS,
+                          Value *RHS);
+  bool exists(Value *V) const;
+  void add(Value *V, uint32_t num);
+  void clear();
+  void erase(Value *v);
+  void setDomTree(DominatorTree *D) { DT = D; }
+  uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
+  void verifyRemoved(const Value *) const;
+};
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+Expression ValueTable::createExpr(Instruction *I) {
+  // Key the expression on result type, opcode and operand value numbers.
+  Expression Exp;
+  Exp.opcode = I->getOpcode();
+  Exp.type = I->getType();
+  for (Use &Operand : I->operands())
+    Exp.varargs.push_back(lookupOrAdd(Operand));
+
+  if (I->isCommutative()) {
+    // Canonicalize operand order so that permutations of the same two
+    // operands hash to the same value number. Every commutative instruction
+    // is binary, so one compare-and-swap replaces a general sort.
+    assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+    if (Exp.varargs[1] < Exp.varargs[0])
+      std::swap(Exp.varargs[0], Exp.varargs[1]);
+    Exp.commutative = true;
+  }
+
+  if (auto *Cmp = dyn_cast<CmpInst>(I)) {
+    // Order the operand value numbers and swap the predicate to match, so
+    // that x<y and y>x share one value number.
+    CmpInst::Predicate Pred = Cmp->getPredicate();
+    if (Exp.varargs[1] < Exp.varargs[0]) {
+      std::swap(Exp.varargs[0], Exp.varargs[1]);
+      Pred = CmpInst::getSwappedPredicate(Pred);
+    }
+    Exp.opcode = (Cmp->getOpcode() << 8) | Pred;
+    Exp.commutative = true;
+  } else if (auto *Insert = dyn_cast<InsertValueInst>(I)) {
+    // The aggregate indices are part of an insertvalue's identity.
+    for (unsigned Idx : Insert->getIndices())
+      Exp.varargs.push_back(Idx);
+  }
+
+  return Exp;
+}
+
+Expression ValueTable::createCmpExpr(unsigned Opcode,
+    CmpInst::Predicate Predicate,
+    Value *LHS, Value *RHS) {
+    assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+        "Not a comparison!");
+    Expression e;
+    e.type = CmpInst::makeCmpResultType(LHS->getType());
+    e.varargs.push_back(lookupOrAdd(LHS));
+    e.varargs.push_back(lookupOrAdd(RHS));
+
+    // Sort the operand value numbers so x<y and y>x get the same value number.
+    if (e.varargs[0] > e.varargs[1]) {
+        std::swap(e.varargs[0], e.varargs[1]);
+        Predicate = CmpInst::getSwappedPredicate(Predicate);
+    }
+    e.opcode = (Opcode << 8) | Predicate;
+    e.commutative = true;
+    return e;
+}
+
+Expression ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
+    assert(EI && "Not an ExtractValueInst?");
+    Expression e;
+    e.type = EI->getType();
+    e.opcode = 0;
+
+    IntrinsicInst *I = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+    if (I != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
+        // EI might be an extract from one of our recognised intrinsics. If it
+        // is we'll synthesize a semantically equivalent expression instead on
+        // an extract value expression.
+        switch (I->getIntrinsicID()) {
+        case Intrinsic::sadd_with_overflow:
+        case Intrinsic::uadd_with_overflow:
+            e.opcode = Instruction::Add;
+            break;
+        case Intrinsic::ssub_with_overflow:
+        case Intrinsic::usub_with_overflow:
+            e.opcode = Instruction::Sub;
+            break;
+        case Intrinsic::smul_with_overflow:
+        case Intrinsic::umul_with_overflow:
+            e.opcode = Instruction::Mul;
+            break;
+        default:
+            break;
+        }
+
+        if (e.opcode != 0) {
+            // Intrinsic recognized. Grab its args to finish building the expression.
+            assert(I->getNumArgOperands() == 2 &&
+                "Expect two args for recognised intrinsics.");
+            e.varargs.push_back(lookupOrAdd(I->getArgOperand(0)));
+            e.varargs.push_back(lookupOrAdd(I->getArgOperand(1)));
+            return e;
+        }
+    }
+
+    // Not a recognised intrinsic. Fall back to producing an extract value
+    // expression.
+    e.opcode = EI->getOpcode();
+    for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
+        OI != OE; ++OI)
+        e.varargs.push_back(lookupOrAdd(*OI));
+
+    for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end();
+        II != IE; ++II)
+        e.varargs.push_back(*II);
+
+    return e;
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+ValueTable::ValueTable() = default;
+ValueTable::ValueTable(const ValueTable &) = default;
+ValueTable::ValueTable(ValueTable &&) = default;
+ValueTable::~ValueTable() = default;
+
+/// add - Insert a value into the table with a specified value number.
+void ValueTable::add(Value *V, uint32_t num) {
+    valueNumbering.insert(std::make_pair(V, num));
+}
+
+/// Numbers a call. readnone callees and the read-only DXIL ops listed below are
+/// numbered by expression; all other calls, indirect ones included, are unique.
+uint32_t ValueTable::lookupOrAddCall(CallInst *C) {
+  Function *F = C->getCalledFunction(); // Null when the call is indirect.
+  bool bSafe = false;
+  if (F && F->hasFnAttribute(Attribute::ReadNone)) {
+    bSafe = true;
+  } else if (F && F->hasFnAttribute(Attribute::ReadOnly) &&
+             hlsl::OP::IsDxilOpFunc(F)) {
+    switch (hlsl::OP::GetDxilOpFuncCallInst(C)) {
+    default:
+      break;
+      // TODO: make buffer/texture load on srv safe.
+    case DXIL::OpCode::CreateHandleForLib:
+    case DXIL::OpCode::CBufferLoad:
+    case DXIL::OpCode::CBufferLoadLegacy:
+    case DXIL::OpCode::Sample:
+    case DXIL::OpCode::SampleBias:
+    case DXIL::OpCode::SampleCmp:
+    case DXIL::OpCode::SampleCmpLevelZero:
+    case DXIL::OpCode::SampleGrad:
+    case DXIL::OpCode::CheckAccessFullyMapped:
+    case DXIL::OpCode::GetDimensions:
+    case DXIL::OpCode::TextureGather:
+    case DXIL::OpCode::TextureGatherCmp:
+    case DXIL::OpCode::Texture2DMSGetSamplePosition:
+    case DXIL::OpCode::RenderTargetGetSampleCount:
+    case DXIL::OpCode::RenderTargetGetSamplePosition:
+    case DXIL::OpCode::CalculateLOD:
+      bSafe = true;
+      break;
+    }
+  }
+  if (bSafe) {
+    Expression exp = createExpr(C);
+    uint32_t e = assignExpNewValueNum(exp).first;
+    valueNumbering[C] = e;
+    return e;
+  } else {
+    // Not sure safe or not, always use new value number.
+    valueNumbering[C] = nextValueNumber;
+    return nextValueNumber++;
+  }
+}
+
+/// Returns true if a value number exists for the specified value.
+bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t ValueTable::lookupOrAdd(Value *V) {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    return VI->second;
+
+  if (!isa<Instruction>(V)) {
+    valueNumbering[V] = nextValueNumber;
+    return nextValueNumber++;
+  }
+
+  Instruction* I = cast<Instruction>(V);
+  Expression exp;
+  switch (I->getOpcode()) {
+    case Instruction::Call:
+      return lookupOrAddCall(cast<CallInst>(I));
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+    case Instruction::Select:
+    case Instruction::ExtractElement:
+    case Instruction::InsertElement:
+    case Instruction::ShuffleVector:
+    case Instruction::InsertValue:
+    case Instruction::GetElementPtr:
+      exp = createExpr(I);
+      break;
+    case Instruction::ExtractValue:
+      exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
+      break;
+    case Instruction::PHI:
+      valueNumbering[V] = nextValueNumber;
+      return nextValueNumber++;
+    default:
+      valueNumbering[V] = nextValueNumber;
+      return nextValueNumber++;
+  }
+
+  uint32_t e = assignExpNewValueNum(exp).first;
+  valueNumbering[V] = e;
+  return e;
+}
+
+/// Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t ValueTable::lookup(Value *V, bool Verify) const {
+  DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
+  if (Verify) {
+    assert(VI != valueNumbering.end() && "Value not numbered?");
+    return VI->second;
+  }
+  return (VI != valueNumbering.end()) ? VI->second : 0;
+}
+
+/// Returns the value number of the given comparison,
+/// assigning it a new number if it did not have one before.  Useful when
+/// we deduced the result of a comparison, but don't immediately have an
+/// instruction realizing that comparison to hand.
+uint32_t ValueTable::lookupOrAddCmp(unsigned Opcode,
+                                         CmpInst::Predicate Predicate,
+                                         Value *LHS, Value *RHS) {
+  Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
+  return assignExpNewValueNum(exp).first;
+}
+
+/// Remove all entries from the ValueTable.
+void ValueTable::clear() {
+  valueNumbering.clear();
+  expressionNumbering.clear();
+  nextValueNumber = 1;
+  Expressions.clear();
+  ExprIdx.clear();
+  nextExprNumber = 0;
+}
+
+/// Remove a value from the value numbering.
+void ValueTable::erase(Value *V) {
+  valueNumbering.erase(V);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void ValueTable::verifyRemoved(const Value *V) const {
+  for (DenseMap<Value*, uint32_t>::const_iterator
+         I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+    assert(I->first != V && "Inst still occurs in value numbering map!");
+  }
+}
+
+/// Return a pair the first field showing the value number of \p Exp and the
+/// second field showing whether it is a value number newly created.
+std::pair<uint32_t, bool>
+ValueTable::assignExpNewValueNum(Expression &Exp) {
+  uint32_t &e = expressionNumbering[Exp];
+  bool CreateNewValNum = !e;
+  if (CreateNewValNum) {
+    Expressions.push_back(Exp);
+    if (ExprIdx.size() < nextValueNumber + 1)
+      ExprIdx.resize(nextValueNumber * 2);
+    e = nextValueNumber;
+    ExprIdx[nextValueNumber++] = nextExprNumber++;
+  }
+  return {e, CreateNewValNum};
+}
+
+} // namespace
+
+namespace {
+// Reduce code size for pattern like this:
+// if (a.x > 0) {
+//  r = tex.Sample(ss, uv)-1;
+// } else {
+//  if (a.y > 0)
+//    r = tex.Sample(ss, uv);
+//  else
+//    r = tex.Sample(ss, uv) + 3;
+// }
+class DxilSimpleGVNHoist : public FunctionPass {
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DxilSimpleGVNHoist() : FunctionPass(ID) {}
+
+  const char *getPassName() const override {
+    return "DXIL simple GVN hoist";
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  bool tryToHoist(BasicBlock *BB, BasicBlock *Succ0, BasicBlock *Succ1);
+};
+
+char DxilSimpleGVNHoist::ID = 0;
+
+bool HasOnePred(BasicBlock *BB) {
+  if (pred_empty(BB))
+    return false;
+
+  auto pred = pred_begin(BB);
+  pred++;
+  if (pred != pred_end(BB))
+    return false;
+  return true;
+}
+
+bool DxilSimpleGVNHoist::tryToHoist(BasicBlock *BB, BasicBlock *Succ0,
+                                    BasicBlock *Succ1) {
+  // Value-number Succ0, then Succ1, with one shared table; returns true if any
+  // instruction was merged (and hence the function was modified).
+  ValueTable VT;
+  DenseMap<uint32_t, SmallVector<Instruction *, 2>> VNtoInsts;
+  for (Instruction &I : *Succ0) {
+    uint32_t V = VT.lookupOrAdd(&I);
+    VNtoInsts[V].emplace_back(&I);
+  }
+
+  std::vector<uint32_t> HoistCandidateVN; // VNs seen in both blocks, Succ1 order.
+
+  for (Instruction &I : *Succ1) {
+    uint32_t V = VT.lookupOrAdd(&I);
+    if (!VNtoInsts.count(V)) // No equivalent instruction in Succ0.
+      continue;
+    VNtoInsts[V].emplace_back(&I);
+    HoistCandidateVN.emplace_back(V);
+  }
+
+  if (HoistCandidateVN.empty()) {
+    return false;
+  }
+
+  DenseSet<uint32_t> ProcessedVN;
+  Instruction *TI = BB->getTerminator();
+  // Hoisting must follow instruction order so operands are hoisted before users.
+  for (uint32_t VN : HoistCandidateVN) {
+    // Skip a VN that was already handled (it can appear more than once).
+    if (ProcessedVN.count(VN))
+      continue;
+    ProcessedVN.insert(VN);
+
+    auto &Insts = VNtoInsts[VN];
+    if (Insts.size() == 1)
+      continue;
+    bool bHoist = false; // True when at least one duplicate lives in Succ1.
+    for (Instruction *I : Insts) {
+      if (I->getParent() == Succ1) {
+        bHoist = true;
+        break;
+      }
+    }
+
+    Instruction *FirstI = Insts.front(); // Succ0 entries were appended first.
+    if (bHoist) {
+      // Move FirstI to the end of BB, just before its terminator.
+      FirstI->removeFromParent();
+      FirstI->insertBefore(TI);
+    }
+    // Fold every other instruction with this value number into FirstI.
+    // NOTE(review): IR flags (nsw/exact/fast-math) are not intersected; confirm.
+    for (auto it = std::next(Insts.begin()); it != Insts.end(); it++) {
+      Instruction *I = *it;
+      I->replaceAllUsesWith(FirstI);
+      I->eraseFromParent();
+    }
+    Insts.clear();
+  }
+  return true;
+}
+
+bool DxilSimpleGVNHoist::runOnFunction(Function &F) {
+  BasicBlock &Entry = F.getEntryBlock();
+  bool bUpdated = false;
+  for (auto it = po_begin(&Entry); it != po_end(&Entry); it++) {
+    BasicBlock *BB = *it;
+    TerminatorInst *TI = BB->getTerminator();
+    if (TI->getNumSuccessors() != 2)
+      continue;
+    BasicBlock *Succ0 = TI->getSuccessor(0);
+    BasicBlock *Succ1 = TI->getSuccessor(1);
+    if (BB == Succ0)
+      continue;
+    if (BB == Succ1)
+      continue;
+
+    if (!HasOnePred(Succ0))
+      continue;
+    if (!HasOnePred(Succ1))
+      continue;
+    bUpdated |= tryToHoist(BB, Succ0, Succ1);
+  }
+  return bUpdated;
+}
+
+}
+
+FunctionPass *llvm::createDxilSimpleGVNHoistPass() {
+  return new DxilSimpleGVNHoist();
+}
+
+INITIALIZE_PASS(DxilSimpleGVNHoist, "dxil-gvn-hoist",
+                "DXIL simple gvn hoist", false, false)
diff --git a/lib/HLSL/HLMatrixLowerPass.cpp b/lib/HLSL/HLMatrixLowerPass.cpp
index 40c35e8fe..a5a7e474b 100644
--- a/lib/HLSL/HLMatrixLowerPass.cpp
+++ b/lib/HLSL/HLMatrixLowerPass.cpp
@@ -36,20 +36,6 @@ using namespace hlsl::HLMatrixLower;
 namespace hlsl {
 namespace HLMatrixLower {
 
-bool IsMatrixType(Type *Ty) {
-  if (StructType *ST = dyn_cast<StructType>(Ty)) {
-    Type *EltTy = ST->getElementType(0);
-    if (!ST->getName().startswith("class.matrix"))
-      return false;
-
-    bool isVecArray = EltTy->isArrayTy() &&
-           EltTy->getArrayElementType()->isVectorTy();
-
-    return isVecArray && EltTy->getArrayNumElements() <= 4;
-  }
-  return false;
-}
-
 // If user is function call, return param annotation to get matrix major.
 DxilFieldAnnotation *FindAnnotationFromMatUser(Value *Mat,
                                                DxilTypeSystem &typeSys) {
@@ -69,7 +55,7 @@ DxilFieldAnnotation *FindAnnotationFromMatUser(Value *Mat,
 }
 
 // Translate matrix type to vector type.
-Type *LowerMatrixType(Type *Ty) {
+Type *LowerMatrixType(Type *Ty, bool forMem) {
   // Only translate matrix type and function type which use matrix type.
   // Not translate struct has matrix or matrix pointer.
   // Struct should be flattened before.
@@ -81,9 +67,11 @@ Type *LowerMatrixType(Type *Ty) {
       params.emplace_back(LowerMatrixType(param));
     }
     return FunctionType::get(RetTy, params, false);
-  } else if (IsMatrixType(Ty)) {
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
     unsigned row, col;
     Type *EltTy = GetMatrixInfo(Ty, col, row);
+    if (forMem && EltTy->isIntegerTy(1))
+      EltTy = Type::getInt32Ty(Ty->getContext());
     return VectorType::get(EltTy, row * col);
   } else {
     return Ty;
@@ -92,7 +80,7 @@ Type *LowerMatrixType(Type *Ty) {
 
 // Translate matrix type to array type.
 Type *LowerMatrixTypeToOneDimArray(Type *Ty) {
-  if (IsMatrixType(Ty)) {
+  if (dxilutil::IsHLSLMatrixType(Ty)) {
     unsigned row, col;
     Type *EltTy = GetMatrixInfo(Ty, col, row);
     return ArrayType::get(EltTy, row * col);
@@ -103,7 +91,7 @@ Type *LowerMatrixTypeToOneDimArray(Type *Ty) {
 
 
 Type *GetMatrixInfo(Type *Ty, unsigned &col, unsigned &row) {
-  DXASSERT(IsMatrixType(Ty), "not matrix type");
+  DXASSERT(dxilutil::IsHLSLMatrixType(Ty), "not matrix type");
   StructType *ST = cast<StructType>(Ty);
   Type *EltTy = ST->getElementType(0);
   Type *RowTy = EltTy->getArrayElementType();
@@ -120,9 +108,9 @@ bool IsMatrixArrayPointer(llvm::Type *Ty) {
     return false;
   while (Ty->isArrayTy())
     Ty = Ty->getArrayElementType();
-  return IsMatrixType(Ty);
+  return dxilutil::IsHLSLMatrixType(Ty);
 }
-Type *LowerMatrixArrayPointer(Type *Ty) {
+Type *LowerMatrixArrayPointer(Type *Ty, bool forMem) {
   unsigned addrSpace = Ty->getPointerAddressSpace();
   Ty = Ty->getPointerElementType();
   std::vector<unsigned> arraySizeList;
@@ -130,7 +118,7 @@ Type *LowerMatrixArrayPointer(Type *Ty) {
     arraySizeList.push_back(Ty->getArrayNumElements());
     Ty = Ty->getArrayElementType();
   }
-  Ty = LowerMatrixType(Ty);
+  Ty = LowerMatrixType(Ty, forMem);
 
   for (auto arraySize = arraySizeList.rbegin();
        arraySize != arraySizeList.rend(); arraySize++)
@@ -155,13 +143,69 @@ Type *LowerMatrixArrayPointerToOneDimArray(Type *Ty) {
   return PointerType::get(Ty, addrSpace);
 }
 Value *BuildVector(Type *EltTy, unsigned size, ArrayRef<llvm::Value *> elts,
-                   IRBuilder<> &Builder) {
+  IRBuilder<> &Builder) {
   Value *Vec = UndefValue::get(VectorType::get(EltTy, size));
   for (unsigned i = 0; i < size; i++)
     Vec = Builder.CreateInsertElement(Vec, elts[i], i);
   return Vec;
 }
 
+llvm::Value *VecMatrixMemToReg(llvm::Value *VecVal, llvm::Type *MatType,
+  llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatRegTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/false);
+  if (VecVal->getType() == VecMatRegTy) {
+    return VecVal;
+  }
+
+  DXASSERT(VecMatRegTy->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix mem to reg type mismatch should only happen for bools.");
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  return Builder.CreateICmpNE(VecVal, Constant::getNullValue(VecMatMemTy));
+}
+
+llvm::Value *VecMatrixRegToMem(llvm::Value* VecVal, llvm::Type *MatType,
+  llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  if (VecVal->getType() == VecMatMemTy) {
+    return VecVal;
+  }
+
+  DXASSERT(VecVal->getType()->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix reg to mem type mismatch should only happen for bools.");
+  return Builder.CreateZExt(VecVal, VecMatMemTy);
+}
+
+llvm::Instruction *CreateVecMatrixLoad(
+  llvm::Value *VecPtr, llvm::Type *MatType, llvm::IRBuilder<> &Builder)
+{
+  llvm::Instruction *VecVal = Builder.CreateLoad(VecPtr);
+  return cast<llvm::Instruction>(VecMatrixMemToReg(VecVal, MatType, Builder));
+}
+
+llvm::Instruction *CreateVecMatrixStore(llvm::Value* VecVal, llvm::Value *VecPtr,
+  llvm::Type *MatType, llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  if (VecVal->getType() == VecMatMemTy) {
+    return Builder.CreateStore(VecVal, VecPtr);
+  }
+
+  // We need to convert to the memory representation, and we want to return
+  // the conversion instruction rather than the store since that's what
+  // accepts the register-typed i1 values.
+
+  // Do not use VecMatrixRegToMem as it may constant fold the conversion
+  // instruction, which is what we want to return.
+  DXASSERT(VecVal->getType()->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix reg to mem type mismatch should only happen for bools.");
+
+  llvm::Instruction *ConvInst = Builder.Insert(new ZExtInst(VecVal, VecMatMemTy));
+  Builder.CreateStore(ConvInst, VecPtr);
+  return ConvInst;
+}
+
 Value *LowerGEPOnMatIndexListToIndex(
     llvm::GetElementPtrInst *GEP, ArrayRef<Value *> IdxList) {
   IRBuilder<> Builder(GEP);
@@ -353,41 +397,29 @@ INITIALIZE_PASS(HLMatrixLowerPass, "hlmatrixlower", "HLSL High-Level Matrix Lowe
 
 static Instruction *CreateTypeCast(HLCastOpcode castOp, Type *toTy, Value *src,
                                    IRBuilder<> Builder) {
-  // Cast to bool.
-  if (toTy->getScalarType()->isIntegerTy() &&
-      toTy->getScalarType()->getIntegerBitWidth() == 1) {
-    Type *fromTy = src->getType();
-    bool isFloat = fromTy->getScalarType()->isFloatingPointTy();
-    Constant *zero;
-    if (isFloat)
-      zero = llvm::ConstantFP::get(fromTy->getScalarType(), 0);
-    else
-      zero = llvm::ConstantInt::get(fromTy->getScalarType(), 0);
+  Type *srcTy = src->getType();
 
-    if (toTy->getScalarType() != toTy) {
-      // Create constant vector.
-      unsigned size = toTy->getVectorNumElements();
-      std::vector<Constant *> zeros(size, zero);
-      zero = llvm::ConstantVector::get(zeros);
-    }
-    if (isFloat)
-      return cast<Instruction>(Builder.CreateFCmpOEQ(src, zero));
-    else
-      return cast<Instruction>(Builder.CreateICmpEQ(src, zero));
-  }
-
-  Type *eltToTy = toTy->getScalarType();
-  Type *eltFromTy = src->getType()->getScalarType();
+  // Conversions between equivalent types are no-ops,
+  // even between signed/unsigned variants.
+  if (srcTy == toTy) return cast<Instruction>(src);
 
   bool fromUnsigned = castOp == HLCastOpcode::FromUnsignedCast ||
                       castOp == HLCastOpcode::UnsignedUnsignedCast;
   bool toUnsigned = castOp == HLCastOpcode::ToUnsignedCast ||
                     castOp == HLCastOpcode::UnsignedUnsignedCast;
 
-  Instruction::CastOps castOps = static_cast<Instruction::CastOps>(
-      HLModule::FindCastOp(fromUnsigned, toUnsigned, eltFromTy, eltToTy));
+  // Conversions to bools are comparisons
+  if (toTy->getScalarSizeInBits() == 1) {
+    // fcmp une is what regular clang uses in C++ for (bool)f;
+    return cast<Instruction>(srcTy->isIntOrIntVectorTy()
+      ? Builder.CreateICmpNE(src, llvm::Constant::getNullValue(srcTy), "tobool")
+      : Builder.CreateFCmpUNE(src, llvm::Constant::getNullValue(srcTy), "tobool"));
+  }
 
-  return cast<Instruction>(Builder.CreateCast(castOps, src, toTy));
+  // Cast necessary
+  auto CastOp = static_cast<Instruction::CastOps>(HLModule::GetNumericCastOp(
+    srcTy, fromUnsigned, toTy, toUnsigned));
+  return cast<Instruction>(Builder.CreateCast(CastOp, src, toTy));
 }
 
 Instruction *HLMatrixLowerPass::MatCastToVec(CallInst *CI) {
@@ -395,8 +427,8 @@ Instruction *HLMatrixLowerPass::MatCastToVec(CallInst *CI) {
   Value *op = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx);
   HLCastOpcode opcode = static_cast<HLCastOpcode>(GetHLOpcode(CI));
 
-  bool ToMat = IsMatrixType(CI->getType());
-  bool FromMat = IsMatrixType(op->getType());
+  bool ToMat = dxilutil::IsHLSLMatrixType(CI->getType());
+  bool FromMat = dxilutil::IsHLSLMatrixType(op->getType());
   if (ToMat && !FromMat) {
     // Translate OtherToMat here.
     // Rest will translated when replace.
@@ -468,11 +500,11 @@ Instruction *HLMatrixLowerPass::MatCastToVec(CallInst *CI) {
 // UDT alloca must be there for library function args
 static GetElementPtrInst *GetIfMatrixGEPOfUDTAlloca(Value *V) {
   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
-    if (IsMatrixType(GEP->getResultElementType())) {
+    if (dxilutil::IsHLSLMatrixType(GEP->getResultElementType())) {
       Value *ptr = GEP->getPointerOperand();
       if (AllocaInst *AI = dyn_cast<AllocaInst>(ptr)) {
         Type *ATy = AI->getAllocatedType();
-        if (ATy->isStructTy() && !IsMatrixType(ATy)) {
+        if (ATy->isStructTy() && !dxilutil::IsHLSLMatrixType(ATy)) {
           return GEP;
         }
       }
@@ -485,7 +517,7 @@ static GetElementPtrInst *GetIfMatrixGEPOfUDTAlloca(Value *V) {
 // none-graphics functions.
 static GetElementPtrInst *GetIfMatrixGEPOfUDTArg(Value *V, HLModule &HM) {
   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
-    if (IsMatrixType(GEP->getResultElementType())) {
+    if (dxilutil::IsHLSLMatrixType(GEP->getResultElementType())) {
       Value *ptr = GEP->getPointerOperand();
       if (Argument *Arg = dyn_cast<Argument>(ptr)) {
         if (!HM.IsGraphicsShader(Arg->getParent()))
@@ -508,7 +540,7 @@ Instruction *HLMatrixLowerPass::MatLdStToVec(CallInst *CI) {
     if (isa<AllocaInst>(matPtr) || GetIfMatrixGEPOfUDTAlloca(matPtr) ||
         GetIfMatrixGEPOfUDTArg(matPtr, *m_pHLModule)) {
       Value *vecPtr = matToVecMap[cast<Instruction>(matPtr)];
-      result = Builder.CreateLoad(vecPtr);
+      result = CreateVecMatrixLoad(vecPtr, matPtr->getType()->getPointerElementType(), Builder);
     } else
       result = MatIntrinsicToVec(CI);
   } break;
@@ -519,9 +551,8 @@ Instruction *HLMatrixLowerPass::MatLdStToVec(CallInst *CI) {
         GetIfMatrixGEPOfUDTArg(matPtr, *m_pHLModule)) {
       Value *vecPtr = matToVecMap[cast<Instruction>(matPtr)];
       Value *matVal = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      Value *vecVal =
-          UndefValue::get(HLMatrixLower::LowerMatrixType(matVal->getType()));
-      result = Builder.CreateStore(vecVal, vecPtr);
+      Value *vecVal = UndefValue::get(HLMatrixLower::LowerMatrixType(matVal->getType()));
+      result = CreateVecMatrixStore(vecVal, vecPtr, matVal->getType(), Builder);
     } else
       result = MatIntrinsicToVec(CI);
   } break;
@@ -609,7 +640,7 @@ Instruction *HLMatrixLowerPass::MatIntrinsicToVec(CallInst *CI) {
   SmallVector<Value *, 4> argList;
   for (Value *arg : CI->arg_operands()) {
     Type *Ty = arg->getType();
-    if (IsMatrixType(Ty)) {
+    if (dxilutil::IsHLSLMatrixType(Ty)) {
       argList.emplace_back(UndefValue::get(LowerMatrixType(Ty)));
     } else
       argList.emplace_back(arg);
@@ -625,47 +656,53 @@ Instruction *HLMatrixLowerPass::TrivialMatUnOpToVec(CallInst *CI) {
   HLUnaryOpcode opcode = static_cast<HLUnaryOpcode>(GetHLOpcode(CI));
   bool isFloat = ResultTy->getVectorElementType()->isFloatingPointTy();
 
-  auto GetVecConst = [&](Type *Ty, int v) -> Constant * {
-    Constant *val = isFloat ? ConstantFP::get(Ty->getScalarType(), v)
-                            : ConstantInt::get(Ty->getScalarType(), v);
-    std::vector<Constant *> vals(Ty->getVectorNumElements(), val);
-    return ConstantVector::get(vals);
-  };
-
-  Constant *one = GetVecConst(ResultTy, 1);
+  Constant *one = isFloat
+    ? ConstantFP::get(ResultTy->getVectorElementType(), 1)
+    : ConstantInt::get(ResultTy->getVectorElementType(), 1);
+  Constant *oneVec = ConstantVector::getSplat(ResultTy->getVectorNumElements(), one);
 
   Instruction *Result = nullptr;
   switch (opcode) {
+  case HLUnaryOpcode::Plus: {
+    // This is actually a no-op, but the structure of the code here requires
+    // that we create an instruction.
+    Constant *zero = Constant::getNullValue(ResultTy);
+    if (isFloat)
+      Result = BinaryOperator::CreateFAdd(tmp, zero);
+    else
+      Result = BinaryOperator::CreateAdd(tmp, zero);
+  } break;
   case HLUnaryOpcode::Minus: {
-    Constant *zero = GetVecConst(ResultTy, 0);
+    Constant *zero = Constant::getNullValue(ResultTy);
     if (isFloat)
       Result = BinaryOperator::CreateFSub(zero, tmp);
     else
       Result = BinaryOperator::CreateSub(zero, tmp);
   } break;
   case HLUnaryOpcode::LNot: {
-    Constant *zero = GetVecConst(ResultTy, 0);
+    Constant *zero = Constant::getNullValue(ResultTy);
     if (isFloat)
-      Result = CmpInst::Create(Instruction::FCmp, CmpInst::FCMP_UNE, tmp, zero);
+      Result = CmpInst::Create(Instruction::FCmp, CmpInst::FCMP_UEQ, tmp, zero);
     else
-      Result = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, tmp, zero);
+      Result = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, tmp, zero);
+  } break;
+  case HLUnaryOpcode::Not: {
+    Constant *allOneBits = Constant::getAllOnesValue(ResultTy);
+    Result = BinaryOperator::CreateXor(tmp, allOneBits);
   } break;
-  case HLUnaryOpcode::Not:
-    Result = BinaryOperator::CreateXor(tmp, tmp);
-    break;
   case HLUnaryOpcode::PostInc:
   case HLUnaryOpcode::PreInc:
     if (isFloat)
-      Result = BinaryOperator::CreateFAdd(tmp, one);
+      Result = BinaryOperator::CreateFAdd(tmp, oneVec);
     else
-      Result = BinaryOperator::CreateAdd(tmp, one);
+      Result = BinaryOperator::CreateAdd(tmp, oneVec);
     break;
   case HLUnaryOpcode::PostDec:
   case HLUnaryOpcode::PreDec:
     if (isFloat)
-      Result = BinaryOperator::CreateFSub(tmp, one);
+      Result = BinaryOperator::CreateFSub(tmp, oneVec);
     else
-      Result = BinaryOperator::CreateSub(tmp, one);
+      Result = BinaryOperator::CreateSub(tmp, oneVec);
     break;
   default:
     DXASSERT(0, "not implement");
@@ -728,12 +765,14 @@ Instruction *HLMatrixLowerPass::TrivialMatBinOpToVec(CallInst *CI) {
     break;
   case HLBinaryOpcode::Shl: {
     Value *op1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
-    DXASSERT_LOCALVAR(op1, IsMatrixType(op1->getType()), "must be matrix type here");
+    DXASSERT_LOCALVAR(op1, dxilutil::IsHLSLMatrixType(op1->getType()),
+                      "must be matrix type here");
     Result = BinaryOperator::CreateShl(tmp, tmp);
   } break;
   case HLBinaryOpcode::Shr: {
     Value *op1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
-    DXASSERT_LOCALVAR(op1, IsMatrixType(op1->getType()), "must be matrix type here");
+    DXASSERT_LOCALVAR(op1, dxilutil::IsHLSLMatrixType(op1->getType()),
+                      "must be matrix type here");
     Result = BinaryOperator::CreateAShr(tmp, tmp);
   } break;
   case HLBinaryOpcode::LT:
@@ -780,7 +819,8 @@ Instruction *HLMatrixLowerPass::TrivialMatBinOpToVec(CallInst *CI) {
     break;
   case HLBinaryOpcode::UShr: {
     Value *op1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
-    DXASSERT_LOCALVAR(op1, IsMatrixType(op1->getType()), "must be matrix type here");
+    DXASSERT_LOCALVAR(op1, dxilutil::IsHLSLMatrixType(op1->getType()),
+                      "must be matrix type here");
     Result = BinaryOperator::CreateLShr(tmp, tmp);
   } break;
   case HLBinaryOpcode::ULT:
@@ -797,33 +837,25 @@ Instruction *HLMatrixLowerPass::TrivialMatBinOpToVec(CallInst *CI) {
     break;
   case HLBinaryOpcode::LAnd:
   case HLBinaryOpcode::LOr: {
-    Constant *zero;
-    if (isFloat)
-      zero = llvm::ConstantFP::get(ResultTy->getVectorElementType(), 0);
-    else
-      zero = llvm::ConstantInt::get(ResultTy->getVectorElementType(), 0);
-
-    unsigned size = ResultTy->getVectorNumElements();
-    std::vector<Constant *> zeros(size, zero);
-    Value *vecZero = llvm::ConstantVector::get(zeros);
+    Value *vecZero = Constant::getNullValue(ResultTy);
     Instruction *cmpL;
     if (isFloat)
-      cmpL =
-          CmpInst::Create(Instruction::FCmp, CmpInst::FCMP_OEQ, tmp, vecZero);
+      cmpL = CmpInst::Create(Instruction::FCmp, CmpInst::FCMP_ONE, tmp, vecZero);
     else
-      cmpL = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, tmp, vecZero);
+      cmpL = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, tmp, vecZero);
     Builder.Insert(cmpL);
 
     Instruction *cmpR;
     if (isFloat)
       cmpR =
-          CmpInst::Create(Instruction::FCmp, CmpInst::FCMP_OEQ, tmp, vecZero);
+          CmpInst::Create(Instruction::FCmp, CmpInst::FCMP_ONE, tmp, vecZero);
     else
-      cmpR = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, tmp, vecZero);
+      cmpR = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, tmp, vecZero);
     Builder.Insert(cmpR);
+
     // How to map l, r back? Need check opcode
     if (opcode == HLBinaryOpcode::LOr)
-      Result = BinaryOperator::CreateAnd(cmpL, cmpR);
+      Result = BinaryOperator::CreateOr(cmpL, cmpR);
     else
       Result = BinaryOperator::CreateAnd(cmpL, cmpR);
     break;
@@ -905,11 +937,11 @@ void HLMatrixLowerPass::lowerToVec(Instruction *matInst) {
     
     IRBuilder<> AllocaBuilder(AI);
     if (Ty->isArrayTy()) {
-      Type *vecTy = HLMatrixLower::LowerMatrixArrayPointer(AI->getType());
+      Type *vecTy = HLMatrixLower::LowerMatrixArrayPointer(AI->getType(), /*forMem*/ true);
       vecTy = vecTy->getPointerElementType();
       vecVal = AllocaBuilder.CreateAlloca(vecTy, nullptr, AI->getName());
     } else {
-      Type *vecTy = HLMatrixLower::LowerMatrixType(matTy);
+      Type *vecTy = HLMatrixLower::LowerMatrixType(matTy, /*forMem*/ true);
       vecVal = AllocaBuilder.CreateAlloca(vecTy, nullptr, AI->getName());
     }
     // Update debug info.
@@ -951,23 +983,23 @@ void HLMatrixLowerPass::TrivialMatUnOpReplace(Value *matVal,
   HLUnaryOpcode opcode = static_cast<HLUnaryOpcode>(GetHLOpcode(matUseInst));
   Instruction *vecUseInst = cast<Instruction>(matToVecMap[matUseInst]);
   switch (opcode) {
-  case HLUnaryOpcode::Not:
-    // Not is xor now
-    vecUseInst->setOperand(0, vecVal);
-    vecUseInst->setOperand(1, vecVal);
-    break;
-  case HLUnaryOpcode::LNot:
+  case HLUnaryOpcode::Plus: // add(x, 0)
+    // Ideally we'd eliminate the instruction for +mat entirely,
+    // but matToVecMap needs to point to some instruction.
+  case HLUnaryOpcode::Not: // xor(x, -1)
+  case HLUnaryOpcode::LNot: // cmpeq(x, 0)
   case HLUnaryOpcode::PostInc:
   case HLUnaryOpcode::PreInc:
   case HLUnaryOpcode::PostDec:
   case HLUnaryOpcode::PreDec:
     vecUseInst->setOperand(0, vecVal);
     break;
+  case HLUnaryOpcode::Minus: // sub(0, x)
+    vecUseInst->setOperand(1, vecVal);
+    break;
   case HLUnaryOpcode::Invalid:
-  case HLUnaryOpcode::Plus:
-  case HLUnaryOpcode::Minus:
   case HLUnaryOpcode::NumOfUO:
-    // No VecInst replacements for these.
+    DXASSERT(false, "Unexpected HL unary opcode.");
     break;
   }
 }
@@ -1186,8 +1218,8 @@ void HLMatrixLowerPass::TranslateMul(Value *matVal, Value *vecVal,
   Value *LVal = mulInst->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx);
   Value *RVal = mulInst->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
 
-  bool LMat = IsMatrixType(LVal->getType());
-  bool RMat = IsMatrixType(RVal->getType());
+  bool LMat = dxilutil::IsHLSLMatrixType(LVal->getType());
+  bool RMat = dxilutil::IsHLSLMatrixType(RVal->getType());
   if (LMat && RMat) {
     TranslateMatMatMul(matVal, vecVal, mulInst, isSigned);
   } else if (LMat) {
@@ -1458,8 +1490,8 @@ void HLMatrixLowerPass::TranslateMatCast(Value *matVal,
                           opcode == HLCastOpcode::RowMatrixToColMatrix,
                           /*bTranspose*/false);
   } else {
-    bool ToMat = IsMatrixType(castInst->getType());
-    bool FromMat = IsMatrixType(matVal->getType());
+    bool ToMat = dxilutil::IsHLSLMatrixType(castInst->getType());
+    bool FromMat = dxilutil::IsHLSLMatrixType(matVal->getType());
     if (ToMat && FromMat) {
       TranslateMatMatCast(matVal, vecVal, castInst);
     } else if (FromMat)
@@ -1903,7 +1935,7 @@ static void IterateInitList(MutableArrayRef<Value *> elts, unsigned &idx,
       }
     }
     Type *valEltTy = val->getType()->getPointerElementType();
-    if (valEltTy->isVectorTy() || HLMatrixLower::IsMatrixType(valEltTy) ||
+    if (valEltTy->isVectorTy() || dxilutil::IsHLSLMatrixType(valEltTy) ||
         valEltTy->isSingleValueType()) {
       Value *ldVal = Builder.CreateLoad(val);
       IterateInitList(elts, idx, ldVal, matToVecMap, Builder);
@@ -1926,7 +1958,7 @@ static void IterateInitList(MutableArrayRef<Value *> elts, unsigned &idx,
         }
       }
     }
-  } else if (HLMatrixLower::IsMatrixType(valTy)) {
+  } else if (dxilutil::IsHLSLMatrixType(valTy)) {
     unsigned col, row;
     HLMatrixLower::GetMatrixInfo(valTy, col, row);
     unsigned matSize = col * row;
@@ -2059,7 +2091,8 @@ void HLMatrixLowerPass::TranslateMatArrayGEP(Value *matInst,
           // Skip the vector version.
           if (useCall->getType()->isVectorTy())
             continue;
-          Value *newLd = Builder.CreateLoad(newGEP);
+          Type *matTy = useCall->getType();
+          Value *newLd = CreateVecMatrixLoad(newGEP, matTy, Builder);
           DXASSERT(matToVecMap.count(useCall), "must have vec version");
           Value *oldLd = matToVecMap[useCall];
           // Delete the oldLd.
@@ -2082,7 +2115,7 @@ void HLMatrixLowerPass::TranslateMatArrayGEP(Value *matInst,
 
           DXASSERT(matToVecMap.count(matInst), "must have vec version");
           Value *vecVal = matToVecMap[matInst];
-          Builder.CreateStore(vecVal, vecPtr);
+          CreateVecMatrixStore(vecVal, vecPtr, matVal->getType(), Builder);
         } break;
         }
       } break;
@@ -2137,20 +2170,46 @@ void HLMatrixLowerPass::replaceMatWithVec(Value *matVal,
           MatIntrinsicReplace(matCI, vecVal, useCall);
         } else {
           IntrinsicOp opcode = static_cast<IntrinsicOp>(GetHLOpcode(useCall));
-          DXASSERT_LOCALVAR(opcode, opcode == IntrinsicOp::IOP_frexp,
-                   "otherwise, unexpected opcode with matrix out parameter");
-          // NOTE: because out param use copy out semantic, so the operand of
-          // out must be temp alloca.
-          DXASSERT(isa<AllocaInst>(matVal), "else invalid mat ptr for frexp");
-          auto it = matToVecMap.find(useCall);
-          DXASSERT(it != matToVecMap.end(),
-                   "else fail to create vec version of useCall");
-          CallInst *vecUseInst = cast<CallInst>(it->second);
-
-          for (unsigned i = 0; i < vecUseInst->getNumArgOperands(); i++) {
-            if (useCall->getArgOperand(i) == matVal) {
-              vecUseInst->setArgOperand(i, vecVal);
+          if (opcode == IntrinsicOp::MOP_Append) {
+            // Replace matrix with vector representation and update intrinsic signature
+            // We don't care about matrix orientation here, since that will need to be
+            // taken into account anyways when generating the store output calls.
+            SmallVector<Value *, 4> flatArgs;
+            SmallVector<Type *, 4> flatParamTys;
+            for (Value *arg : useCall->arg_operands()) {
+              Value *flatArg = arg == matVal ? vecVal : arg; // swap in the vector form of the matrix
+              flatArgs.emplace_back(flatArg);
+              flatParamTys.emplace_back(flatArg->getType());
             }
+
+            // Don't need flat return type for Append.
+            FunctionType *flatFuncTy =
+              FunctionType::get(useInst->getType(), flatParamTys, false);
+            Function *flatF = GetOrCreateHLFunction(*m_pModule, flatFuncTy, group, static_cast<unsigned int>(opcode));
+            
+            // Append returns void, so the old call should have no users
+            DXASSERT(useInst->getType()->isVoidTy(), "Unexpected MOP_Append intrinsic return type");
+            DXASSERT(useInst->use_empty(), "Unexpected users of MOP_Append intrinsic return value");
+            IRBuilder<> Builder(useCall);
+            Builder.CreateCall(flatF, flatArgs);
+            AddToDeadInsts(useCall);
+          }
+          else if (opcode == IntrinsicOp::IOP_frexp) {
+            // NOTE: because out params use copy-out semantics, the operand of
+            // an out param must be a temp alloca.
+            DXASSERT(isa<AllocaInst>(matVal), "else invalid mat ptr for frexp");
+            auto it = matToVecMap.find(useCall);
+            DXASSERT(it != matToVecMap.end(),
+              "else fail to create vec version of useCall");
+            CallInst *vecUseInst = cast<CallInst>(it->second);
+
+            for (unsigned i = 0; i < vecUseInst->getNumArgOperands(); i++) {
+              if (useCall->getArgOperand(i) == matVal) {
+                vecUseInst->setArgOperand(i, vecVal);
+              }
+            }
+          } else {
+            DXASSERT(false, "Unexpected matrix user intrinsic.");
           }
         }
       } break;
@@ -2174,9 +2233,17 @@ void HLMatrixLowerPass::replaceMatWithVec(Value *matVal,
           // Load Already translated in lowerToVec.
           // Store val operand will be set by the val use.
           // Do nothing here.
-        } else if (StoreInst *stInst = dyn_cast<StoreInst>(vecUser))
+        } else if (StoreInst *stInst = dyn_cast<StoreInst>(vecUser)) {
+          DXASSERT(vecVal->getType() == stInst->getValueOperand()->getType(),
+            "Mismatched vector matrix store value types.");
           stInst->setOperand(0, vecVal);
-        else
+        } else if (ZExtInst *zextInst = dyn_cast<ZExtInst>(vecUser)) {
+          // This happens when storing bool matrices,
+          // which must first undergo conversion from i1's to i32's.
+          DXASSERT(vecVal->getType() == zextInst->getOperand(0)->getType(),
+            "Mismatched vector matrix store value types.");
+          zextInst->setOperand(0, vecVal);
+        } else
           TrivialMatReplace(matVal, vecVal, useCall);
 
       } break;
@@ -2411,7 +2478,7 @@ void HLMatrixLowerPass::runOnGlobal(GlobalVariable *GV) {
   }
 
   Type *Ty = GV->getType()->getPointerElementType();
-  if (!HLMatrixLower::IsMatrixType(Ty))
+  if (!dxilutil::IsHLSLMatrixType(Ty))
     return;
 
   bool onlyLdSt = OnlyUsedByMatrixLdSt(GV);
@@ -2507,11 +2574,11 @@ void HLMatrixLowerPass::runOnFunction(Function &F) {
     BasicBlock *BB = BBI;
     for (auto II = BB->begin(); II != BB->end(); ) {
       Instruction &I = *(II++);
-      if (IsMatrixType(I.getType())) {
+      if (dxilutil::IsHLSLMatrixType(I.getType())) {
         lowerToVec(&I);
       } else if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
         Type *Ty = AI->getAllocatedType();
-        if (HLMatrixLower::IsMatrixType(Ty)) {
+        if (dxilutil::IsHLSLMatrixType(Ty)) {
           lowerToVec(&I);
         } else if (HLMatrixLower::IsMatrixArrayPointer(AI->getType())) {
           lowerToVec(&I);
@@ -2587,7 +2654,7 @@ Type *TryLowerMatTy(Type *Ty) {
   if (HLMatrixLower::IsMatrixArrayPointer(Ty)) {
     VecTy = HLMatrixLower::LowerMatrixArrayPointerToOneDimArray(Ty);
   } else if (isa<PointerType>(Ty) &&
-             HLMatrixLower::IsMatrixType(Ty->getPointerElementType())) {
+             dxilutil::IsHLSLMatrixType(Ty->getPointerElementType())) {
     VecTy = HLMatrixLower::LowerMatrixTypeToOneDimArray(
         Ty->getPointerElementType());
     VecTy = PointerType::get(VecTy, Ty->getPointerAddressSpace());
@@ -2649,7 +2716,7 @@ bool MatrixBitcastLowerPass::hasCallUser(Instruction *M) {
     User *U = *(it++);
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
       Type *EltTy = GEP->getType()->getPointerElementType();
-      if (HLMatrixLower::IsMatrixType(EltTy)) {
+      if (dxilutil::IsHLSLMatrixType(EltTy)) {
         if (hasCallUser(GEP))
           return true;
       } else {
@@ -2704,7 +2771,7 @@ void MatrixBitcastLowerPass::lowerMatrix(Instruction *M, Value *A) {
     User *U = *(it++);
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
       Type *EltTy = GEP->getType()->getPointerElementType();
-      if (HLMatrixLower::IsMatrixType(EltTy)) {
+      if (dxilutil::IsHLSLMatrixType(EltTy)) {
         // Change gep matrixArray, 0, index
         // into
         //   gep oneDimArray, 0, index * matSize
diff --git a/lib/HLSL/HLModule.cpp b/lib/HLSL/HLModule.cpp
index 4fc195762..fa4715685 100644
--- a/lib/HLSL/HLModule.cpp
+++ b/lib/HLSL/HLModule.cpp
@@ -957,6 +957,13 @@ void HLModule::MergeGepUse(Value *V) {
   }
 }
 
+template
+CallInst *HLModule::EmitHLOperationCall(IRBuilder<> &Builder,
+                                           HLOpcodeGroup group, unsigned opcode,
+                                           Type *RetType,
+                                           ArrayRef<Value *> paramList,
+                                           llvm::Module &M);
+
 template<typename BuilderTy>
 CallInst *HLModule::EmitHLOperationCall(BuilderTy &Builder,
                                            HLOpcodeGroup group, unsigned opcode,
@@ -984,54 +991,45 @@ CallInst *HLModule::EmitHLOperationCall(BuilderTy &Builder,
   return Builder.CreateCall(opFunc, opcodeParamList);
 }
 
-template
-CallInst *HLModule::EmitHLOperationCall(IRBuilder<> &Builder,
-                                           HLOpcodeGroup group, unsigned opcode,
-                                           Type *RetType,
-                                           ArrayRef<Value *> paramList,
-                                           llvm::Module &M);
-
-unsigned HLModule::FindCastOp(bool fromUnsigned, bool toUnsigned,
-                              llvm::Type *SrcTy, llvm::Type *DstTy) {
-  Instruction::CastOps castOp = llvm::Instruction::CastOps::BitCast;
-
-  if (SrcTy->isAggregateType() || DstTy->isAggregateType())
-    return llvm::Instruction::CastOps::BitCast;
-
+// Selects the cast opcode (an Instruction::CastOps value, returned as
+// unsigned) for a numeric conversion from SrcTy to DstTy. Any type that is
+// not an int or int vector is treated as floating point. The caller must
+// handle identity conversions (SrcTy == DstTy) and conversions to bool (i1)
+// itself; both are asserted against here. An i1 source counts as unsigned.
+unsigned HLModule::GetNumericCastOp(
+  llvm::Type *SrcTy, bool SrcIsUnsigned, llvm::Type *DstTy, bool DstIsUnsigned) {
+  DXASSERT(SrcTy != DstTy, "No-op conversions are not casts and should have been handled by the caller.");
   uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
   uint32_t DstBitSize = DstTy->getScalarSizeInBits();
-  if (SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy()) {
-    if (SrcBitSize > DstBitSize)
-      return Instruction::Trunc;
-    if (toUnsigned)
-      return Instruction::ZExt;
-    else
-      return Instruction::SExt;
-  }
+  bool SrcIsInt = SrcTy->isIntOrIntVectorTy();
+  bool DstIsInt = DstTy->isIntOrIntVectorTy();
 
-  if (SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy()) {
-    if (SrcBitSize > DstBitSize)
-      return Instruction::FPTrunc;
-    else
-      return Instruction::FPExt;
-  }
+  DXASSERT(DstBitSize != 1, "Conversions to bool are not a cast and should have been handled by the caller.");
 
-  if (SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy()) {
-    if (fromUnsigned)
-      return Instruction::UIToFP;
-    else
-      return Instruction::SIToFP;
-  }
+  // Conversions from bools are like unsigned integer widening
+  if (SrcBitSize == 1) SrcIsUnsigned = true;
 
-  if (SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy()) {
-    if (toUnsigned)
-      return Instruction::FPToUI;
-    else
-      return Instruction::FPToSI;
+  if (SrcIsInt) {
+    if (DstIsInt) { // int to int
+      if (SrcBitSize > DstBitSize) return Instruction::Trunc;
+      // unsigned to unsigned: zext
+      // unsigned to signed: zext (fully representable)
+      // signed to signed: sext
+      // signed to unsigned: sext (like C++)
+      return SrcIsUnsigned ? Instruction::ZExt : Instruction::SExt;
+    }
+    else { // int to float
+      return SrcIsUnsigned ? Instruction::UIToFP : Instruction::SIToFP;
+    }
+  }
+  else {
+    if (DstIsInt) { // float to int
+      return DstIsUnsigned ? Instruction::FPToUI : Instruction::FPToSI;
+    }
+    else { // float to float
+      return SrcBitSize > DstBitSize ? Instruction::FPTrunc : Instruction::FPExt;
+    }
   }
-
-  DXASSERT_NOMSG(0);
-  return castOp;
 }
 
 bool HLModule::HasPreciseAttributeWithMetadata(Instruction *I) {
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 2040e587c..f1ee8f9f9 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -1226,7 +1226,7 @@ Value *TranslateWaveReadLaneFirst(CallInst *CI, IntrinsicOp IOP,
                               CI->getOperand(1)->getType(), CI, hlslOP);
 }
 
-Value *TransalteAbs(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+Value *TranslateAbs(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                     HLOperationLowerHelper &helper,  HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
   hlsl::OP *hlslOP = &helper.hlslOP;
   Type *pOverloadTy = CI->getType()->getScalarType();
@@ -1243,6 +1243,11 @@ Value *TransalteAbs(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   }
 }
 
+Value *TranslateUAbs(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+  HLOperationLowerHelper &helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
+  return CI->getOperand(HLOperandIndex::kUnaryOpSrc0Idx); // No-op
+}
+
 Value *GenerateCmpNEZero(Value *val, IRBuilder<> Builder) {
   Type *Ty = val->getType();
   Type *EltTy = Ty->getScalarType();
@@ -2195,30 +2200,26 @@ Value *TranslateSign(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                      HLOperationLowerHelper &helper,  HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
   Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx);
   Type *Ty = val->getType();
-  Type *EltTy = Ty->getScalarType();
-  IRBuilder<> Builder(CI);
+  bool IsInt = Ty->getScalarType()->isIntegerTy();
 
-  if (EltTy->isIntegerTy()) {
-    Constant *zero = ConstantInt::get(Ty->getScalarType(), 0);
-    if (Ty != EltTy) {
-      zero = ConstantVector::getSplat(Ty->getVectorNumElements(), zero);
-    }
-    Value *zeroLtVal = Builder.CreateICmpSLT(zero, val);
-    zeroLtVal = Builder.CreateZExt(zeroLtVal, CI->getType());
-    Value *valLtZero = Builder.CreateICmpSLT(val, zero);
-    valLtZero = Builder.CreateZExt(valLtZero, CI->getType());
-    return Builder.CreateSub(zeroLtVal, valLtZero);
-  } else {
-    Constant *zero = ConstantFP::get(Ty->getScalarType(), 0.0);
-    if (Ty != EltTy) {
-      zero = ConstantVector::getSplat(Ty->getVectorNumElements(), zero);
-    }
-    Value *zeroLtVal = Builder.CreateFCmpOLT(zero, val);
-    zeroLtVal = Builder.CreateZExt(zeroLtVal, CI->getType());
-    Value *valLtZero = Builder.CreateFCmpOLT(val, zero);
-    valLtZero = Builder.CreateZExt(valLtZero, CI->getType());
-    return Builder.CreateSub(zeroLtVal, valLtZero);
-  }
+  IRBuilder<> Builder(CI);
+  Constant *zero = Constant::getNullValue(Ty);
+  Value *zeroLtVal = IsInt ? Builder.CreateICmpSLT(zero, val) : Builder.CreateFCmpOLT(zero, val);
+  Value *valLtZero = IsInt ? Builder.CreateICmpSLT(val, zero) : Builder.CreateFCmpOLT(val, zero);
+  zeroLtVal = Builder.CreateZExt(zeroLtVal, CI->getType());
+  valLtZero = Builder.CreateZExt(valLtZero, CI->getType());
+  return Builder.CreateSub(zeroLtVal, valLtZero);
+}
+
+Value *TranslateUSign(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+  HLOperationLowerHelper &helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
+  Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx);
+  Type *Ty = val->getType();
+
+  IRBuilder<> Builder(CI);
+  Constant *zero = Constant::getNullValue(Ty);
+  Value *nonZero = Builder.CreateICmpNE(val, zero);
+  return Builder.CreateZExt(nonZero, CI->getType());
 }
 
 Value *TranslateStep(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
@@ -3299,6 +3300,7 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
 
   Type *Ty = helper.retVal->getType();
   if (Ty->isPointerTy()) {
+    DXASSERT(!DxilResource::IsAnyTexture(RK), "Textures should not be treated as structured buffers.");
     TranslateStructBufSubscript(cast<CallInst>(helper.retVal), helper.handle,
                                 helper.status, OP, DL);
     return;
@@ -4745,7 +4747,7 @@ IntrinsicLower gLowerTable[static_cast<unsigned>(IntrinsicOp::Num_Intrinsics)] =
     {IntrinsicOp::IOP_WorldToObject3x4, TranslateNoArgMatrix3x4Operation, DXIL::OpCode::WorldToObject},
     {IntrinsicOp::IOP_WorldToObject4x3, TranslateNoArgTransposedMatrix3x4Operation, DXIL::OpCode::WorldToObject},
     {IntrinsicOp::IOP_abort, EmptyLower, DXIL::OpCode::NumOpCodes},
-    {IntrinsicOp::IOP_abs, TransalteAbs, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::IOP_abs, TranslateAbs, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_acos, TrivialUnaryOperation, DXIL::OpCode::Acos},
     {IntrinsicOp::IOP_all, TranslateAll, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_any, TranslateAny, DXIL::OpCode::NumOpCodes},
@@ -4910,12 +4912,14 @@ IntrinsicLower gLowerTable[static_cast<unsigned>(IntrinsicOp::Num_Intrinsics)] =
     { IntrinsicOp::IOP_WaveActiveUSum, TranslateWaveA2A, DXIL::OpCode::WaveActiveOp },
     { IntrinsicOp::IOP_WavePrefixUProduct, TranslateWaveA2A, DXIL::OpCode::WavePrefixOp },
     { IntrinsicOp::IOP_WavePrefixUSum, TranslateWaveA2A, DXIL::OpCode::WavePrefixOp },
+    { IntrinsicOp::IOP_uabs, TranslateUAbs, DXIL::OpCode::NumOpCodes },
     { IntrinsicOp::IOP_uclamp, TranslateClamp, DXIL::OpCode::NumOpCodes },
     { IntrinsicOp::IOP_ufirstbithigh, TranslateFirstbitHi, DXIL::OpCode::FirstbitHi },
     { IntrinsicOp::IOP_umad, TranslateFUITrinary, DXIL::OpCode::UMad},
     { IntrinsicOp::IOP_umax, TranslateFUIBinary, DXIL::OpCode::UMax},
-    { IntrinsicOp::IOP_umin,   TranslateFUIBinary, DXIL::OpCode::UMin },
-    { IntrinsicOp::IOP_umul,   TranslateFUIBinary, DXIL::OpCode::UMul },
+    { IntrinsicOp::IOP_umin, TranslateFUIBinary, DXIL::OpCode::UMin },
+    { IntrinsicOp::IOP_umul, TranslateFUIBinary, DXIL::OpCode::UMul },
+    { IntrinsicOp::IOP_usign, TranslateUSign, DXIL::OpCode::UMax },
     { IntrinsicOp::MOP_InterlockedUMax, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes },
     { IntrinsicOp::MOP_InterlockedUMin, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes },
 };
@@ -4969,25 +4973,21 @@ unsigned GetEltTypeByteSizeForConstBuf(Type *EltType, const DataLayout &DL) {
 Value *GenerateCBLoad(Value *handle, Value *offset, Type *EltTy, OP *hlslOP,
                       IRBuilder<> &Builder) {
   Constant *OpArg = hlslOP->GetU32Const((unsigned)OP::OpCode::CBufferLoad);
+
+  DXASSERT(!EltTy->isIntegerTy(1), "Bools should not be loaded as their register representation.");
+
   // Align to 8 bytes for now.
   Constant *align = hlslOP->GetU32Const(8);
-  Type *i1Ty = Type::getInt1Ty(EltTy->getContext());
-  if (EltTy != i1Ty) {
-    Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoad, EltTy);
-    return Builder.CreateCall(CBLoad, {OpArg, handle, offset, align});
-  } else {
-    Type *i32Ty = Type::getInt32Ty(EltTy->getContext());
-    Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoad, i32Ty);
-    Value *Result = Builder.CreateCall(CBLoad, {OpArg, handle, offset, align});
-    return Builder.CreateICmpEQ(Result, hlslOP->GetU32Const(0));
-  }
+  Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoad, EltTy);
+  return Builder.CreateCall(CBLoad, {OpArg, handle, offset, align});
 }
 
 Value *TranslateConstBufMatLd(Type *matType, Value *handle, Value *offset,
                               bool colMajor, OP *OP, const DataLayout &DL,
                               IRBuilder<> &Builder) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
   unsigned matSize = col * row;
   std::vector<Value *> elts(matSize);
   Value *EltByteSize = ConstantInt::get(
@@ -5000,7 +5000,9 @@ Value *TranslateConstBufMatLd(Type *matType, Value *handle, Value *offset,
     baseOffset = Builder.CreateAdd(baseOffset, EltByteSize);
   }
 
-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value* Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }
 
 void TranslateCBGep(GetElementPtrInst *GEP, Value *handle, Value *baseOffset,
@@ -5309,22 +5311,18 @@ Value *GenerateCBLoadLegacy(Value *handle, Value *legacyIdx,
                             IRBuilder<> &Builder) {
   Constant *OpArg = hlslOP->GetU32Const((unsigned)OP::OpCode::CBufferLoadLegacy);
 
-  Type *i1Ty = Type::getInt1Ty(EltTy->getContext());
+  DXASSERT(!EltTy->isIntegerTy(1), "Bools should not be loaded as their register representation.");
+
   Type *doubleTy = Type::getDoubleTy(EltTy->getContext());
   Type *halfTy = Type::getHalfTy(EltTy->getContext());
   Type *i64Ty = Type::getInt64Ty(EltTy->getContext());
   Type *i16Ty = Type::getInt16Ty(EltTy->getContext());
-  bool isBool = EltTy == i1Ty;
+
   bool is64 = (EltTy == doubleTy) | (EltTy == i64Ty);
   bool is16 = (EltTy == halfTy || EltTy == i16Ty) && !hlslOP->UseMinPrecision();
-  bool isNormal = !isBool && !is64;
   DXASSERT_LOCALVAR(is16, (is16 && channelOffset < 8) || channelOffset < 4,
            "legacy cbuffer don't across 16 bytes register.");
-  if (isNormal) {
-    Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoadLegacy, EltTy);
-    Value *loadLegacy = Builder.CreateCall(CBLoad, {OpArg, handle, legacyIdx});
-    return Builder.CreateExtractValue(loadLegacy, channelOffset);
-  } else if (is64) {
+  if (is64) {
     Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoadLegacy, EltTy);
     Value *loadLegacy = Builder.CreateCall(CBLoad, {OpArg, handle, legacyIdx});
     DXASSERT((channelOffset&1)==0,"channel offset must be even for double");
@@ -5332,12 +5330,9 @@ Value *GenerateCBLoadLegacy(Value *handle, Value *legacyIdx,
     Value *Result = Builder.CreateExtractValue(loadLegacy, eltIdx);
     return Result;
   } else {
-    DXASSERT(isBool, "bool should be i1");
-    Type *i32Ty = Type::getInt32Ty(EltTy->getContext());
-    Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoadLegacy, i32Ty);
-    Value *loadLegacy = Builder.CreateCall(CBLoad, {OpArg, handle, legacyIdx});
-    Value *Result = Builder.CreateExtractValue(loadLegacy, channelOffset);
-    return Builder.CreateICmpEQ(Result, hlslOP->GetU32Const(0));
+    Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoadLegacy, EltTy);
+    Value *loadLegacy = Builder.CreateCall(CBLoad, { OpArg, handle, legacyIdx });
+    return Builder.CreateExtractValue(loadLegacy, channelOffset);
   }
 }
 
@@ -5347,29 +5342,19 @@ Value *GenerateCBLoadLegacy(Value *handle, Value *legacyIdx,
                             IRBuilder<> &Builder) {
   Constant *OpArg = hlslOP->GetU32Const((unsigned)OP::OpCode::CBufferLoadLegacy);
 
-  Type *i1Ty = Type::getInt1Ty(EltTy->getContext());
+  DXASSERT(!EltTy->isIntegerTy(1), "Bools should not be loaded as their register representation.");
+
   Type *doubleTy = Type::getDoubleTy(EltTy->getContext());
   Type *i64Ty = Type::getInt64Ty(EltTy->getContext());
   Type *halfTy = Type::getHalfTy(EltTy->getContext());
   Type *shortTy = Type::getInt16Ty(EltTy->getContext());
 
-  bool isBool = EltTy == i1Ty;
   bool is64 = (EltTy == doubleTy) | (EltTy == i64Ty);
   bool is16 = (EltTy == shortTy || EltTy == halfTy) && !hlslOP->UseMinPrecision();
-  bool isNormal = !isBool && !is64 && !is16;
   DXASSERT((is16 && channelOffset + vecSize <= 8) ||
                (channelOffset + vecSize) <= 4,
            "legacy cbuffer don't across 16 bytes register.");
-  if (isNormal) {
-    Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoadLegacy, EltTy);
-    Value *loadLegacy = Builder.CreateCall(CBLoad, {OpArg, handle, legacyIdx});
-    Value *Result = UndefValue::get(VectorType::get(EltTy, vecSize));
-    for (unsigned i = 0; i < vecSize; ++i) {
-      Value *NewElt = Builder.CreateExtractValue(loadLegacy, channelOffset+i);
-      Result = Builder.CreateInsertElement(Result, NewElt, i);
-    }
-    return Result;
-  } else if (is16) {
+  if (is16) {
     Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoadLegacy, EltTy);
     Value *loadLegacy = Builder.CreateCall(CBLoad, {OpArg, handle, legacyIdx});
     Value *Result = UndefValue::get(VectorType::get(EltTy, vecSize));
@@ -5401,25 +5386,24 @@ Value *GenerateCBLoadLegacy(Value *handle, Value *legacyIdx,
     }
     return Result;
   } else {
-    DXASSERT(isBool, "bool should be i1");
-    Type *i32Ty = Type::getInt32Ty(EltTy->getContext());
-    Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoadLegacy, i32Ty);
-    Value *loadLegacy = Builder.CreateCall(CBLoad, {OpArg, handle, legacyIdx});
-    Value *Result = UndefValue::get(VectorType::get(i32Ty, vecSize));
+    Function *CBLoad = hlslOP->GetOpFunc(OP::OpCode::CBufferLoadLegacy, EltTy);
+    Value *loadLegacy = Builder.CreateCall(CBLoad, { OpArg, handle, legacyIdx });
+    Value *Result = UndefValue::get(VectorType::get(EltTy, vecSize));
     for (unsigned i = 0; i < vecSize; ++i) {
-      Value *NewElt = Builder.CreateExtractValue(loadLegacy, channelOffset+i);
+      Value *NewElt = Builder.CreateExtractValue(loadLegacy, channelOffset + i);
       Result = Builder.CreateInsertElement(Result, NewElt, i);
     }
-    return Builder.CreateICmpEQ(Result, ConstantAggregateZero::get(Result->getType()));
+    return Result;
   }
 }
 
 Value *TranslateConstBufMatLdLegacy(Type *matType, Value *handle,
                                     Value *legacyIdx, bool colMajor, OP *OP,
-                                    const DataLayout &DL,
+                                    bool memElemRepr, const DataLayout &DL,
                                     IRBuilder<> &Builder) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
 
   unsigned matSize = col * row;
   std::vector<Value *> elts(matSize);
@@ -5453,7 +5437,10 @@ Value *TranslateConstBufMatLdLegacy(Type *matType, Value *handle,
     }
   }
 
-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value *Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  if (!memElemRepr)
+    Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }
 
 void TranslateCBGepLegacy(GetElementPtrInst *GEP, Value *handle,
@@ -5505,8 +5492,9 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
       Type *matType = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
                           ->getType()
                           ->getPointerElementType();
+      // This will replace a call, so we should use the register representation of elements
       Value *newLd = TranslateConstBufMatLdLegacy(
-          matType, handle, legacyIdx, colMajor, hlslOP, DL, Builder);
+          matType, handle, legacyIdx, colMajor, hlslOP, /*memElemRepr*/false, DL, Builder);
       CI->replaceAllUsesWith(newLd);
       CI->eraseFromParent();
     } else if (group == HLOpcodeGroup::HLSubscript) {
@@ -5533,8 +5521,9 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
 
       Value *ldData = UndefValue::get(resultType);
       if (!dynamicIndexing) {
+        // This will replace a load or GEP, so we should use the memory representation of elements
         Value *matLd = TranslateConstBufMatLdLegacy(
-            matType, handle, legacyIdx, colMajor, hlslOP, DL, Builder);
+            matType, handle, legacyIdx, colMajor, hlslOP, /*memElemRepr*/true, DL, Builder);
         // The matLd is keep original layout, just use the idx calc in
         // EmitHLSLMatrixElement and EmitHLSLMatrixSubscript.
         switch (subOp) {
@@ -5923,8 +5912,8 @@ Value *GEPIdxToOffset(GetElementPtrInst *GEP, IRBuilder<> &Builder,
           continue;
         }
       }
-      if (GEPIt->isPointerTy()) {
-        unsigned size = DL.getTypeAllocSize(GEPIt->getPointerElementType());
+      if (GEPIt->isPointerTy() || GEPIt->isArrayTy() || GEPIt->isVectorTy()) {
+        unsigned size = DL.getTypeAllocSize(GEPIt->getSequentialElementType());
         if (immIdx) {
           unsigned tempOffset = size * immIdx;
           offset = Builder.CreateAdd(offset, OP->GetU32Const(tempOffset));
@@ -5933,29 +5922,9 @@ Value *GEPIdxToOffset(GetElementPtrInst *GEP, IRBuilder<> &Builder,
           offset = Builder.CreateAdd(offset, tempOffset);
         }
       } else if (GEPIt->isStructTy()) {
-        unsigned structOffset = 0;
-        for (unsigned i = 0; i < immIdx; i++) {
-          structOffset += DL.getTypeAllocSize(GEPIt->getStructElementType(i));
-        }
+        const StructLayout *Layout = DL.getStructLayout(cast<StructType>(*GEPIt));
+        unsigned structOffset = Layout->getElementOffset(immIdx);
         offset = Builder.CreateAdd(offset, OP->GetU32Const(structOffset));
-      } else if (GEPIt->isArrayTy()) {
-        unsigned size = DL.getTypeAllocSize(GEPIt->getArrayElementType());
-        if (immIdx) {
-          unsigned tempOffset = size * immIdx;
-          offset = Builder.CreateAdd(offset, OP->GetU32Const(tempOffset));
-        } else {
-          Value *tempOffset = Builder.CreateMul(idx, OP->GetU32Const(size));
-          offset = Builder.CreateAdd(offset, tempOffset);
-        }
-      } else if (GEPIt->isVectorTy()) {
-        unsigned size = DL.getTypeAllocSize(GEPIt->getVectorElementType());
-        if (immIdx) {
-          unsigned tempOffset = size * immIdx;
-          offset = Builder.CreateAdd(offset, OP->GetU32Const(tempOffset));
-        } else {
-          Value *tempOffset = Builder.CreateMul(idx, OP->GetU32Const(size));
-          offset = Builder.CreateAdd(offset, tempOffset);
-        }
       } else {
         gep_type_iterator temp = GEPIt;
         temp++;
@@ -6019,9 +5988,10 @@ void GenerateStructBufSt(Value *handle, Value *bufIdx, Value *offset,
 Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
                                Value *handle, hlsl::OP *OP, Value *status,
                                Value *bufIdx, Value *baseOffset,
-                               bool colMajor, const DataLayout &DL) {
+                               const DataLayout &DL) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
   unsigned  EltSize = DL.getTypeAllocSize(EltTy);
   Constant* alignment = OP->GetI32Const(EltSize);
 
@@ -6053,14 +6023,20 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 
-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value *Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }
 
 void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                              hlsl::OP *OP, Value *bufIdx, Value *baseOffset,
-                             Value *val, bool colMajor, const DataLayout &DL) {
+                             Value *val, const DataLayout &DL) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
+
+  val = HLMatrixLower::VecMatrixRegToMem(val, matType, Builder);
+
   unsigned EltSize = DL.getTypeAllocSize(EltTy);
   Constant *Alignment = OP->GetI32Const(EltSize);
   Value *offset = baseOffset;
@@ -6075,18 +6051,8 @@ void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
     storeSize = matSize + 4 - (matSize & 3);
   }
   std::vector<Value *> elts(storeSize, undefElt);
-
-  if (colMajor) {
-    for (unsigned i = 0; i < matSize; i++)
-      elts[i] = Builder.CreateExtractElement(val, i);
-  } else {
-    for (unsigned r = 0; r < row; r++)
-      for (unsigned c = 0; c < col; c++) {
-        unsigned rowMajorIdx = r * col + c;
-        unsigned colMajorIdx = c * row + r;
-        elts[rowMajorIdx] = Builder.CreateExtractElement(val, colMajorIdx);
-      }
-  }
+  for (unsigned i = 0; i < matSize; i++)
+    elts[i] = Builder.CreateExtractElement(val, i);
 
   for (unsigned i = 0; i < matSize; i += 4) {
     uint8_t mask = 0;
@@ -6111,34 +6077,25 @@ void TranslateStructBufMatLdSt(CallInst *CI, Value *handle, hlsl::OP *OP,
   DXASSERT_LOCALVAR(group, group == HLOpcodeGroup::HLMatLoadStore,
                     "only translate matrix loadStore here.");
   HLMatLoadStoreOpcode matOp = static_cast<HLMatLoadStoreOpcode>(opcode);
+  // Due to the current way the initial codegen generates matrix
+  // orientation casts, the in-register vector matrix has already been
+  // reordered based on the destination's row or column-major packing orientation.
   switch (matOp) {
+  case HLMatLoadStoreOpcode::RowMatLoad:
   case HLMatLoadStoreOpcode::ColMatLoad: {
     Value *ptr = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx);
     Value *NewLd = TranslateStructBufMatLd(
         ptr->getType()->getPointerElementType(), Builder, handle, OP, status,
-        bufIdx, baseOffset, /*colMajor*/ true, DL);
-    CI->replaceAllUsesWith(NewLd);
-  } break;
-  case HLMatLoadStoreOpcode::RowMatLoad: {
-    Value *ptr = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx);
-    Value *NewLd = TranslateStructBufMatLd(
-        ptr->getType()->getPointerElementType(), Builder, handle, OP, status,
-        bufIdx, baseOffset, /*colMajor*/ false, DL);
+        bufIdx, baseOffset, DL);
     CI->replaceAllUsesWith(NewLd);
   } break;
+  case HLMatLoadStoreOpcode::RowMatStore:
   case HLMatLoadStoreOpcode::ColMatStore: {
     Value *ptr = CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx);
     Value *val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
     TranslateStructBufMatSt(ptr->getType()->getPointerElementType(), Builder,
                             handle, OP, bufIdx, baseOffset, val,
-                            /*colMajor*/ true, DL);
-  } break;
-  case HLMatLoadStoreOpcode::RowMatStore: {
-    Value *ptr = CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx);
-    Value *val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-    TranslateStructBufMatSt(ptr->getType()->getPointerElementType(), Builder,
-                            handle, OP, bufIdx, baseOffset, val,
-                            /*colMajor*/ false, DL);
+                            DL);
   } break;
   }
 
diff --git a/lib/HLSL/HLSignatureLower.cpp b/lib/HLSL/HLSignatureLower.cpp
index 648d775db..7eeb0711f 100644
--- a/lib/HLSL/HLSignatureLower.cpp
+++ b/lib/HLSL/HLSignatureLower.cpp
@@ -624,14 +624,14 @@ void replaceDirectInputParameter(Value *param, Function *loadInput,
       newVec = Builder.CreateInsertElement(newVec, input, col);
     }
     param->replaceAllUsesWith(newVec);
-  } else if (!Ty->isArrayTy() && !HLMatrixLower::IsMatrixType(Ty)) {
+  } else if (!Ty->isArrayTy() && !dxilutil::IsHLSLMatrixType(Ty)) {
     DXASSERT(cols == 1, "only support scalar here");
     Value *colIdx = hlslOP->GetU8Const(0);
     args[DXIL::OperandIndex::kLoadInputColOpIdx] = colIdx;
     Value *input =
         GenerateLdInput(loadInput, args, Builder, zero, bCast, EltTy);
     param->replaceAllUsesWith(input);
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
     Value *colIdx = hlslOP->GetU8Const(0);
     (void)colIdx;
     DXASSERT(param->hasOneUse(),
@@ -784,7 +784,7 @@ void collectInputOutputAccessInfo(
               vectorIdx = GEPIt.getOperand();
             }
           }
-          if (HLMatrixLower::IsMatrixType(*GEPIt)) {
+          if (dxilutil::IsHLSLMatrixType(*GEPIt)) {
             unsigned row, col;
             HLMatrixLower::GetMatrixInfo(*GEPIt, col, row);
             Constant *arraySize = ConstantInt::get(idxTy, col);
@@ -912,80 +912,54 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
     DXASSERT_NOMSG(group == HLOpcodeGroup::HLMatLoadStore);
     HLMatLoadStoreOpcode matOp = static_cast<HLMatLoadStoreOpcode>(opcode);
     switch (matOp) {
-    case HLMatLoadStoreOpcode::ColMatLoad: {
-      IRBuilder<> LocalBuilder(CI);
-      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
-                        ->getType()
-                        ->getPointerElementType();
-      unsigned col, row;
-      Type *EltTy = HLMatrixLower::GetMatrixInfo(matTy, col, row);
-      std::vector<Value *> matElts(col * row);
-      for (unsigned c = 0; c < col; c++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(c);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-        for (unsigned r = 0; r < row; r++) {
-          SmallVector<Value *, 4> args = {OpArg, ID, rowIdx, columnConsts[r]};
-          if (vertexID)
-            args.emplace_back(vertexID);
-
-          Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-          unsigned matIdx = c * row + r;
-          matElts[matIdx] = input;
-        }
-      }
-      Value *newVec =
-          HLMatrixLower::BuildVector(EltTy, col * row, matElts, LocalBuilder);
-      CI->replaceAllUsesWith(newVec);
-      CI->eraseFromParent();
-    } break;
+    case HLMatLoadStoreOpcode::ColMatLoad:
     case HLMatLoadStoreOpcode::RowMatLoad: {
       IRBuilder<> LocalBuilder(CI);
       Type *matTy = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
                         ->getType()
                         ->getPointerElementType();
       unsigned col, row;
-      Type *EltTy = HLMatrixLower::GetMatrixInfo(matTy, col, row);
+      HLMatrixLower::GetMatrixInfo(matTy, col, row);
       std::vector<Value *> matElts(col * row);
-      for (unsigned r = 0; r < row; r++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(r);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-        for (unsigned c = 0; c < col; c++) {
-          SmallVector<Value *, 4> args = {OpArg, ID, rowIdx, columnConsts[c]};
-          if (vertexID)
-            args.emplace_back(vertexID);
 
-          Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-          unsigned matIdx = r * col + c;
-          matElts[matIdx] = input;
+      if (matOp == HLMatLoadStoreOpcode::ColMatLoad) {
+        for (unsigned c = 0; c < col; c++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(c);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned r = 0; r < row; r++) {
+            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[r] };
+            if (vertexID)
+              args.emplace_back(vertexID);
+
+            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+            unsigned matIdx = c * row + r;
+            matElts[matIdx] = input;
+          }
+        }
+      } else {
+        for (unsigned r = 0; r < row; r++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(r);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned c = 0; c < col; c++) {
+            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[c] };
+            if (vertexID)
+              args.emplace_back(vertexID);
+
+            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+            unsigned matIdx = r * col + c;
+            matElts[matIdx] = input;
+          }
         }
       }
+
       Value *newVec =
-          HLMatrixLower::BuildVector(EltTy, col * row, matElts, LocalBuilder);
+          HLMatrixLower::BuildVector(matElts[0]->getType(), col * row, matElts, LocalBuilder);
+      newVec = HLMatrixLower::VecMatrixMemToReg(newVec, matTy, LocalBuilder);
+
       CI->replaceAllUsesWith(newVec);
       CI->eraseFromParent();
     } break;
-    case HLMatLoadStoreOpcode::ColMatStore: {
-      IRBuilder<> LocalBuilder(CI);
-      Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx)
-                        ->getType()
-                        ->getPointerElementType();
-      unsigned col, row;
-      HLMatrixLower::GetMatrixInfo(matTy, col, row);
-
-      for (unsigned c = 0; c < col; c++) {
-        Constant *constColIdx = LocalBuilder.getInt32(c);
-        Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
-
-        for (unsigned r = 0; r < row; r++) {
-          unsigned matIdx = HLMatrixLower::GetColMajorIdx(r, c, row);
-          Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-          LocalBuilder.CreateCall(ldStFunc,
-                                  {OpArg, ID, colIdx, columnConsts[r], Elt});
-        }
-      }
-      CI->eraseFromParent();
-    } break;
+    case HLMatLoadStoreOpcode::ColMatStore:
     case HLMatLoadStoreOpcode::RowMatStore: {
       IRBuilder<> LocalBuilder(CI);
       Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
@@ -995,14 +969,30 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
       unsigned col, row;
       HLMatrixLower::GetMatrixInfo(matTy, col, row);
 
-      for (unsigned r = 0; r < row; r++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(r);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+      Val = HLMatrixLower::VecMatrixRegToMem(Val, matTy, LocalBuilder);
+
+      if (matOp == HLMatLoadStoreOpcode::ColMatStore) {
         for (unsigned c = 0; c < col; c++) {
-          unsigned matIdx = HLMatrixLower::GetRowMajorIdx(r, c, col);
-          Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-          LocalBuilder.CreateCall(ldStFunc,
-                                  {OpArg, ID, rowIdx, columnConsts[c], Elt});
+          Constant *constColIdx = LocalBuilder.getInt32(c);
+          Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
+
+          for (unsigned r = 0; r < row; r++) {
+            unsigned matIdx = HLMatrixLower::GetColMajorIdx(r, c, row);
+            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+            LocalBuilder.CreateCall(ldStFunc,
+              { OpArg, ID, colIdx, columnConsts[r], Elt });
+          }
+        }
+      } else {
+        for (unsigned r = 0; r < row; r++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(r);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned c = 0; c < col; c++) {
+            unsigned matIdx = HLMatrixLower::GetRowMajorIdx(r, c, col);
+            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+            LocalBuilder.CreateCall(ldStFunc,
+              { OpArg, ID, rowIdx, columnConsts[c], Elt });
+          }
         }
       }
       CI->eraseFromParent();
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index cb11069cc..0529bb708 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -41,10 +41,11 @@ subdirectories =
  HLSL
  DXIL
  DxilContainer
+ DxilDia
  DxrFallback
  DxilRootSignature
 
-; HLSL Change: remove LibDriver, LineEditor, add HLSL, DxrtFallback, DXIL, DxilContainer, DxilPIXPasses, DxilRootSignature
+; HLSL Change: remove LibDriver, LineEditor, add HLSL, DxrFallback, DXIL, DxilContainer, DxilDia, DxilPIXPasses, DxilRootSignature
 
 [component_0]
 type = Group
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 104cec49e..bdafe6fad 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -248,20 +248,24 @@ static void addHLSLPasses(bool HLSLHighLevel, unsigned OptLevel, hlsl::HLSLExten
     MPM.add(createDxilConvergentMarkPass());
   }
 
-  if (OptLevel > 2) {
-    MPM.add(createLoopRotatePass());
-    MPM.add(createLoopUnrollPass());
-  }
-
-  if (!NoOpt) {
-    // Verify no undef resource path before simplify, since that can remove undef
-    // paths.  For NoOpt, resources are unpromoted here, so this will not work.
-    MPM.add(createFailUndefResourcePass());
-  }
   MPM.add(createSimplifyInstPass());
 
   MPM.add(createCFGSimplificationPass());
 
+  // Passes to handle [unroll]
+  // Needs to happen after SROA since loop count may depend on
+  // struct members.
+  // Needs to happen before resources are lowered and before HL
+  // module is gone.
+  MPM.add(createLoopRotatePass());
+  MPM.add(createDxilLoopUnrollPass(/*MaxIterationAttempt*/ 128));
+
+  // Default unroll pass. This is purely for optimizing loops without
+  // attributes.
+  if (OptLevel > 2) {
+    MPM.add(createLoopUnrollPass());
+  }
+
   MPM.add(createDxilPromoteLocalResources());
   MPM.add(createDxilPromoteStaticResources());
   // Verify no undef resource again after promotion
@@ -374,14 +378,11 @@ void PassManagerBuilder::populateModulePassManager(
 
   // Start of function pass.
   // Break up aggregate allocas, using SSAUpdater.
-  // HLSL Change - don't run SROA. 
-  // HLSL uses special SROA added in addHLSLPasses.
-  if (HLSLHighLevel) { // HLSL Change
   if (UseNewSROA)
     MPM.add(createSROAPass(/*RequiresDomTree*/ false));
   else
     MPM.add(createScalarReplAggregatesPass(-1, false));
-  }
+
   // HLSL Change. MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
   // HLSL Change. MPM.add(createJumpThreadingPass());         // Thread jumps.
   MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
@@ -418,6 +419,8 @@ void PassManagerBuilder::populateModulePassManager(
     if (EnableMLSM)
       MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
     MPM.add(createGVNPass(DisableGVNLoadPRE));  // Remove redundancies
+    if (!HLSLResMayAlias)
+      MPM.add(createDxilSimpleGVNHoistPass()); // HLSL Change - GVN hoist for code size.
   }
   // HLSL Change Begins.
   // HLSL don't allow memcpy and memset.
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 26a7ec998..af18a0396 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -44,6 +44,7 @@ add_llvm_library(LLVMScalarOpts
   Scalar.cpp
   ScalarReplAggregates.cpp
   ScalarReplAggregatesHLSL.cpp  # HLSL Change
+  DxilLoopUnroll.cpp # HLSL Change
   Scalarizer.cpp
   SeparateConstOffsetFromGEP.cpp
   SimplifyCFGPass.cpp
diff --git a/lib/Transforms/Scalar/DxilLoopUnroll.cpp b/lib/Transforms/Scalar/DxilLoopUnroll.cpp
new file mode 100644
index 000000000..51806e4e3
--- /dev/null
+++ b/lib/Transforms/Scalar/DxilLoopUnroll.cpp
@@ -0,0 +1,1065 @@
+//===- DxilLoopUnroll.cpp - Special Unroll for Constant Values ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Special loop unroll routine for creating mandatory constant values and
+// loops that have exits.
+//
+// Overview of algorithm:
+// 
+// 1. Identify a set of blocks to unroll.
+//
+//    LLVM's concept of loop excludes exit blocks, which are blocks that no
+//    longer have a path to the loop latch. However, some exit blocks in HLSL
+//    also need to be unrolled. For example:
+//
+//        [unroll]
+//        for (uint i = 0; i < 4; i++)
+//        {
+//          if (...)
+//          {
+//            // This block here is an exit block, since it's
+//            // guaranteed to exit the loop.
+//            ...
+//            a[i] = ...; // Indexing requires unroll.
+//            return;
+//          }
+//        }
+//
+//
+// 2. Create LCSSA based on the new loop boundary.
+//
+//    See LCSSA.cpp for more details. It creates trivial PHI nodes for any
+//    outgoing values of the loop at the exit blocks, so when the loop body
+//    gets cloned, the outgoing values can be added to those PHI nodes easily.
+//
+//    We are using a modified LCSSA routine here because we are including some
+//    of the original exit blocks in the unroll.
+//
+//
+// 3. Unroll the loop until we succeed.
+//
+//    Unlike LLVM, we do not try to find a loop count before unrolling.
+//    Instead, we unroll to find a constant terminal condition. Give up when we
+//    fail to do so.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SetVector.h"
+
+#include "dxc/DXIL/DxilUtil.h"
+#include "dxc/HLSL/HLModule.h"
+
+using namespace llvm;
+using namespace hlsl;
+
+// Copied over from LoopUnroll.cpp - RemapInstruction()
+static inline void RemapInstruction(Instruction *I,
+                                    ValueToValueMapTy &VMap) {
+  // Remap operands through VMap; for PHI nodes the incoming blocks as well.
+  for (unsigned Idx = 0, NumOps = I->getNumOperands(); Idx != NumOps; ++Idx) {
+    ValueToValueMapTy::iterator Found = VMap.find(I->getOperand(Idx));
+    if (Found != VMap.end())
+      I->setOperand(Idx, Found->second);
+  }
+  PHINode *Phi = dyn_cast<PHINode>(I);
+  if (!Phi)
+    return;
+  for (unsigned Idx = 0, NumIn = Phi->getNumIncomingValues(); Idx != NumIn; ++Idx) {
+    ValueToValueMapTy::iterator Found = VMap.find(Phi->getIncomingBlock(Idx));
+    if (Found != VMap.end())
+      Phi->setIncomingBlock(Idx, cast<BasicBlock>(Found->second));
+  }
+}
+
+
+namespace {
+
+class DxilLoopUnroll : public LoopPass { // Loop pass handling loops that carry unroll metadata.
+public:
+  static char ID;
+
+  std::unordered_set<Function *> CleanedUpAlloca; // Functions on which runOnLoop already ran Mem2Reg.
+  unsigned MaxIterationAttempt = 0; // Upper bound on the number of iterations runOnLoop clones.
+
+  DxilLoopUnroll(unsigned MaxIterationAttempt = 128) :
+    LoopPass(ID),
+    MaxIterationAttempt(MaxIterationAttempt)
+  {
+    initializeDxilLoopUnrollPass(*PassRegistry::getPassRegistry());
+  }
+  const char *getPassName() const override { return "Dxil Loop Unroll"; }
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+};
+
+char DxilLoopUnroll::ID;
+
+static void FailLoopUnroll(bool WarnOnly, LLVMContext &Ctx, DebugLoc DL, const char *Message) {
+  // WarnOnly picks warning vs. error; the located format is used only when DL is set.
+  if (DL) {
+    if (WarnOnly)
+      Ctx.emitWarning(hlsl::dxilutil::FormatMessageAtLocation(DL, Message));
+    else
+      Ctx.emitError(hlsl::dxilutil::FormatMessageAtLocation(DL, Message));
+    return;
+  }
+  if (WarnOnly)
+    Ctx.emitWarning(hlsl::dxilutil::FormatMessageWithoutLocation(Message));
+  else
+    Ctx.emitError(hlsl::dxilutil::FormatMessageWithoutLocation(Message));
+}
+
+struct LoopIteration { // Bookkeeping for one cloned copy of the loop.
+  SmallVector<BasicBlock *, 16> Body; // Every block cloned for this iteration.
+  BasicBlock *Latch = nullptr; // Clone of the loop latch.
+  BasicBlock *Header = nullptr; // Clone of the loop header.
+  ValueToValueMapTy VarMap; // Maps original blocks/values to their clones in this iteration.
+  SetVector<BasicBlock *> Extended; // Blocks that are included in the clone that are not in the core loop body.
+  LoopIteration() {}
+};
+
+static bool GetConstantI1(Value *V, bool *Val=nullptr) {
+  // Only a ConstantInt of type i1 qualifies; *Val is written when Val is non-null.
+  ConstantInt *CI = dyn_cast<ConstantInt>(V);
+  if (!CI || !V->getType()->isIntegerTy(1))
+    return false;
+
+  if (Val)
+    *Val = (bool)CI->getLimitedValue();
+  return true;
+}
+
+// Copied from llvm::SimplifyInstructionsInBlock, with the dead-instruction deletion step disabled.
+static bool SimplifyInstructionsInBlock_NoDelete(BasicBlock *BB,
+                                       const TargetLibraryInfo *TLI) {
+  bool MadeChange = false;
+
+#ifndef NDEBUG
+  // In debug builds, ensure that the terminator of the block is never replaced
+  // or deleted by these simplifications. The idea of simplification is that it
+  // cannot introduce new instructions, and there is no way to replace the
+  // terminator of a block without introducing a new instruction.
+  AssertingVH<Instruction> TerminatorVH(--BB->end());
+#endif
+
+  for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) {
+    assert(!BI->isTerminator());
+    Instruction *Inst = BI++;
+
+    WeakVH BIHandle(BI); // Tracks the next instruction so a change to it can be detected below.
+    if (recursivelySimplifyInstruction(Inst, TLI)) {
+      MadeChange = true;
+      if (BIHandle != BI)
+        BI = BB->begin(); // Next instruction changed: restart from the top of the block.
+      continue;
+    }
+#if 0 // HLSL Change
+    MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+#endif // HLSL Change
+    if (BIHandle != BI)
+      BI = BB->begin();
+  }
+  return MadeChange;
+}
+
+static bool IsMarkedFullUnroll(Loop *L) {
+  // True when the loop ID carries "llvm.loop.unroll.full" metadata.
+  MDNode *LoopID = L->getLoopID();
+  return LoopID && GetUnrollMetadata(LoopID, "llvm.loop.unroll.full");
+}
+
+static bool IsMarkedUnrollCount(Loop *L, unsigned *OutCount) {
+  MDNode *LoopID = L->getLoopID();
+  if (!LoopID)
+    return false; // No loop metadata at all.
+  MDNode *CountMD = GetUnrollMetadata(LoopID, "llvm.loop.unroll.count");
+  if (!CountMD)
+    return false; // No explicit count hint; *OutCount is left untouched.
+  assert(CountMD->getNumOperands() == 2 &&
+         "Unroll count hint metadata should have two operands.");
+  unsigned Count = mdconst::extract<ConstantInt>(CountMD->getOperand(1))->getZExtValue();
+  assert(Count >= 1 && "Unroll count must be positive.");
+  *OutCount = Count;
+  return true;
+}
+
+static bool HasSuccessorsInLoop(BasicBlock *BB, Loop *L) {
+  // Scan BB's successors for one that L contains.
+  succ_iterator It = succ_begin(BB), End = succ_end(BB);
+  for (; It != End; ++It)
+    if (L->contains(*It))
+      return true;
+  return false;
+}
+
+static void DetachFromSuccessors(BasicBlock *BB) {
+  // Snapshot the successor list first, then drop BB as a predecessor of each.
+  SmallVector<BasicBlock *, 16> Targets(succ_begin(BB), succ_end(BB));
+  for (unsigned i = 0, e = Targets.size(); i != e; ++i)
+    Targets[i]->removePredecessor(BB);
+}
+
+/// Linear search: reports whether \p BB is one of the blocks in \p ExitBlocks.
+static bool isExitBlock(BasicBlock *BB,
+                        const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+  for (BasicBlock *Candidate : ExitBlocks)
+    if (Candidate == BB)
+      return true;
+  return false;
+}
+
+// Copied and modified from LCSSA.cpp: "inside the loop" is tested against Body instead of L.
+static bool processInstruction(SetVector<BasicBlock *> &Body, Loop &L, Instruction &Inst, DominatorTree &DT, // HLSL Change
+                               const SmallVectorImpl<BasicBlock *> &ExitBlocks,
+                               PredIteratorCache &PredCache, LoopInfo *LI) {
+
+  SmallVector<Use *, 16> UsesToRewrite;
+
+  BasicBlock *InstBB = Inst.getParent();
+
+  for (Use &U : Inst.uses()) {
+    Instruction *User = cast<Instruction>(U.getUser());
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User))
+      UserBB = PN->getIncomingBlock(U);
+
+    if (InstBB != UserBB && /*!L.contains(UserBB)*/!Body.count(UserBB)) // HLSL Change
+      UsesToRewrite.push_back(&U);
+  }
+
+  // If there are no uses outside the loop, exit with no change.
+  if (UsesToRewrite.empty())
+    return false;
+#if 0 // HLSL Change
+  ++NumLCSSA; // We are applying the transformation
+#endif // HLSL Change
+  // Invoke instructions are special in that their result value is not available
+  // along their unwind edge. The code below tests to see whether DomBB
+  // dominates
+  // the value, so adjust DomBB to the normal destination block, which is
+  // effectively where the value is first usable.
+  BasicBlock *DomBB = Inst.getParent();
+  if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst))
+    DomBB = Inv->getNormalDest();
+
+  DomTreeNode *DomNode = DT.getNode(DomBB);
+
+  SmallVector<PHINode *, 16> AddedPHIs;
+  SmallVector<PHINode *, 8> PostProcessPHIs;
+
+  SSAUpdater SSAUpdate;
+  SSAUpdate.Initialize(Inst.getType(), Inst.getName());
+
+  // Insert the LCSSA phi's into all of the exit blocks dominated by the
+  // value, and add them to the Phi's map.
+  for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(),
+                                                     BBE = ExitBlocks.end();
+       BBI != BBE; ++BBI) {
+    BasicBlock *ExitBB = *BBI;
+    if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
+      continue;
+
+    // If we already inserted something for this BB, don't reprocess it.
+    if (SSAUpdate.HasValueForBlock(ExitBB))
+      continue;
+
+    PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB),
+                                  Inst.getName() + ".lcssa", ExitBB->begin());
+
+    // Add inputs from inside the loop for this PHI.
+    for (BasicBlock *Pred : PredCache.get(ExitBB)) {
+      PN->addIncoming(&Inst, Pred);
+
+      // If the exit block has a predecessor not within the loop, arrange for
+      // the incoming value use corresponding to that predecessor to be
+      // rewritten in terms of a different LCSSA PHI.
+      if (/*!L.contains(Pred)*/ !Body.count(Pred)) // HLSL Change
+        UsesToRewrite.push_back(
+            &PN->getOperandUse(PN->getOperandNumForIncomingValue(
+                 PN->getNumIncomingValues() - 1)));
+    }
+
+    AddedPHIs.push_back(PN);
+
+    // Remember that this phi makes the value alive in this block.
+    SSAUpdate.AddAvailableValue(ExitBB, PN);
+
+    // LoopSimplify might fail to simplify some loops (e.g. when indirect
+    // branches are involved). In such situations, it might happen that an exit
+    // for Loop L1 is the header of a disjoint Loop L2. Thus, when we create
+    // PHIs in such an exit block, we are also inserting PHIs into L2's header.
+    // This could break LCSSA form for L2 because these inserted PHIs can also
+    // have uses outside of L2. Remember all PHIs in such situation as to
+    // revisit them later on. FIXME: Remove this if indirectbr support into
+    // LoopSimplify gets improved.
+    if (auto *OtherLoop = LI->getLoopFor(ExitBB))
+      if (!L.contains(OtherLoop))
+        PostProcessPHIs.push_back(PN);
+  }
+
+  // Rewrite all uses outside the loop in terms of the new PHIs we just
+  // inserted.
+  for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) {
+    // If this use is in an exit block, rewrite to use the newly inserted PHI.
+    // This is required for correctness because SSAUpdate doesn't handle uses in
+    // the same block.  It assumes the PHI we inserted is at the end of the
+    // block.
+    Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser());
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User))
+      UserBB = PN->getIncomingBlock(*UsesToRewrite[i]);
+
+    if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
+      // Tell the VHs that the uses changed. This updates SCEV's caches.
+      if (UsesToRewrite[i]->get()->hasValueHandle())
+        ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin());
+      UsesToRewrite[i]->set(UserBB->begin());
+      continue;
+    }
+
+    // Otherwise, do full PHI insertion.
+    SSAUpdate.RewriteUse(*UsesToRewrite[i]);
+  }
+
+  // Post process PHI instructions that were inserted into another disjoint loop
+  // and update their exits properly.
+  for (auto *I : PostProcessPHIs) {
+    if (I->use_empty())
+      continue;
+
+    BasicBlock *PHIBB = I->getParent();
+    Loop *OtherLoop = LI->getLoopFor(PHIBB);
+    SmallVector<BasicBlock *, 8> EBs;
+    OtherLoop->getExitBlocks(EBs);
+    if (EBs.empty())
+      continue;
+
+    // Recurse and re-process each PHI instruction. FIXME: we should really
+    // convert this entire thing to a worklist approach where we process a
+    // vector of instructions...
+    processInstruction(Body, *OtherLoop, *I, DT, EBs, PredCache, LI);
+  }
+
+  // Remove PHI nodes that did not have any uses rewritten.
+  for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) {
+    if (AddedPHIs[i]->use_empty())
+      AddedPHIs[i]->eraseFromParent();
+  }
+
+  return true;
+
+}
+
+// Copied from LCSSA.cpp: true if BB dominates at least one block in ExitBlocks.
+static bool blockDominatesAnExit(BasicBlock *BB,
+                     DominatorTree &DT,
+                     const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+  DomTreeNode *BBNode = DT.getNode(BB);
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (DT.dominates(BBNode, DT.getNode(ExitBlocks[i])))
+      return true;
+  return false;
+}
+
+// Copied from LCSSA.cpp
+//
+// Rebuilds LCSSA form around the block set `Body`, which may differ from the
+// canonical loop boundary, using `ExitBlocks` as the exits.
+static bool CreateLCSSA(SetVector<BasicBlock *> &Body, const SmallVectorImpl<BasicBlock *> &ExitBlocks, Loop *L, DominatorTree &DT, LoopInfo *LI) {
+
+  PredIteratorCache Preds;
+  bool AnyRewritten = false;
+
+  // Visit every instruction of every block in Body and rewrite the uses that
+  // live outside of Body.
+  for (BasicBlock *CurBB : Body) {
+    // Dominance shortcut for large loops: a block that dominates none of the
+    // exits cannot define a value that is used outside the loop, so its
+    // instructions never need to be scanned.
+    if (!blockDominatesAnExit(CurBB, DT, ExitBlocks))
+      continue;
+
+    for (Instruction &Inst : *CurBB) {
+      // Two cheap rejections: no uses at all (e.g. stores), or exactly one
+      // use by a non-PHI instruction located in this same block.
+      if (Inst.use_empty())
+        continue;
+      if (Inst.hasOneUse() && Inst.user_back()->getParent() == CurBB &&
+          !isa<PHINode>(Inst.user_back()))
+        continue;
+
+      if (processInstruction(Body, *L, Inst, DT, ExitBlocks, Preds, LI))
+        AnyRewritten = true;
+    }
+  }
+
+  return AnyRewritten;
+}
+
+static Value *GetGEPPtrOrigin(GEPOperator *GEP) {
+  // Peel nested GEPs off the pointer operand until the base is reached. Only
+  // an alloca or a global variable counts as an origin; any other kind of
+  // value (or a null pointer) yields nullptr. The origin itself is returned,
+  // not a copy or a cast of it.
+  Value *Cur = GEP->getPointerOperand();
+  for (;;) {
+    if (!Cur)
+      return nullptr;
+    if (isa<AllocaInst>(Cur) || isa<GlobalVariable>(Cur))
+      return Cur;
+
+    GEPOperator *Inner = dyn_cast<GEPOperator>(Cur);
+    if (!Inner)
+      return nullptr;
+    Cur = Inner->getPointerOperand();
+  }
+}
+
+// Find all blocks in the loop with instructions that
+// would require an unroll to be correct.
+//
+// For example:
+// for (int i = 0; i < 10; i++) {
+//   gep i
+// }
+// Starts at the header PHIs and follows users inside BlocksInLoop.
+static void FindProblemBlocks(BasicBlock *Header, const SmallVectorImpl<BasicBlock *> &BlocksInLoop, std::unordered_set<BasicBlock *> &ProblemBlocks, SetVector<AllocaInst *> &ProblemAllocas) {
+  SmallVector<Instruction *, 16> WorkList;
+
+  std::unordered_set<BasicBlock *> BlocksInLoopSet(BlocksInLoop.begin(), BlocksInLoop.end());
+  std::unordered_set<Instruction *> InstructionsSeen;
+
+  for (Instruction &I : *Header) {
+    PHINode *PN = dyn_cast<PHINode>(&I);
+    if (!PN)
+      break;
+    WorkList.push_back(PN);
+    InstructionsSeen.insert(PN);
+  }
+
+  while (WorkList.size()) {
+    Instruction *I = WorkList.pop_back_val();
+
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+      Type *EltType = GEP->getType()->getPointerElementType();
+
+      // NOTE: This is very conservative in the following conditions:
+      // - constant global resource arrays with external linkage (these can be
+      //   dynamically accessed)
+      // - global resource arrays or alloca resource arrays, as long as all
+      //   writes come from the same original resource definition (which can
+      //   also be an array).
+      //
+      // We may want to make this more precise in the future if it becomes a
+      // problem. Note: GlobalValue::isExternalLinkage(LinkageTypes) is a static
+      // predicate on its argument, so the GV's own linkage is queried instead.
+      if (hlsl::dxilutil::IsHLSLObjectType(EltType)) {
+        if (Value *Ptr = GetGEPPtrOrigin(cast<GEPOperator>(GEP))) {
+          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+            if (!GV->hasExternalLinkage())
+              ProblemBlocks.insert(GEP->getParent());
+          }
+          else if (AllocaInst *AI = dyn_cast<AllocaInst>(Ptr)) {
+            ProblemAllocas.insert(AI);
+            ProblemBlocks.insert(GEP->getParent());
+          }
+        }
+        continue; // Stop Propagating
+      }
+    }
+
+    for (User *U : I->users()) {
+      if (Instruction *UserI = dyn_cast<Instruction>(U)) {
+        if (!InstructionsSeen.count(UserI) &&
+          BlocksInLoopSet.count(UserI->getParent()))
+        {
+          InstructionsSeen.insert(UserI);
+          WorkList.push_back(UserI);
+        }
+      }
+    }
+  }
+}
+
+// Returns GEP index number `idx` (operand idx + 1) sign-extended; cast<> requires it to be a ConstantInt.
+inline static int64_t GetGEPIndex(GEPOperator *GEP, unsigned idx) {
+  return cast<ConstantInt>(GEP->getOperand(idx + 1))->getSExtValue();
+} 
+
+// Replace allocas with all constant indices with scalar allocas, then promote
+// them to values where possible (mem2reg).
+//
+// Before loop unroll, we did not have constant indices for arrays and SROA was
+// unable to break them into scalars. Now that unroll has potentially given
+// them constant values, we need to turn them into scalars.
+//
+// if "AllowOOBIndex" is true, it turns any out of bound index into 0.
+// Otherwise it emits an error and fails compilation.
+// Returns false if an out of bound access was diagnosed, true otherwise.
+template<typename IteratorT>
+static bool BreakUpArrayAllocas(bool AllowOOBIndex, IteratorT ItBegin, IteratorT ItEnd, DominatorTree *DT, AssumptionCache *AC) {
+  bool Success = true;
+
+  SmallVector<AllocaInst *, 8> WorkList(ItBegin, ItEnd);
+
+  SmallVector<GEPOperator *, 16> GEPs;
+  while (WorkList.size()) {
+    AllocaInst *AI = WorkList.pop_back_val();
+
+    Type *AllocaType = AI->getAllocatedType();
+
+    // Only deal with array allocas.
+    if (!AllocaType->isArrayTy())
+      continue;
+
+    unsigned ArraySize = AI->getAllocatedType()->getArrayNumElements();
+    Type *ElementType = AllocaType->getArrayElementType();
+    if (!ArraySize)
+      continue;
+
+    GEPs.clear(); // Re-use array
+    for (User *U : AI->users()) {
+      if (GEPOperator *GEP = dyn_cast<GEPOperator>(U)) {
+        if (!GEP->hasAllConstantIndices() || GEP->getNumIndices() < 2 ||
+          GetGEPIndex(GEP, 0) != 0)
+        {
+          GEPs.clear();
+          break;
+        }
+        else {
+          GEPs.push_back(GEP);
+        }
+      }
+      else {
+        GEPs.clear();
+        break;
+      }
+    }
+
+    if (!GEPs.size())
+      continue;
+
+    SmallVector<AllocaInst *, 8> ScalarAllocas;
+    ScalarAllocas.resize(ArraySize);
+
+    IRBuilder<> B(AI);
+    for (GEPOperator *GEP : GEPs) {
+      int64_t idx = GetGEPIndex(GEP, 1);
+      GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(GEP);
+
+      if (idx < 0 || idx >= ArraySize) {
+        if (AllowOOBIndex)
+          idx = 0;
+        else {
+          Success = false;
+          if (GEPInst)
+            hlsl::dxilutil::EmitErrorOnInstruction(GEPInst, "Array access out of bound.");
+          continue;
+        }
+      }
+      AllocaInst *ScalarAlloca = ScalarAllocas[idx];
+      if (!ScalarAlloca) {
+        ScalarAlloca = B.CreateAlloca(ElementType);
+        ScalarAllocas[idx] = ScalarAlloca;
+        if (ElementType->isArrayTy()) {
+          WorkList.push_back(ScalarAlloca);
+        }
+      }
+      Value *NewPointer = nullptr;
+      if (ElementType->isArrayTy()) {
+        SmallVector<Value *, 2> Indices;
+        Indices.push_back(B.getInt32(0));
+        for (unsigned i = 2; i < GEP->getNumIndices(); i++) {
+          Indices.push_back(GEP->getOperand(i + 1));
+        }
+        NewPointer = B.CreateGEP(ScalarAlloca, Indices);
+      } else {
+        NewPointer = ScalarAlloca;
+      }
+
+      GEP->replaceAllUsesWith(NewPointer);
+    }
+
+    if (!ElementType->isArrayTy()) {
+      ScalarAllocas.erase(std::remove(ScalarAllocas.begin(), ScalarAllocas.end(), nullptr), ScalarAllocas.end()); // std::remove alone keeps the size; drop the never-accessed slots.
+      PromoteMemToReg(ScalarAllocas, *DT, nullptr, AC);
+    }
+  }
+
+  return Success;
+}
+
+static bool ContainsFloatingPointType(Type *Ty) {
+  // Recursively looks through arrays, vectors and struct members for a
+  // floating point type.
+  if (Ty->isFloatingPointTy())
+    return true;
+  if (Ty->isArrayTy())
+    return ContainsFloatingPointType(Ty->getArrayElementType());
+  if (Ty->isVectorTy())
+    return ContainsFloatingPointType(Ty->getVectorElementType());
+  if (!Ty->isStructTy())
+    return false;
+
+  for (unsigned Idx = 0, Count = Ty->getStructNumElements(); Idx != Count; ++Idx) {
+    if (ContainsFloatingPointType(Ty->getStructElementType(Idx)))
+      return true;
+  }
+  return false;
+}
+
+static bool Mem2Reg(Function &F, DominatorTree &DT, AssumptionCache &AC) {
+  BasicBlock &Entry = F.getEntryBlock();
+  bool Promoted = false;
+  std::vector<AllocaInst*> Candidates;
+  for (;;) {
+    Candidates.clear();
+    // Collect promotable allocas from the entry block (terminator excluded).
+    // Allocas marked precise that contain floating point are left alone.
+    for (BasicBlock::iterator It = Entry.begin(), End = --Entry.end(); It != End; ++It) {
+      AllocaInst *AI = dyn_cast<AllocaInst>(It);
+      if (!AI || !isAllocaPromotable(AI))
+        continue;
+      if (HLModule::HasPreciseAttributeWithMetadata(AI) && ContainsFloatingPointType(AI->getAllocatedType()))
+        continue;
+      Candidates.push_back(AI);
+    }
+
+    if (Candidates.empty())
+      return Promoted;
+    PromoteMemToReg(Candidates, DT, nullptr, &AC);
+    Promoted = true;
+  }
+}
+
+
+
+bool DxilLoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+
+  bool HasExplicitLoopCount = false;
+  unsigned UnrollCount = 0;
+
+  // If the loop is not marked as [unroll], don't do anything.
+  if (IsMarkedUnrollCount(L, &UnrollCount)) {
+    HasExplicitLoopCount = true;
+  }
+  else if (!IsMarkedFullUnroll(L)) {
+    return false;
+  }
+
+  if (!L->isSafeToClone())
+    return false;
+
+  DebugLoc LoopLoc = L->getStartLoc(); // Debug location for the start of the loop.
+  Function *F = L->getHeader()->getParent();
+  bool FxcCompatMode = false;
+  if (F->getParent()->HasHLModule()) {
+    HLModule &HM = F->getParent()->GetHLModule();
+    FxcCompatMode = HM.GetHLOptions().bFXCCompatMode;
+  }
+
+  // Analysis passes
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  AssumptionCache *AC =
+    &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F);
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+  Loop *OuterL = L->getParentLoop();
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Predecessor = L->getLoopPredecessor();
+  const DataLayout &DL = F->getParent()->getDataLayout();
+
+  // Quit if we don't have a single latch block or predecessor
+  if (!Latch || !Predecessor) {
+    return false;
+  }
+
+  // If the loop exit condition is not in the latch, then the loop is not rotated. Give up.
+  if (!cast<BranchInst>(Latch->getTerminator())->isConditional()) {
+    return false;
+  }
+
+  // Promote alloca's
+  if (!CleanedUpAlloca.count(F)) {
+    CleanedUpAlloca.insert(F);
+    Mem2Reg(*F, *DT, *AC);
+  }
+
+  SmallVector<BasicBlock *, 16> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  std::unordered_set<BasicBlock *> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+
+  SmallVector<BasicBlock *, 16> BlocksInLoop; // Set of blocks including both body and exits
+  BlocksInLoop.append(L->getBlocks().begin(), L->getBlocks().end());
+  BlocksInLoop.append(ExitBlocks.begin(), ExitBlocks.end());
+
+  // Heuristically find blocks that likely need to be unrolled
+  SetVector<AllocaInst *> ProblemAllocas;
+  std::unordered_set<BasicBlock *> ProblemBlocks;
+  FindProblemBlocks(L->getHeader(), BlocksInLoop, ProblemBlocks, ProblemAllocas);
+
+  // Keep track of the PHI nodes at the header.
+  SmallVector<PHINode *, 16> PHIs;
+  for (auto it = Header->begin(); it != Header->end(); it++) {
+    if (PHINode *PN = dyn_cast<PHINode>(it)) {
+      PHIs.push_back(PN);
+    }
+    else {
+      break;
+    }
+  }
+
+  // Quick simplification of PHINode incoming values
+  for (PHINode *PN : PHIs) {
+    for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+      Value *OldIncomingV = PN->getIncomingValue(i);
+      if (Instruction *IncomingI = dyn_cast<Instruction>(OldIncomingV)) {
+        if (Value *NewIncomingV = llvm::SimplifyInstruction(IncomingI, DL)) {
+          PN->setIncomingValue(i, NewIncomingV);
+        }
+      }
+    }
+  }
+
+  SetVector<BasicBlock *> ToBeCloned; // List of blocks that will be cloned.
+  for (BasicBlock *BB : L->getBlocks()) // Include the body right away
+    ToBeCloned.insert(BB);
+
+  // Find the exit blocks that also need to be included
+  // in the unroll.
+  SmallVector<BasicBlock *, 8> NewExits; // New set of exit blocks as boundaries for LCSSA
+  SmallVector<BasicBlock *, 8> FakeExits; // Set of blocks created to allow cloning original exit blocks.
+  for (BasicBlock *BB : ExitBlocks) {
+    bool CloneThisExitBlock = ProblemBlocks.count(BB);
+
+    if (CloneThisExitBlock) {
+      ToBeCloned.insert(BB);
+
+      // If we are cloning this basic block, we must create a new exit
+      // block for inserting LCSSA PHI nodes.
+      BasicBlock *FakeExit = BasicBlock::Create(BB->getContext(), "loop.exit.new");
+      F->getBasicBlockList().insert(BB, FakeExit);
+
+      TerminatorInst *OldTerm = BB->getTerminator();
+      OldTerm->removeFromParent();
+      FakeExit->getInstList().push_back(OldTerm);
+
+      BranchInst::Create(FakeExit, BB);
+      for (BasicBlock *Succ : successors(FakeExit)) {
+        for (Instruction &I : *Succ) {
+          if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+            for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+              if (PN->getIncomingBlock(i) == BB)
+                PN->setIncomingBlock(i, FakeExit);
+            }
+          }
+        }
+      }
+
+      NewExits.push_back(FakeExit);
+      FakeExits.push_back(FakeExit);
+
+      // Update Dom tree with new exit
+      if (!DT->getNode(FakeExit))
+        DT->addNewBlock(FakeExit, BB);
+    }
+    else {
+      // If we are not including this exit block in the unroll,
+      // use it for LCSSA as normal.
+      NewExits.push_back(BB);
+    }
+  }
+
+  // Simplify the PHI nodes that have single incoming value. The original LCSSA form
+  // (if exists) does not necessarily work for our unroll because we may be unrolling
+  // from a different boundary.
+  for (BasicBlock *BB : BlocksInLoop)
+    hlsl::dxilutil::SimplifyTrivialPHIs(BB);
+
+  // Re-establish LCSSA form to get ready for unrolling.
+  CreateLCSSA(ToBeCloned, NewExits, L, *DT, LI);
+
+  SmallVector<std::unique_ptr<LoopIteration>, 16> Iterations; // List of cloned iterations
+  bool Succeeded = false;
+
+  for (unsigned IterationI = 0; IterationI < this->MaxIterationAttempt; IterationI++) {
+
+    LoopIteration *PrevIteration = nullptr;
+    if (Iterations.size())
+      PrevIteration = Iterations.back().get();
+    Iterations.push_back(llvm::make_unique<LoopIteration>());
+    LoopIteration &CurIteration = *Iterations.back().get();
+
+    // Clone the blocks.
+    for (BasicBlock *BB : ToBeCloned) {
+
+      BasicBlock *ClonedBB = CloneBasicBlock(BB, CurIteration.VarMap);
+      CurIteration.VarMap[BB] = ClonedBB;
+      ClonedBB->insertInto(F, Header);
+
+      if (ExitBlockSet.count(BB))
+        CurIteration.Extended.insert(ClonedBB);
+
+      CurIteration.Body.push_back(ClonedBB);
+
+      // Identify the special blocks.
+      if (BB == Latch) {
+        CurIteration.Latch = ClonedBB;
+      }
+      if (BB == Header) {
+        CurIteration.Header = ClonedBB;
+      }
+    }
+
+    for (BasicBlock *BB : ToBeCloned) {
+      BasicBlock *ClonedBB = cast<BasicBlock>(CurIteration.VarMap[BB]);
+      // If branching to outside of the loop, need to update the
+      // phi nodes there to include new values.
+      for (BasicBlock *Succ : successors(ClonedBB)) {
+        if (ToBeCloned.count(Succ))
+          continue;
+        for (Instruction &I : *Succ) {
+          PHINode *PN = dyn_cast<PHINode>(&I);
+          if (!PN)
+            break;
+
+          // Find the incoming value for this new block. If there is an entry
+          // for this block in the map, then it was defined in the loop, use it.
+          // Otherwise it came from outside the loop.
+          Value *OldIncoming = PN->getIncomingValueForBlock(BB);
+          Value *NewIncoming = OldIncoming;
+          ValueToValueMapTy::iterator Itor = CurIteration.VarMap.find(OldIncoming);
+          if (Itor != CurIteration.VarMap.end())
+            NewIncoming = Itor->second;
+          PN->addIncoming(NewIncoming, ClonedBB);
+        }
+      }
+    }
+
+    // Remap the instructions inside of cloned blocks.
+    for (BasicBlock *BB : CurIteration.Body) {
+      for (Instruction &I : *BB) {
+        ::RemapInstruction(&I, CurIteration.VarMap);
+      }
+    }
+
+    // If this is the first iteration
+    if (!PrevIteration) {
+      // Replace the phi nodes in the clone block with the values coming
+      // from outside of the loop
+      for (PHINode *PN : PHIs) {
+        PHINode *ClonedPN = cast<PHINode>(CurIteration.VarMap[PN]);
+        Value *ReplacementVal = ClonedPN->getIncomingValueForBlock(Predecessor);
+        ClonedPN->replaceAllUsesWith(ReplacementVal);
+        ClonedPN->eraseFromParent();
+        CurIteration.VarMap[PN] = ReplacementVal;
+      }
+    }
+    else {
+      // Replace the phi nodes with the value defined INSIDE the previous iteration.
+      for (PHINode *PN : PHIs) {
+        PHINode *ClonedPN = cast<PHINode>(CurIteration.VarMap[PN]);
+        Value *ReplacementVal = PrevIteration->VarMap[PN->getIncomingValueForBlock(Latch)];
+        ClonedPN->replaceAllUsesWith(ReplacementVal);
+        ClonedPN->eraseFromParent();
+        CurIteration.VarMap[PN] = ReplacementVal;
+      }
+
+      // Make the latch of the previous iteration branch to the header
+      // of this new iteration.
+      if (BranchInst *BI = dyn_cast<BranchInst>(PrevIteration->Latch->getTerminator())) {
+        for (unsigned i = 0; i < BI->getNumSuccessors(); i++) {
+          if (BI->getSuccessor(i) == PrevIteration->Header) {
+            BI->setSuccessor(i, CurIteration.Header);
+            break;
+          }
+        }
+      }
+    }
+
+    // Simplify instructions in the cloned blocks to create
+    // constant exit conditions.
+    for (BasicBlock *ClonedBB : CurIteration.Body)
+      SimplifyInstructionsInBlock_NoDelete(ClonedBB, NULL);
+
+    // Check exit condition to see if we fully unrolled the loop
+    if (BranchInst *BI = dyn_cast<BranchInst>(CurIteration.Latch->getTerminator())) {
+      bool Cond = false;
+      if (GetConstantI1(BI->getCondition(), &Cond)) {
+        if (BI->getSuccessor(Cond ? 1 : 0) == CurIteration.Header) {
+          Succeeded = true;
+          break;
+        }
+      }
+    }
+
+    // We've reached the N defined in [unroll(N)]
+    if (HasExplicitLoopCount && IterationI+1 >= UnrollCount) {
+      Succeeded = true;
+      BranchInst *BI = cast<BranchInst>(CurIteration.Latch->getTerminator());
+
+      BasicBlock *ExitBlock = nullptr;
+      for (unsigned i = 0; i < BI->getNumSuccessors(); i++) {
+        BasicBlock *Succ = BI->getSuccessor(i);
+        if (Succ != CurIteration.Header) {
+          ExitBlock = Succ;
+          break;
+        }
+      }
+
+      BranchInst *NewBI = BranchInst::Create(ExitBlock, BI);
+      BI->replaceAllUsesWith(NewBI);
+      BI->eraseFromParent();
+
+      break;
+    }
+  }
+
+  if (Succeeded) {
+    // We are going to be cleaning them up later. Make sure
+    // they're in the entry block so deleting loop blocks doesn't
+    // kill them too.
+    for (AllocaInst *AI : ProblemAllocas)
+      DXASSERT(AI->getParent() == &F->getEntryBlock(), "Alloca is not in entry block.");
+
+    LoopIteration &FirstIteration = *Iterations.front().get();
+    // Make the predecessor branch to the first new header.
+    {
+      BranchInst *BI = cast<BranchInst>(Predecessor->getTerminator());
+      for (unsigned i = 0, NumSucc = BI->getNumSuccessors(); i < NumSucc; i++) {
+        if (BI->getSuccessor(i) == Header) {
+          BI->setSuccessor(i, FirstIteration.Header);
+        }
+      }
+    }
+
+    if (OuterL) {
+
+      // Core body blocks need to be added to outer loop
+      for (size_t i = 0; i < Iterations.size(); i++) {
+        LoopIteration &Iteration = *Iterations[i].get();
+        for (BasicBlock *BB : Iteration.Body) {
+          if (!Iteration.Extended.count(BB)) {
+            OuterL->addBasicBlockToLoop(BB, *LI);
+          }
+        }
+      }
+
+      // Our newly created exit blocks may need to be added to outer loop
+      for (BasicBlock *BB : FakeExits) {
+        if (HasSuccessorsInLoop(BB, OuterL))
+          OuterL->addBasicBlockToLoop(BB, *LI);
+      }
+
+      // Cloned exit blocks may need to be added to outer loop
+      for (size_t i = 0; i < Iterations.size(); i++) {
+        LoopIteration &Iteration = *Iterations[i].get();
+        for (BasicBlock *BB : Iteration.Extended) {
+          if (HasSuccessorsInLoop(BB, OuterL))
+            OuterL->addBasicBlockToLoop(BB, *LI);
+        }
+      }
+    }
+
+    // Remove the original blocks that we've cloned from all loops.
+    for (BasicBlock *BB : ToBeCloned)
+      LI->removeBlock(BB);
+
+    LPM.deleteLoopFromQueue(L);
+
+    // Remove dead blocks.
+    for (BasicBlock *BB : ToBeCloned)
+      DetachFromSuccessors(BB);
+    for (BasicBlock *BB : ToBeCloned)
+      BB->dropAllReferences();
+    for (BasicBlock *BB : ToBeCloned)
+      BB->eraseFromParent();
+
+    // Blocks need to be removed from DomTree. There's no easy way
+    // to remove them in the right order, so just make DomTree
+    // recalculate.
+    DT->recalculate(*F);
+
+    if (OuterL) {
+      // This process may have created multiple back edges for the
+      // parent loop. Simplify to keep it well-formed.
+      simplifyLoop(OuterL, DT, LI, this, nullptr, nullptr, AC);
+    }
+
+    // Now that we potentially turned some GEP indices into constants,
+    // try to clean up their allocas.
+    if (!BreakUpArrayAllocas(FxcCompatMode /* allow oob index */, ProblemAllocas.begin(), ProblemAllocas.end(), DT, AC)) {
+      FailLoopUnroll(false, F->getContext(), LoopLoc, "Could not unroll loop due to out of bound array access.");
+    }
+
+    return true;
+  }
+
+  // If we were unsuccessful in unrolling the loop
+  else {
+    FailLoopUnroll(FxcCompatMode /*warn only*/, F->getContext(), LoopLoc, "Could not unroll loop.");
+
+    // Remove all the cloned blocks
+    for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
+      LoopIteration &Iteration = *Ptr.get();
+      for (BasicBlock *BB : Iteration.Body)
+        DetachFromSuccessors(BB);
+    }
+    for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
+      LoopIteration &Iteration = *Ptr.get();
+      for (BasicBlock *BB : Iteration.Body)
+        BB->dropAllReferences();
+    }
+    for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
+      LoopIteration &Iteration = *Ptr.get();
+      for (BasicBlock *BB : Iteration.Body)
+        BB->eraseFromParent();
+    }
+
+    return false;
+  }
+}
+
+}
+
+Pass *llvm::createDxilLoopUnrollPass(unsigned MaxIterationAttempt) {
+  return new DxilLoopUnroll(MaxIterationAttempt);
+}
+
+INITIALIZE_PASS(DxilLoopUnroll, "dxil-loop-unroll", "Dxil Unroll loops", false, false)
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 0a36d1d9e..395fbc2cc 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -222,7 +222,8 @@ namespace {
 class AllocaSlices {
 public:
   /// \brief Construct the slices of a particular alloca.
-  AllocaSlices(const DataLayout &DL, AllocaInst &AI);
+  AllocaSlices(const DataLayout &DL, AllocaInst &AI,
+               const bool SkipHLSLMat); // HLSL Change - not sroa matrix type.
 
   /// \brief Test whether a pointer to the allocation escapes our analysis.
   ///
@@ -633,6 +634,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
   friend class InstVisitor<SliceBuilder>;
   typedef PtrUseVisitor<SliceBuilder> Base;
 
+  const bool SkipHLSLMat; // HLSL Change - not sroa matrix type.
   const uint64_t AllocSize;
   AllocaSlices &AS;
 
@@ -643,8 +645,10 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
   SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
 
 public:
-  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
+  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS,
+               const bool SkipHLSLMat)
       : PtrUseVisitor<SliceBuilder>(DL),
+        SkipHLSLMat(SkipHLSLMat), // HLSL Change - not sroa matrix type.
         AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
 
 private:
@@ -690,7 +694,24 @@ private:
   void visitBitCastInst(BitCastInst &BC) {
     if (BC.use_empty())
       return markAsDead(BC);
-
+    // HLSL Change Begin - not sroa matrix type.
+    if (PointerType *PT = dyn_cast<PointerType>(BC.getType())) {
+      Type *EltTy = PT->getElementType();
+      if ((SkipHLSLMat && hlsl::dxilutil::IsHLSLMatrixType(EltTy)) ||
+          hlsl::dxilutil::IsHLSLObjectType(EltTy)) {
+        AS.PointerEscapingInstr = &BC;
+        return;
+      }
+      if (PointerType *SrcPT = dyn_cast<PointerType>(BC.getSrcTy())) {
+        Type *SrcEltTy = SrcPT->getElementType();
+        if ((SkipHLSLMat && hlsl::dxilutil::IsHLSLMatrixType(SrcEltTy)) ||
+            hlsl::dxilutil::IsHLSLObjectType(SrcEltTy)) {
+          AS.PointerEscapingInstr = &BC;
+          return;
+        }
+      }
+    }
+    // HLSL Change End.
     return Base::visitBitCastInst(BC);
   }
 
@@ -751,9 +772,15 @@ private:
   }
 
   void visitLoadInst(LoadInst &LI) {
+    // HLSL Change Begin - not sroa matrix type.
+    if ((SkipHLSLMat && hlsl::dxilutil::IsHLSLMatrixType(LI.getType())) ||
+        hlsl::dxilutil::IsHLSLObjectType(LI.getType()))
+      return PI.setEscapedAndAborted(&LI);
+    // HLSL Change End.
     assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
            "All simple FCA loads should have been pre-split");
 
+
     if (!IsOffsetKnown)
       return PI.setAborted(&LI);
 
@@ -766,6 +793,12 @@ private:
     Value *ValOp = SI.getValueOperand();
     if (ValOp == *U)
       return PI.setEscapedAndAborted(&SI);
+    // HLSL Change Begin - not sroa matrix type.
+    if ((SkipHLSLMat && hlsl::dxilutil::IsHLSLMatrixType(ValOp->getType())) ||
+        hlsl::dxilutil::IsHLSLObjectType(ValOp->getType()))
+      return PI.setEscapedAndAborted(&SI);
+    // HLSL Change End.
+
     if (!IsOffsetKnown)
       return PI.setAborted(&SI);
 
@@ -1002,13 +1035,15 @@ private:
   void visitInstruction(Instruction &I) { PI.setAborted(&I); }
 };
 
-AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
+AllocaSlices::AllocaSlices(
+    const DataLayout &DL, AllocaInst &AI,
+    const bool SkipHLSLMat) // HLSL Change - not sroa matrix type.
     :
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
       AI(AI),
 #endif
       PointerEscapingInstr(nullptr) {
-  SliceBuilder PB(DL, AI, *this);
+  SliceBuilder PB(DL, AI, *this, SkipHLSLMat);
   SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
   if (PtrI.isEscaped() || PtrI.isAborted()) {
     // FIXME: We should sink the escape vs. abort info into the caller nicely,
@@ -1204,6 +1239,7 @@ namespace {
 ///    SSA vector values.
 class SROA : public FunctionPass {
   const bool RequiresDomTree;
+  const bool SkipHLSLMat; // HLSL Change - not sroa matrix type.
 
   LLVMContext *C;
   DominatorTree *DT;
@@ -1252,9 +1288,10 @@ class SROA : public FunctionPass {
   SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects;
 
 public:
-  SROA(bool RequiresDomTree = true)
-      : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
-        DT(nullptr) {
+  SROA(bool RequiresDomTree = true, bool SkipHLSLMat = true)
+      : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
+        SkipHLSLMat(SkipHLSLMat), // HLSL Change - not sroa matrix type.
+        C(nullptr), DT(nullptr) {
     initializeSROAPass(*PassRegistry::getPassRegistry());
   }
   bool runOnFunction(Function &F) override;
@@ -1280,8 +1317,8 @@ private:
 
 char SROA::ID = 0;
 
-FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
-  return new SROA(RequiresDomTree);
+FunctionPass *llvm::createSROAPass(bool RequiresDomTree, bool SkipHLSLMat) {
+  return new SROA(RequiresDomTree, SkipHLSLMat);
 }
 
 INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
@@ -3191,6 +3228,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
   friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
 
   const DataLayout &DL;
+  const bool SkipHLSLMat; // HLSL Change - not sroa matrix type.
 
   /// Queue of pointer uses to analyze and potentially rewrite.
   SmallVector<Use *, 8> Queue;
@@ -3203,7 +3241,9 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
   Use *U;
 
 public:
-  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
+  AggLoadStoreRewriter(const DataLayout &DL, const bool SkipHLSLMat)
+      // HLSL Change - not sroa matrix type.
+      : DL(DL), SkipHLSLMat(SkipHLSLMat) {}
 
   /// Rewrite loads and stores through a pointer and all pointers derived from
   /// it.
@@ -3323,6 +3363,11 @@ private:
     assert(LI.getPointerOperand() == *U);
     if (!LI.isSimple() || LI.getType()->isSingleValueType())
       return false;
+    // HLSL Change Begin - not sroa matrix type.
+    if ((SkipHLSLMat && hlsl::dxilutil::IsHLSLMatrixType(LI.getType())) ||
+        hlsl::dxilutil::IsHLSLObjectType(LI.getType()))
+      return false;
+    // HLSL Change End.
 
     // We have an aggregate being loaded, split it apart.
     DEBUG(dbgs() << "    original: " << LI << "\n");
@@ -3357,7 +3402,11 @@ private:
     Value *V = SI.getValueOperand();
     if (V->getType()->isSingleValueType())
       return false;
-
+    // HLSL Change Begin - not sroa matrix type.
+    if ((SkipHLSLMat && hlsl::dxilutil::IsHLSLMatrixType(V->getType())) ||
+        hlsl::dxilutil::IsHLSLObjectType(V->getType()))
+      return false;
+    // HLSL Change End.
     // We have an aggregate being stored, split it apart.
     DEBUG(dbgs() << "    original: " << SI << "\n");
     StoreOpSplitter Splitter(&SI, *U);
@@ -3367,6 +3416,20 @@ private:
   }
 
   bool visitBitCastInst(BitCastInst &BC) {
+    // HLSL Change Begin - not sroa matrix type.
+    if (PointerType *PT = dyn_cast<PointerType>(BC.getType())) {
+      Type *EltTy = PT->getElementType();
+      if ((SkipHLSLMat && hlsl::dxilutil::IsHLSLMatrixType(EltTy)) ||
+          hlsl::dxilutil::IsHLSLObjectType(EltTy))
+        return false;
+      if (PointerType *SrcPT = dyn_cast<PointerType>(BC.getSrcTy())) {
+        Type *SrcEltTy = SrcPT->getElementType();
+        if ((SkipHLSLMat && hlsl::dxilutil::IsHLSLMatrixType(SrcEltTy)) ||
+            hlsl::dxilutil::IsHLSLObjectType(SrcEltTy))
+          return false;
+      }
+    }
+    // HLSL Change End.
     enqueueUsers(BC);
     return false;
   }
@@ -4310,7 +4373,12 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
 
   // Skip alloca forms that this analysis can't handle.
   if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
-      hlsl::dxilutil::IsHLSLObjectType(AI.getAllocatedType()) || // HLSL Change - not sroa resource type.
+      hlsl::dxilutil::IsHLSLObjectType(
+          AI.getAllocatedType()) || // HLSL Change - not sroa resource type.
+      // HLSL Change Begin - not sroa matrix type.
+      (SkipHLSLMat &&
+       hlsl::dxilutil::IsHLSLMatrixType(AI.getAllocatedType())) ||
+      // HLSL Change End.
       DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
     return false;
 
@@ -4318,11 +4386,11 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
 
   // First, split any FCA loads and stores touching this alloca to promote
   // better splitting and promotion opportunities.
-  AggLoadStoreRewriter AggRewriter(DL);
+  AggLoadStoreRewriter AggRewriter(DL, SkipHLSLMat);
   Changed |= AggRewriter.rewrite(AI);
 
   // Build the slices using a recursive instruction-visiting builder.
-  AllocaSlices AS(DL, AI);
+  AllocaSlices AS(DL, AI, SkipHLSLMat);
   DEBUG(AS.print(dbgs()));
   if (AS.isEscaped())
     return Changed;
diff --git a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
index 9e8446025..0d43dbc2c 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
@@ -1639,7 +1639,7 @@ bool SROA_HLSL::performScalarRepl(Function &F, DxilTypeSystem &typeSys) {
         Type *Ty = AI->getAllocatedType();
         // Skip empty struct parameters.
         if (StructType *ST = dyn_cast<StructType>(Ty)) {
-          if (!HLMatrixLower::IsMatrixType(Ty)) {
+          if (!dxilutil::IsHLSLMatrixType(Ty)) {
             DxilStructAnnotation *SA = typeSys.GetStructAnnotation(ST);
             if (SA && SA->IsEmptyStruct()) {
               for (User *U : AI->users()) {
@@ -1884,7 +1884,7 @@ void SROA_HLSL::isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset,
 
   for (;GEPIt != E; ++GEPIt) {
     Type *Ty = *GEPIt;
-    if (Ty->isStructTy() && !HLMatrixLower::IsMatrixType(Ty)) {
+    if (Ty->isStructTy() && !dxilutil::IsHLSLMatrixType(Ty)) {
       // Don't go inside struct when mark hasArrayIndexing and hasVectorIndexing.
       // The following level won't affect scalar repl on the struct.
       break;
@@ -2250,7 +2250,7 @@ static void EltMemCpy(Type *Ty, Value *Dest, Value *Src,
 static bool IsMemCpyTy(Type *Ty, DxilTypeSystem &typeSys) {
   if (!Ty->isAggregateType())
     return false;
-  if (HLMatrixLower::IsMatrixType(Ty))
+  if (dxilutil::IsHLSLMatrixType(Ty))
     return false;
   if (dxilutil::IsHLSLObjectType(Ty))
     return false;
@@ -2272,7 +2272,7 @@ static bool IsMemCpyTy(Type *Ty, DxilTypeSystem &typeSys) {
 static void SplitCpy(Type *Ty, Value *Dest, Value *Src,
                      SmallVector<Value *, 16> &idxList, IRBuilder<> &Builder,
                      const DataLayout &DL, DxilTypeSystem &typeSys,
-                     DxilFieldAnnotation *fieldAnnotation, const bool bEltMemCpy = true) {
+                     const DxilFieldAnnotation *fieldAnnotation, const bool bEltMemCpy = true) {
   if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
     Constant *idx = Constant::getIntegerValue(
         IntegerType::get(Ty->getContext(), 32), APInt(32, 0));
@@ -2282,7 +2282,7 @@ static void SplitCpy(Type *Ty, Value *Dest, Value *Src,
              fieldAnnotation, bEltMemCpy);
 
     idxList.pop_back();
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
     // If no fieldAnnotation, use row major as default.
     // Only load then store immediately should be fine.
     bool bRowMajor = true;
@@ -2293,31 +2293,31 @@ static void SplitCpy(Type *Ty, Value *Dest, Value *Src,
                   MatrixOrientation::RowMajor;
     }
     Module *M = Builder.GetInsertPoint()->getModule();
-    Value *DestGEP = Builder.CreateInBoundsGEP(Dest, idxList);
-    Value *SrcGEP = Builder.CreateInBoundsGEP(Src, idxList);
-    if (bRowMajor) {
-      Value *Load = HLModule::EmitHLOperationCall(
-          Builder, HLOpcodeGroup::HLMatLoadStore,
-          static_cast<unsigned>(HLMatLoadStoreOpcode::RowMatLoad), Ty, {SrcGEP},
-          *M);
 
-      // Generate Matrix Store.
-      HLModule::EmitHLOperationCall(
-          Builder, HLOpcodeGroup::HLMatLoadStore,
-          static_cast<unsigned>(HLMatLoadStoreOpcode::RowMatStore), Ty,
-          {DestGEP, Load}, *M);
-    } else {
-      Value *Load = HLModule::EmitHLOperationCall(
-          Builder, HLOpcodeGroup::HLMatLoadStore,
-          static_cast<unsigned>(HLMatLoadStoreOpcode::ColMatLoad), Ty, {SrcGEP},
-          *M);
-
-      // Generate Matrix Store.
-      HLModule::EmitHLOperationCall(
-          Builder, HLOpcodeGroup::HLMatLoadStore,
-          static_cast<unsigned>(HLMatLoadStoreOpcode::ColMatStore), Ty,
-          {DestGEP, Load}, *M);
+    Value *DestMatPtr;
+    Value *SrcMatPtr;
+    if (idxList.size() == 1 && idxList[0] == ConstantInt::get(
+      IntegerType::get(Ty->getContext(), 32), APInt(32, 0))) {
+      // Avoid creating GEP(0)
+      DestMatPtr = Dest;
+      SrcMatPtr = Src;
     }
+    else {
+      DestMatPtr = Builder.CreateInBoundsGEP(Dest, idxList);
+      SrcMatPtr = Builder.CreateInBoundsGEP(Src, idxList);
+    }
+
+    HLMatLoadStoreOpcode loadOp = bRowMajor
+      ? HLMatLoadStoreOpcode::RowMatLoad : HLMatLoadStoreOpcode::ColMatLoad;
+    HLMatLoadStoreOpcode storeOp = bRowMajor
+      ? HLMatLoadStoreOpcode::RowMatStore : HLMatLoadStoreOpcode::ColMatStore;
+
+    Value *Load = HLModule::EmitHLOperationCall(
+      Builder, HLOpcodeGroup::HLMatLoadStore, static_cast<unsigned>(loadOp),
+      Ty, { SrcMatPtr }, *M);
+    HLModule::EmitHLOperationCall(
+      Builder, HLOpcodeGroup::HLMatLoadStore, static_cast<unsigned>(storeOp),
+      Ty, { DestMatPtr, Load }, *M);
   } else if (StructType *ST = dyn_cast<StructType>(Ty)) {
     if (dxilutil::IsHLSLObjectType(ST)) {
       // Avoid split HLSL object.
@@ -2365,44 +2365,55 @@ static void SplitCpy(Type *Ty, Value *Dest, Value *Src,
   }
 }
 
-static void SplitPtr(Type *Ty, Value *Ptr, SmallVector<Value *, 16> &idxList,
-                     SmallVector<Value *, 16> &EltPtrList,
-                     IRBuilder<> &Builder) {
+// Given a pointer to a value, produces a list of pointers to its leaf elements and their
+// field annotations. Structs are expanded at any nesting level; matrices, HLSL objects and arrays are not split.
+static void SplitPtr(Value *Ptr, // The root value pointer
+  SmallVectorImpl<Value *> &IdxList, // GEP indices stack during recursion
+  Type *Ty, // Type at the current GEP indirection level
+  const DxilFieldAnnotation &Annotation, // Annotation at the current GEP indirection level
+  SmallVectorImpl<Value *> &EltPtrList, // Accumulates pointers to each element found
+  SmallVectorImpl<const DxilFieldAnnotation*> &EltAnnotationList, // Accumulates field annotations for each element found
+  DxilTypeSystem &TypeSys,
+  IRBuilder<> &Builder) {
+
   if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
     Constant *idx = Constant::getIntegerValue(
         IntegerType::get(Ty->getContext(), 32), APInt(32, 0));
-    idxList.emplace_back(idx);
+    IdxList.emplace_back(idx);
 
-    SplitPtr(PT->getElementType(), Ptr, idxList, EltPtrList, Builder);
+    SplitPtr(Ptr, IdxList, PT->getElementType(), Annotation,
+      EltPtrList, EltAnnotationList, TypeSys, Builder);
 
-    idxList.pop_back();
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
-    Value *GEP = Builder.CreateInBoundsGEP(Ptr, idxList);
-    EltPtrList.emplace_back(GEP);
-  } else if (StructType *ST = dyn_cast<StructType>(Ty)) {
-    if (dxilutil::IsHLSLObjectType(ST)) {
-      // Avoid split HLSL object.
-      Value *GEP = Builder.CreateInBoundsGEP(Ptr, idxList);
-      EltPtrList.emplace_back(GEP);
-      return;
-    }
-    for (uint32_t i = 0; i < ST->getNumElements(); i++) {
-      llvm::Type *ET = ST->getElementType(i);
+    IdxList.pop_back();
+    return;
+  }
+  
+  if (StructType *ST = dyn_cast<StructType>(Ty)) {
+    if (!dxilutil::IsHLSLMatrixType(Ty) && !dxilutil::IsHLSLObjectType(ST)) {
+      const DxilStructAnnotation* SA = TypeSys.GetStructAnnotation(ST);
 
-      Constant *idx = llvm::Constant::getIntegerValue(
+      for (uint32_t i = 0; i < ST->getNumElements(); i++) {
+        llvm::Type *EltTy = ST->getElementType(i);
+
+        Constant *idx = llvm::Constant::getIntegerValue(
           IntegerType::get(Ty->getContext(), 32), APInt(32, i));
-      idxList.emplace_back(idx);
+        IdxList.emplace_back(idx);
 
-      SplitPtr(ET, Ptr, idxList, EltPtrList, Builder);
+        SplitPtr(Ptr, IdxList, EltTy, SA->GetFieldAnnotation(i),
+          EltPtrList, EltAnnotationList, TypeSys, Builder);
 
-      idxList.pop_back();
-    }
-
-  } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
-    if (AT->getNumContainedTypes() == 0) {
-      // Skip case like [0 x %struct].
+        IdxList.pop_back();
+      }
       return;
     }
+  }
+  
+  if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+    if (AT->getArrayNumElements() == 0) {
+      // Skip cases like [0 x %struct], nothing to copy
+      return;
+    }
+
     Type *ElTy = AT->getElementType();
     SmallVector<ArrayType *, 4> nestArrayTys;
 
@@ -2414,19 +2425,16 @@ static void SplitPtr(Type *Ty, Value *Ptr, SmallVector<Value *, 16> &idxList,
       ElTy = ElAT->getElementType();
     }
 
-    if (!ElTy->isStructTy() ||
-        HLMatrixLower::IsMatrixType(ElTy)) {
-      // Not split array of basic type.
-      Value *GEP = Builder.CreateInBoundsGEP(Ptr, idxList);
-      EltPtrList.emplace_back(GEP);
-    }
-    else {
+    if (ElTy->isStructTy() && !dxilutil::IsHLSLMatrixType(ElTy)) {
       DXASSERT(0, "Not support array of struct when split pointers.");
+      return;
     }
-  } else {
-    Value *GEP = Builder.CreateInBoundsGEP(Ptr, idxList);
-    EltPtrList.emplace_back(GEP);
   }
+
+  // Return a pointer to the current element and its annotation
+  Value *GEP = Builder.CreateInBoundsGEP(Ptr, IdxList);
+  EltPtrList.emplace_back(GEP);
+  EltAnnotationList.emplace_back(&Annotation);
 }
 
 // Support case when bitcast (gep ptr, 0,0) is transformed into bitcast ptr.
@@ -2435,7 +2443,7 @@ static unsigned MatchSizeByCheckElementType(Type *Ty, const DataLayout &DL, unsi
   // Size match, return current level.
   if (ptrSize == size) {
     // Not go deeper for matrix.
-    if (HLMatrixLower::IsMatrixType(Ty))
+    if (dxilutil::IsHLSLMatrixType(Ty))
       return level;
     // For struct, go deeper if size not change.
     // This will leave memcpy to deeper level when flatten.
@@ -2588,7 +2596,7 @@ void MemcpySplitter::SplitMemCpy(MemCpyInst *MI, const DataLayout &DL,
   // Try to find fieldAnnotation from user of Dest/Src.
   if (!fieldAnnotation) {
     Type *EltTy = dxilutil::GetArrayEltTy(DestTy);
-    if (HLMatrixLower::IsMatrixType(EltTy)) {
+    if (dxilutil::IsHLSLMatrixType(EltTy)) {
       fieldAnnotation = HLMatrixLower::FindAnnotationFromMatUser(Dest, typeSys);
     }
   }
@@ -2837,7 +2845,7 @@ void SROA_Helper::RewriteForLoad(LoadInst *LI) {
         Value *Ptr = NewElts[i];
         Type *Ty = Ptr->getType()->getPointerElementType();
         Value *Load = nullptr;
-        if (!HLMatrixLower::IsMatrixType(Ty))
+        if (!dxilutil::IsHLSLMatrixType(Ty))
           Load = Builder.CreateLoad(Ptr, "load");
         else {
           // Generate Matrix Load.
@@ -2922,7 +2930,7 @@ void SROA_Helper::RewriteForStore(StoreInst *SI) {
       Module *M = SI->getModule();
       for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
         Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
-        if (!HLMatrixLower::IsMatrixType(Extract->getType())) {
+        if (!dxilutil::IsHLSLMatrixType(Extract->getType())) {
           Builder.CreateStore(Extract, NewElts[i]);
         } else {
           // Generate Matrix Store.
@@ -3234,9 +3242,15 @@ void SROA_Helper::RewriteCall(CallInst *CI) {
         Function *flatF =
             GetOrCreateHLFunction(*F->getParent(), flatFuncTy, group, opcode);
         IRBuilder<> Builder(CI);
-        // Append return void, don't need to replace CI with flatCI.
         Builder.CreateCall(flatF, flatArgs);
 
+        // Append returns void, so it's not used by other instructions
+        // and we don't need to replace it with flatCI.
+        // However, we don't want to visit the same append again
+        // when SROA'ing other arguments, as that would be O(n^2)
+        // and we would attempt double-deleting the original call.
+        for (auto& opit : CI->operands())
+          opit.set(UndefValue::get(opit->getType()));
         DeadInsts.push_back(CI);
       } break;
       case IntrinsicOp::IOP_TraceRay: {
@@ -3379,7 +3393,7 @@ bool SROA_Helper::DoScalarReplacement(Value *V, std::vector<Value *> &Elts,
   if (!Ty->isAggregateType())
     return false;
   // Skip matrix types.
-  if (HLMatrixLower::IsMatrixType(Ty))
+  if (dxilutil::IsHLSLMatrixType(Ty))
     return false;
 
   IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(Builder.GetInsertPoint()));
@@ -3426,7 +3440,7 @@ bool SROA_Helper::DoScalarReplacement(Value *V, std::vector<Value *> &Elts,
 
     if (ElTy->isStructTy() &&
         // Skip Matrix type.
-        !HLMatrixLower::IsMatrixType(ElTy)) {
+        !dxilutil::IsHLSLMatrixType(ElTy)) {
       if (!dxilutil::IsHLSLObjectType(ElTy)) {
         // for array of struct
         // split into arrays of struct elements
@@ -3555,7 +3569,7 @@ bool SROA_Helper::DoScalarReplacement(GlobalVariable *GV,
   if (Ty->isSingleValueType() && !Ty->isVectorTy())
     return false;
   // Skip matrix types.
-  if (HLMatrixLower::IsMatrixType(Ty))
+  if (dxilutil::IsHLSLMatrixType(Ty))
     return false;
 
   Module *M = GV->getParent();
@@ -3624,7 +3638,7 @@ bool SROA_Helper::DoScalarReplacement(GlobalVariable *GV,
 
     if (ElTy->isStructTy() &&
         // Skip Matrix type.
-        !HLMatrixLower::IsMatrixType(ElTy)) {
+        !dxilutil::IsHLSLMatrixType(ElTy)) {
       // for array of struct
       // split into arrays of struct elements
       StructType *ElST = cast<StructType>(ElTy);
@@ -4108,6 +4122,9 @@ bool SROA_Helper::LowerMemcpy(Value *V, DxilFieldAnnotation *annotation,
           // For GEP, the ptr could have other GEP read/write.
           // Only scan one GEP is not enough.
           Value *Ptr = GEP->getPointerOperand();
+          while (GEPOperator *NestedGEP = dyn_cast<GEPOperator>(Ptr))
+            Ptr = NestedGEP->getPointerOperand();
+
           if (CallInst *PtrCI = dyn_cast<CallInst>(Ptr)) {
             hlsl::HLOpcodeGroup group =
                 hlsl::GetHLOpcodeGroup(PtrCI->getCalledFunction());
@@ -4185,7 +4202,7 @@ bool SROA_Helper::IsEmptyStructType(Type *Ty, DxilTypeSystem &typeSys) {
     Ty = Ty->getArrayElementType();
 
   if (StructType *ST = dyn_cast<StructType>(Ty)) {
-    if (!HLMatrixLower::IsMatrixType(Ty)) {
+    if (!dxilutil::IsHLSLMatrixType(Ty)) {
       DxilStructAnnotation *SA = typeSys.GetStructAnnotation(ST);
       if (SA && SA->IsEmptyStruct())
         return true;
@@ -4343,7 +4360,7 @@ public:
           continue;
 
         // Check matrix store.
-        if (HLMatrixLower::IsMatrixType(
+        if (dxilutil::IsHLSLMatrixType(
                 GV->getType()->getPointerElementType())) {
           if (CallInst *CI = dyn_cast<CallInst>(user)) {
             if (GetHLOpcodeGroupByName(CI->getCalledFunction()) ==
@@ -4389,14 +4406,13 @@ private:
                   DxilParameterAnnotation &paramAnnotation,
                   std::vector<Value *> &FlatParamList,
                   std::vector<DxilParameterAnnotation> &FlatRetAnnotationList,
-                  IRBuilder<> &Builder, DbgDeclareInst *DDI);
+                  BasicBlock *EntryBlock, DbgDeclareInst *DDI);
   Value *castResourceArgIfRequired(Value *V, Type *Ty, bool bOut,
                                    DxilParamInputQual inputQual,
                                    IRBuilder<> &Builder);
   Value *castArgumentIfRequired(Value *V, Type *Ty, bool bOut,
                                 DxilParamInputQual inputQual,
                                 DxilFieldAnnotation &annotation,
-                                std::deque<Value *> &WorkList,
                                 IRBuilder<> &Builder);
   // Replace use of parameter which changed type when flatten.
   // Also add information to Arg if required.
@@ -4422,6 +4438,25 @@ private:
   std::unordered_set<Value *> castRowMajorParamMap;
   bool m_HasDbgInfo;
 };
+
+// When replacing an aggregate by its scalar elements,
+// the first element will preserve the original semantic,
+// and the subsequent ones will temporarily use this value.
+// We then run a pass to fix the semantics and properly renumber them
+// once the aggregate has been fully expanded.
+// 
+// For example:
+// struct Foo { float a; float b; };
+// void main(Foo foo : TEXCOORD0, float bar : TEXCOORD0)
+//
+// Will be expanded to
+// void main(float a : TEXCOORD0, float b : *, float bar : TEXCOORD0)
+//
+// And then fixed up to
+// void main(float a : TEXCOORD0, float b : TEXCOORD1, float bar : TEXCOORD0)
+//
+// (which will later on fail validation due to duplicate semantics).
+constexpr const char *ContinuedPseudoSemantic = "*";
 }
 
 char SROA_Parameter_HLSL::ID = 0;
@@ -4632,7 +4667,7 @@ static DxilFieldAnnotation &GetEltAnnotation(Type *Ty, unsigned idx, DxilFieldAn
   while (Ty->isArrayTy())
     Ty = Ty->getArrayElementType();
   if (StructType *ST = dyn_cast<StructType>(Ty)) {
-    if (HLMatrixLower::IsMatrixType(Ty))
+    if (dxilutil::IsHLSLMatrixType(Ty))
       return annotation;
     DxilStructAnnotation *SA = dxilTypeSys.GetStructAnnotation(ST);
     if (SA) {
@@ -4700,13 +4735,13 @@ static unsigned AllocateSemanticIndex(
                                             FlatAnnotationList);
     }
     return updatedArgIdx;
-  } else if (Ty->isStructTy() && !HLMatrixLower::IsMatrixType(Ty)) {
+  } else if (Ty->isStructTy() && !dxilutil::IsHLSLMatrixType(Ty)) {
     unsigned fieldsCount = Ty->getStructNumElements();
     for (unsigned i = 0; i < fieldsCount; i++) {
       Type *EltTy = Ty->getStructElementType(i);
       argIdx = AllocateSemanticIndex(EltTy, semIndex, argIdx, endArgIdx,
                                      FlatAnnotationList);
-      if (!(EltTy->isStructTy() && !HLMatrixLower::IsMatrixType(EltTy))) {
+      if (!(EltTy->isStructTy() && !dxilutil::IsHLSLMatrixType(EltTy))) {
         // Update argIdx only when it is a leaf node.
         argIdx++;
       }
@@ -4753,17 +4788,18 @@ void SROA_Parameter_HLSL::allocateSemanticIndex(
     if (semantic.empty())
       continue;
 
-    unsigned semGroupEnd = i + 1;
-    while (semGroupEnd < endArgIndex &&
-           FlatAnnotationList[semGroupEnd].GetSemanticString() == semantic) {
-      ++semGroupEnd;
-    }
-
     StringRef baseSemName; // The 'FOO' in 'FOO1'.
     uint32_t semIndex;     // The '1' in 'FOO1'
     // Split semName and index.
     Semantic::DecomposeNameAndIndex(semantic, &baseSemName, &semIndex);
 
+    unsigned semGroupEnd = i + 1;
+    while (semGroupEnd < endArgIndex &&
+           FlatAnnotationList[semGroupEnd].GetSemanticString() == ContinuedPseudoSemantic) {
+      FlatAnnotationList[semGroupEnd].SetSemanticString(baseSemName);
+      ++semGroupEnd;
+    }
+
     DXASSERT(semanticTypeMap.count(semantic) > 0, "Must has semantic type");
     Type *semanticTy = semanticTypeMap[semantic];
 
@@ -4945,7 +4981,7 @@ CastCopyArrayMultiDimTo1Dim(Value *FromArray, Value *ToArray, Type *CurFromTy,
       Value *Elt = Builder.CreateExtractElement(V, i);
       Builder.CreateStore(Elt, ToPtr);
     }
-  } else if (HLMatrixLower::IsMatrixType(CurFromTy)) {
+  } else if (dxilutil::IsHLSLMatrixType(CurFromTy)) {
     // Copy matrix to array.
     unsigned col, row;
     HLMatrixLower::GetMatrixInfo(CurFromTy, col, row);
@@ -4992,7 +5028,7 @@ CastCopyArray1DimToMultiDim(Value *FromArray, Value *ToArray, Type *CurToTy,
       V = Builder.CreateInsertElement(V, Elt, i);
     }
     Builder.CreateStore(V, ToPtr);
-  } else if (HLMatrixLower::IsMatrixType(CurToTy)) {
+  } else if (dxilutil::IsHLSLMatrixType(CurToTy)) {
     // Copy array to matrix.
     unsigned col, row;
     HLMatrixLower::GetMatrixInfo(CurToTy, col, row);
@@ -5036,7 +5072,7 @@ static void CastCopyOldPtrToNewPtr(Value *OldPtr, Value *NewPtr, HLModule &HLM,
       Value *Elt = Builder.CreateExtractElement(V, i);
       Builder.CreateStore(Elt, EltPtr);
     }
-  } else if (HLMatrixLower::IsMatrixType(OldTy)) {
+  } else if (dxilutil::IsHLSLMatrixType(OldTy)) {
     CopyMatPtrToArrayPtr(OldPtr, NewPtr, /*arrayBaseIdx*/ 0, HLM, Builder,
                          bRowMajor);
   } else if (OldTy->isArrayTy()) {
@@ -5066,7 +5102,7 @@ static void CastCopyNewPtrToOldPtr(Value *NewPtr, Value *OldPtr, HLModule &HLM,
       V = Builder.CreateInsertElement(V, Elt, i);
     }
     Builder.CreateStore(V, OldPtr);
-  } else if (HLMatrixLower::IsMatrixType(OldTy)) {
+  } else if (dxilutil::IsHLSLMatrixType(OldTy)) {
     CopyArrayPtrToMatPtr(NewPtr, /*arrayBaseIdx*/ 0, OldPtr, HLM, Builder,
                          bRowMajor);
   } else if (OldTy->isArrayTy()) {
@@ -5164,7 +5200,7 @@ void SROA_Parameter_HLSL::replaceCastParameter(
     // Must be in param.
     // Store NewParam to OldParam at entry.
     Builder.CreateStore(NewParam, OldParam);
-  } else if (HLMatrixLower::IsMatrixType(OldTy)) {
+  } else if (dxilutil::IsHLSLMatrixType(OldTy)) {
     bool bRowMajor = castRowMajorParamMap.count(NewParam);
     Value *Mat = LoadArrayPtrToMat(NewParam, /*arrayBaseIdx*/ 0, OldTy,
                                    *m_pHLModule, Builder, bRowMajor);
@@ -5243,7 +5279,7 @@ Value *SROA_Parameter_HLSL::castResourceArgIfRequired(
 Value *SROA_Parameter_HLSL::castArgumentIfRequired(
     Value *V, Type *Ty, bool bOut,
     DxilParamInputQual inputQual, DxilFieldAnnotation &annotation,
-    std::deque<Value *> &WorkList, IRBuilder<> &Builder) {
+    IRBuilder<> &Builder) {
   Module &M = *m_pHLModule->GetModule();
   IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(Builder.GetInsertPoint()));
 
@@ -5329,26 +5365,25 @@ Value *SROA_Parameter_HLSL::castArgumentIfRequired(
   return V;
 }
 
+struct AnnotatedValue {
+  llvm::Value *Value; // Value queued on the flattenArgument work list.
+  DxilFieldAnnotation Annotation; // Annotation stored by copy alongside Value.
+};
+
 void SROA_Parameter_HLSL::flattenArgument(
     Function *F, Value *Arg, bool bForParam,
     DxilParameterAnnotation &paramAnnotation,
     std::vector<Value *> &FlatParamList,
     std::vector<DxilParameterAnnotation> &FlatAnnotationList,
-    IRBuilder<> &Builder, DbgDeclareInst *DDI) {
-  IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(Builder.GetInsertPoint()));
-  std::deque<Value *> WorkList;
-  WorkList.push_back(Arg);
+    BasicBlock *EntryBlock, DbgDeclareInst *DDI) {
+  std::deque<AnnotatedValue> WorkList;
+  WorkList.push_back({ Arg, paramAnnotation });
 
   unsigned startArgIndex = FlatAnnotationList.size();
 
-  // Map from value to annotation.
-  std::unordered_map<Value *, DxilFieldAnnotation> annotationMap;
-  annotationMap[Arg] = paramAnnotation;
-
   DxilTypeSystem &dxilTypeSys = m_pHLModule->GetTypeSystem();
 
   const std::string &semantic = paramAnnotation.GetSemanticString();
-  bool bSemOverride = !semantic.empty();
 
   DxilParamInputQual inputQual = paramAnnotation.GetParamInputQual();
   bool bOut = inputQual == DxilParamInputQual::Out ||
@@ -5384,15 +5419,20 @@ void SROA_Parameter_HLSL::flattenArgument(
 
   // Process the worklist
   while (!WorkList.empty()) {
-    Value *V = WorkList.front();
+    AnnotatedValue AV = WorkList.front();
     WorkList.pop_front();
 
     // Do not skip unused parameter.
-
-    DxilFieldAnnotation &annotation = annotationMap[V];
+    Value *V = AV.Value;
+    DxilFieldAnnotation &annotation = AV.Annotation;
     const bool bAllowReplace = !bOut;
     SROA_Helper::LowerMemcpy(V, &annotation, dxilTypeSys, DL, bAllowReplace);
 
+    // Now it is safe to create the IRBuilders.
+    // If we created them before LowerMemcpy, the insertion point instruction might get deleted.
+    IRBuilder<> Builder(dxilutil::FirstNonAllocaInsertionPt(EntryBlock));
+    IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(EntryBlock));
+
     std::vector<Value *> Elts;
 
     // Not flat vector for entry function currently.
@@ -5409,26 +5449,26 @@ void SROA_Parameter_HLSL::flattenArgument(
         continue;
       }
 
-      // Push Elts into workList.
-      // Use rbegin to make sure the order not change.
-      for (auto iter = Elts.rbegin(); iter != Elts.rend(); iter++)
-        WorkList.push_front(*iter);
-
       bool precise = annotation.IsPrecise();
       const std::string &semantic = annotation.GetSemanticString();
       hlsl::InterpolationMode interpMode = annotation.GetInterpolationMode();
-      
-      for (unsigned i=0;i<Elts.size();i++) {
-        Value *Elt = Elts[i];
+
+      // Push Elts into workList from right to left to preserve the order.
+      for (unsigned ri=0;ri<Elts.size();ri++) {
+        unsigned i = Elts.size() - ri - 1;
         DxilFieldAnnotation EltAnnotation = GetEltAnnotation(Ty, i, annotation, dxilTypeSys);
         const std::string &eltSem = EltAnnotation.GetSemanticString();
-
         if (!semantic.empty()) {
           if (!eltSem.empty()) {
-            // TODO: warning for override the semantic in EltAnnotation.
+            // It doesn't look like we can provide source location information from here
+            F->getContext().emitWarning(
+              Twine("semantic '") + eltSem + "' on field overridden by function or enclosing type");
           }
-          // Just save parent semantic here, allocate later.
-          EltAnnotation.SetSemanticString(semantic);
+
+          // Inherit semantic from parent, but only preserve it for the first element.
+          // Subsequent elements are noted with a special value that gets resolved
+          // once the argument is completely flattened.
+          EltAnnotation.SetSemanticString(i == 0 ? semantic : ContinuedPseudoSemantic);
         } else if (!eltSem.empty() &&
                  semanticTypeMap.count(eltSem) == 0) {
           Type *EltTy = dxilutil::GetArrayEltTy(Ty);
@@ -5442,22 +5482,13 @@ void SROA_Parameter_HLSL::flattenArgument(
         if (EltAnnotation.GetInterpolationMode().GetKind() == DXIL::InterpolationMode::Undefined)
           EltAnnotation.SetInterpolationMode(interpMode);
 
-        annotationMap[Elt] = EltAnnotation;
+        WorkList.push_front({ Elts[i], EltAnnotation });
       }
 
-      annotationMap.erase(V);
-
       ++NumReplaced;
       if (Instruction *I = dyn_cast<Instruction>(V))
         deadAllocas.emplace_back(I);
     } else {
-      if (bSemOverride) {
-        if (!annotation.GetSemanticString().empty()) {
-          // TODO: warning for override the semantic in EltAnnotation.
-        }
-        // Just save parent semantic here, allocate later.
-        annotation.SetSemanticString(semantic);
-      }
       Type *Ty = V->getType();
       if (Ty->isPointerTy())
         Ty = Ty->getPointerElementType();
@@ -5494,8 +5525,7 @@ void SROA_Parameter_HLSL::flattenArgument(
           // Add semantic type.
           semanticTypeMap[EltAnnotation.GetSemanticString()] = Ty;
 
-          annotationMap[Elt] = EltAnnotation;
-          WorkList.push_front(Elt);
+          WorkList.push_front({ Elt, EltAnnotation });
           // Copy local target to flattened target.
           std::vector<Value*> idxList(arrayLevel+1);
           idxList[0] = Builder.getInt32(0);
@@ -5529,16 +5559,12 @@ void SROA_Parameter_HLSL::flattenArgument(
             arrayIdxList[idx-1] = 0;
           }
         }
-        // Don't override flattened SV_Target.
-        if (V == Arg) {
-          bSemOverride = false;
-        }
         continue;
       }
 
       // Cast vector/matrix/resource parameter.
       V = castArgumentIfRequired(V, Ty, bOut, inputQual,
-                                 annotation, WorkList, Builder);
+                                  annotation, Builder);
 
       // Cannot SROA, save it to final parameter list.
       FlatParamList.emplace_back(V);
@@ -5583,6 +5609,7 @@ void SROA_Parameter_HLSL::flattenArgument(
         // Create a value as output value.
         Type *outputType = V->getType()->getPointerElementType()->getStructElementType(0);
         Value *outputVal = AllocaBuilder.CreateAlloca(outputType);
+
         // For each stream.Append(data)
         // transform into
         //   d = load data
@@ -5592,42 +5619,46 @@ void SROA_Parameter_HLSL::flattenArgument(
           if (CallInst *CI = dyn_cast<CallInst>(user)) {
             unsigned opcode = GetHLOpcode(CI);
             if (opcode == static_cast<unsigned>(IntrinsicOp::MOP_Append)) {
-              if (CI->getNumArgOperands() == (HLOperandIndex::kStreamAppendDataOpIndex + 1)) {
-                Value *data =
-                    CI->getArgOperand(HLOperandIndex::kStreamAppendDataOpIndex);
-                DXASSERT(data->getType()->isPointerTy(),
-                         "Append value must be pointer.");
+              // At this point, the stream append data argument might or might not have been SROA'd
+              Value *firstDataPtr = CI->getArgOperand(HLOperandIndex::kStreamAppendDataOpIndex);
+              DXASSERT(firstDataPtr->getType()->isPointerTy(), "Append value must be a pointer.");
+              if (firstDataPtr->getType()->getPointerElementType() == outputType) {
+                // The data has not been SROA'd
+                DXASSERT(CI->getNumArgOperands() == (HLOperandIndex::kStreamAppendDataOpIndex + 1),
+                  "Unexpected number of arguments for non-SROA'd StreamOutput.Append");
                 IRBuilder<> Builder(CI);
 
                 llvm::SmallVector<llvm::Value *, 16> idxList;
-                SplitCpy(data->getType(), outputVal, data, idxList, Builder, DL,
-                         dxilTypeSys, &flatParamAnnotation);
+                SplitCpy(firstDataPtr->getType(), outputVal, firstDataPtr, idxList, Builder, DL,
+                          dxilTypeSys, &flatParamAnnotation);
 
                 CI->setArgOperand(HLOperandIndex::kStreamAppendDataOpIndex, outputVal);
               }
               else {
-                // Append has been flattened.
+                // Append has been SROA'd, we might be operating on multiple values
+                // with types differing from the stream output type.
                 // Flatten store outputVal.
                 // Must be struct to be flatten.
                 IRBuilder<> Builder(CI);
 
-                llvm::SmallVector<llvm::Value *, 16> idxList;
+                llvm::SmallVector<llvm::Value *, 16> IdxList;
                 llvm::SmallVector<llvm::Value *, 16> EltPtrList;
+                llvm::SmallVector<const DxilFieldAnnotation*, 16> EltAnnotationList;
                 // split
-                SplitPtr(outputVal->getType(), outputVal, idxList, EltPtrList,
-                         Builder);
+                SplitPtr(outputVal, IdxList, outputVal->getType(), flatParamAnnotation,
+                  EltPtrList, EltAnnotationList, dxilTypeSys, Builder);
 
                 unsigned eltCount = CI->getNumArgOperands()-2;
                 DXASSERT_LOCALVAR(eltCount, eltCount == EltPtrList.size(), "invalid element count");
 
                 for (unsigned i = HLOperandIndex::kStreamAppendDataOpIndex; i < CI->getNumArgOperands(); i++) {
                   Value *DataPtr = CI->getArgOperand(i);
-                  Value *EltPtr =
-                      EltPtrList[i - HLOperandIndex::kStreamAppendDataOpIndex];
+                  Value *EltPtr = EltPtrList[i - HLOperandIndex::kStreamAppendDataOpIndex];
+                  const DxilFieldAnnotation *EltAnnotation = EltAnnotationList[i - HLOperandIndex::kStreamAppendDataOpIndex];
 
-                  llvm::SmallVector<llvm::Value *, 16> idxList;
-                  SplitCpy(DataPtr->getType(), EltPtr, DataPtr, idxList,
-                           Builder, DL, dxilTypeSys, &flatParamAnnotation);
+                  llvm::SmallVector<llvm::Value *, 16> IdxList;
+                  SplitCpy(DataPtr->getType(), EltPtr, DataPtr, IdxList,
+                            Builder, DL, dxilTypeSys, EltAnnotation);
                   CI->setArgOperand(i, EltPtr);
                 }
               }
@@ -5636,7 +5667,7 @@ void SROA_Parameter_HLSL::flattenArgument(
         }
 
         // Then split output value to generate ParamQual.
-        WorkList.push_front(outputVal);
+        WorkList.push_front({ outputVal, annotation });
       }
     }
   }
@@ -5892,7 +5923,7 @@ static void LegalizeDxilInputOutputs(Function *F,
 
     // Skip arg which is not a pointer.
     if (!Ty->isPointerTy()) {
-      if (HLMatrixLower::IsMatrixType(Ty)) {
+      if (dxilutil::IsHLSLMatrixType(Ty)) {
         // Replace matrix arg with cast to vec. It will be lowered in
         // DxilGenerationPass.
         isColMajor = paramAnnotation.GetMatrixAnnotation().Orientation ==
@@ -5920,22 +5951,18 @@ static void LegalizeDxilInputOutputs(Function *F,
     bool bStore = false;
     CheckArgUsage(&arg, bLoad, bStore);
 
-    bool bNeedTemp = false;
     bool bStoreInputToTemp = false;
     bool bLoadOutputFromTemp = false;
 
     if (qual == DxilParamInputQual::In && bStore) {
-      bNeedTemp = true;
       bStoreInputToTemp = true;
     } else if (qual == DxilParamInputQual::Out && bLoad) {
-      bNeedTemp = true;
       bLoadOutputFromTemp = true;
     } else if (bLoad && bStore) {
       switch (qual) {
       case DxilParamInputQual::InputPrimitive:
       case DxilParamInputQual::InputPatch:
       case DxilParamInputQual::OutputPatch: {
-        bNeedTemp = true;
         bStoreInputToTemp = true;
       } break;
       case DxilParamInputQual::Inout:
@@ -5945,13 +5972,11 @@ static void LegalizeDxilInputOutputs(Function *F,
       }
     } else if (qual == DxilParamInputQual::Inout) {
       // Only replace inout when (bLoad && bStore) == false.
-      bNeedTemp = true;
       bLoadOutputFromTemp = true;
       bStoreInputToTemp = true;
     }
 
-    if (HLMatrixLower::IsMatrixType(Ty)) {
-      bNeedTemp = true;
+    if (dxilutil::IsHLSLMatrixType(Ty)) {
       if (qual == DxilParamInputQual::In)
         bStoreInputToTemp = bLoad;
       else if (qual == DxilParamInputQual::Out)
@@ -5962,7 +5987,7 @@ static void LegalizeDxilInputOutputs(Function *F,
       }
     }
 
-    if (bNeedTemp) {
+    if (bStoreInputToTemp || bLoadOutputFromTemp) {
       IRBuilder<> AllocaBuilder(EntryBlk.getFirstInsertionPt());
       IRBuilder<> Builder(dxilutil::FirstNonAllocaInsertionPt(&EntryBlk));
 
@@ -6036,11 +6061,17 @@ void SROA_Parameter_HLSL::createFlattenedFunction(Function *F) {
 
   LLVMContext &Ctx = m_pHLModule->GetCtx();
   std::unique_ptr<BasicBlock> TmpBlockForFuncDecl;
+  BasicBlock *EntryBlock;
   if (F->isDeclaration()) {
+    // We still want to SROA the parameters, so create a dummy
+    // function body block to avoid special cases.
     TmpBlockForFuncDecl.reset(BasicBlock::Create(Ctx));
     // Create return as terminator.
     IRBuilder<> RetBuilder(TmpBlockForFuncDecl.get());
     RetBuilder.CreateRetVoid();
+    EntryBlock = TmpBlockForFuncDecl.get();
+  } else {
+    EntryBlock = &F->getEntryBlock();
   }
 
   std::vector<Value *> FlatParamList;
@@ -6053,13 +6084,6 @@ void SROA_Parameter_HLSL::createFlattenedFunction(Function *F) {
   for (Argument &Arg : F->args()) {
     // merge GEP use for arg.
     HLModule::MergeGepUse(&Arg);
-    // Insert point may be removed. So recreate builder every time.
-    IRBuilder<> Builder(Ctx);
-    if (!F->isDeclaration()) {
-      Builder.SetInsertPoint(dxilutil::FirstNonAllocaInsertionPt(F));
-    } else {
-      Builder.SetInsertPoint(dxilutil::FirstNonAllocaInsertionPt(TmpBlockForFuncDecl.get()));
-    }
 
     unsigned prevFlatParamCount = FlatParamList.size();
 
@@ -6067,7 +6091,7 @@ void SROA_Parameter_HLSL::createFlattenedFunction(Function *F) {
         funcAnnotation->GetParameterAnnotation(Arg.getArgNo());
     DbgDeclareInst *DDI = llvm::FindAllocaDbgDeclare(&Arg);
     flattenArgument(F, &Arg, bForParamTrue, paramAnnotation, FlatParamList,
-                    FlatParamAnnotationList, Builder, DDI);
+                    FlatParamAnnotationList, EntryBlock, DDI);
 
     unsigned newFlatParamCount = FlatParamList.size() - prevFlatParamCount;
     for (unsigned i = 0; i < newFlatParamCount; i++) {
@@ -6081,15 +6105,8 @@ void SROA_Parameter_HLSL::createFlattenedFunction(Function *F) {
   std::vector<DxilParameterAnnotation> FlatRetAnnotationList;
   // Split and change to out parameter.
   if (!retType->isVoidTy()) {
-    IRBuilder<> Builder(Ctx);
-    IRBuilder<> AllocaBuilder(Ctx);
-    if (!F->isDeclaration()) {
-      Builder.SetInsertPoint(dxilutil::FirstNonAllocaInsertionPt(F));
-      AllocaBuilder.SetInsertPoint(dxilutil::FindAllocaInsertionPt(F));
-    } else {
-      Builder.SetInsertPoint(dxilutil::FirstNonAllocaInsertionPt(TmpBlockForFuncDecl.get()));
-      AllocaBuilder.SetInsertPoint(TmpBlockForFuncDecl->getFirstInsertionPt());
-    }
+    IRBuilder<> Builder(dxilutil::FirstNonAllocaInsertionPt(EntryBlock));
+    IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(EntryBlock));
     Value *retValAddr = AllocaBuilder.CreateAlloca(retType);
     DxilParameterAnnotation &retAnnotation =
         funcAnnotation->GetRetTypeAnnotation();
@@ -6143,7 +6160,7 @@ void SROA_Parameter_HLSL::createFlattenedFunction(Function *F) {
     DbgDeclareInst *DDI = llvm::FindAllocaDbgDeclare(retValAddr);
     flattenArgument(F, retValAddr, bForParamTrue,
                     funcAnnotation->GetRetTypeAnnotation(), FlatRetList,
-                    FlatRetAnnotationList, Builder, DDI);
+                    FlatRetAnnotationList, EntryBlock, DDI);
 
     const int kRetArgNo = -1;
     for (unsigned i = 0; i < FlatRetList.size(); i++) {
@@ -6341,6 +6358,8 @@ void SROA_Parameter_HLSL::createFlattenedFunction(Function *F) {
       }
 
       flatArg->replaceAllUsesWith(Arg);
+      if (isa<Instruction>(flatArg))
+        DeadInsts.emplace_back(flatArg);
 
       HLModule::MergeGepUse(Arg);
       // Flatten store of array parameter.
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 95e4e1185..0ab1fe70f 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -548,7 +548,10 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {
 /// in place of the other. Note that we will always choose the non-undef
 /// value to keep.
 static bool CanMergeValues(Value *First, Value *Second) {
-  return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second);
+  return First == Second;
+   // HLSL Change Begin - Do not merge undef; only identical values can merge.
+   //  || isa<UndefValue>(First) || isa<UndefValue>(Second);
+   // HLSL Change End.
 }
 
 /// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
diff --git a/tools/clang/include/clang/AST/HlslTypes.h b/tools/clang/include/clang/AST/HlslTypes.h
index fac24a2a2..8f3d5b50a 100644
--- a/tools/clang/include/clang/AST/HlslTypes.h
+++ b/tools/clang/include/clang/AST/HlslTypes.h
@@ -371,8 +371,10 @@ bool IsHLSLVecMatType(clang::QualType);
 bool IsHLSLVecType(clang::QualType type);
 bool IsHLSLMatType(clang::QualType type);
 clang::QualType GetElementTypeOrType(clang::QualType type);
-bool HasHLSLMatOrientation(clang::QualType type, bool *pIsRowMajor);
-bool HasHLSLUNormSNorm(clang::QualType type, bool *pIsSNorm);
+bool HasHLSLMatOrientation(clang::QualType type, bool *pIsRowMajor = nullptr);
+bool IsHLSLMatRowMajor(clang::QualType type, bool defaultValue);
+bool IsHLSLUnsigned(clang::QualType type);
+bool HasHLSLUNormSNorm(clang::QualType type, bool *pIsSNorm = nullptr);
 bool IsHLSLInputPatchType(clang::QualType type);
 bool IsHLSLOutputPatchType(clang::QualType type);
 bool IsHLSLPointStreamType(clang::QualType type);
@@ -382,6 +384,7 @@ bool IsHLSLStreamOutputType(clang::QualType type);
 bool IsHLSLResourceType(clang::QualType type);
 bool IsHLSLNumeric(clang::QualType type);
 bool IsHLSLNumericUserDefinedType(clang::QualType type);
+bool IsHLSLAggregateType(clang::ASTContext& context, clang::QualType type);
 clang::QualType GetHLSLResourceResultType(clang::QualType type);
 bool IsIncompleteHLSLResourceArrayType(clang::ASTContext& context, clang::QualType type);
 clang::QualType GetHLSLInputPatchElementType(clang::QualType type);
diff --git a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b9116755a..542501d3c 100644
--- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7670,8 +7670,8 @@ def err_hlsl_intrinsic_template_arg_unsupported: Error<
    "Explicit template arguments on intrinsic %0 are not supported.">;
 def err_hlsl_intrinsic_template_arg_requires_2018: Error<
    "Explicit template arguments on intrinsic %0 requires HLSL version 2018 or above.">;
-def err_hlsl_intrinsic_template_arg_scalar_vector_16: Error<
-   "Explicit template arguments on intrinsic %0 are limited one to scalar or vector type up to 16 bytes in size.">;
+def err_hlsl_intrinsic_template_arg_scalar_vector: Error<
+   "Explicit template arguments on intrinsic %0 are limited one to scalar or vector type.">;
 }
 def err_hlsl_no_struct_user_defined_type: Error<
    "User defined type intrinsic arg must be struct">;
diff --git a/tools/clang/include/clang/Frontend/CodeGenOptions.h b/tools/clang/include/clang/Frontend/CodeGenOptions.h
index c9b50d52b..7a97c6bbf 100644
--- a/tools/clang/include/clang/Frontend/CodeGenOptions.h
+++ b/tools/clang/include/clang/Frontend/CodeGenOptions.h
@@ -212,6 +212,8 @@ public:
   /// DefaultLinkage Internal, External, or Default.  If Default, default
   /// function linkage is determined by library target.
   hlsl::DXIL::DefaultLinkage DefaultLinkage = hlsl::DXIL::DefaultLinkage::Default;
+  /// Assume UAVs/SRVs may alias.
+  bool HLSLResMayAlias = false;
   // HLSL Change Ends
 
   // SPIRV Change Starts
diff --git a/tools/clang/include/clang/SPIRV/ModuleBuilder.h b/tools/clang/include/clang/SPIRV/ModuleBuilder.h
index 0e19a4c74..636827dc9 100644
--- a/tools/clang/include/clang/SPIRV/ModuleBuilder.h
+++ b/tools/clang/include/clang/SPIRV/ModuleBuilder.h
@@ -431,6 +431,9 @@ public:
   /// \brief Decorates the given target <result-id> with nonuniformEXT
   void decorateNonUniformEXT(uint32_t targetId);
 
+  /// \brief Decorates the given target <result-id> with NoContraction
+  void decorateNoContraction(uint32_t targetId);
+
   // === Type ===
 
   uint32_t getVoidType();
diff --git a/tools/clang/include/clang/Sema/DeclSpec.h b/tools/clang/include/clang/Sema/DeclSpec.h
index 55a1bbeec..8883f48fa 100644
--- a/tools/clang/include/clang/Sema/DeclSpec.h
+++ b/tools/clang/include/clang/Sema/DeclSpec.h
@@ -347,6 +347,15 @@ private:
   /*TSCS*/unsigned ThreadStorageClassSpec : 2;
   unsigned SCS_extern_in_linkage_spec : 1;
 
+  // HLSL Change Start
+  // Whether the default matrix pack is defined at the point
+  // of the declaration. This is false when rewriting
+  // and no #pragma pack_matrix has been encountered yet.
+  unsigned HasDefaultMatrixPack : 1;
+  // Default matrix pack at the point of the declaration
+  unsigned DefaultMatrixPackRowMajor : 1;
+  // HLSL Change End
+
   // type-specifier
   /*TSW*/unsigned TypeSpecWidth : 2;
   /*TSC*/unsigned TypeSpecComplex : 2;
@@ -431,6 +440,8 @@ public:
     : StorageClassSpec(SCS_unspecified),
       ThreadStorageClassSpec(TSCS_unspecified),
       SCS_extern_in_linkage_spec(false),
+      HasDefaultMatrixPack(false), // HLSL Change
+      DefaultMatrixPackRowMajor(false), // HLSL Change
       TypeSpecWidth(TSW_unspecified),
       TypeSpecComplex(TSC_unspecified),
       TypeSpecSign(TSS_unspecified),
@@ -463,6 +474,19 @@ public:
     SCS_extern_in_linkage_spec = Value;
   }
 
+  // HLSL changes begin
+  bool TryGetDefaultMatrixPackRowMajor(bool& rowMajor) const {
+    // rowMajor is written only when a default pack was recorded; the flag is the result.
+    if (HasDefaultMatrixPack) rowMajor = DefaultMatrixPackRowMajor;
+    return HasDefaultMatrixPack;
+  }
+
+  void SetDefaultMatrixPackRowMajor(bool Value) {
+    DefaultMatrixPackRowMajor = Value; // Row-major when Value is true.
+    HasDefaultMatrixPack = true;       // From now on TryGet... reports a value.
+  }
+  // HLSL changes end
+
   SourceLocation getStorageClassSpecLoc() const { return StorageClassSpecLoc; }
   SourceLocation getThreadStorageClassSpecLoc() const {
     return ThreadStorageClassSpecLoc;
diff --git a/tools/clang/include/clang/Sema/Sema.h b/tools/clang/include/clang/Sema/Sema.h
index 7e0261ded..2ba3d0b72 100644
--- a/tools/clang/include/clang/Sema/Sema.h
+++ b/tools/clang/include/clang/Sema/Sema.h
@@ -333,10 +333,12 @@ public:
   LangOptions::PragmaMSPointersToMembersKind
       MSPointerToMemberRepresentationMethod;
 
-  // HLSL Change Begin - pragma pack_matrix.
-  // Add both row/col to identify the default case which no pragma.
-  bool PackMatrixRowMajorPragmaOn = false; // True when \#pragma pack_matrix(row_major) on.
-  bool PackMatrixColMajorPragmaOn = false; // True when \#pragma pack_matrix(column_major) on.
+  // HLSL Change Begin
+  // The HLSL rewriter doesn't define a default matrix pack,
+  // so we must preserve the lack of annotations to avoid changing semantics.
+  bool HasDefaultMatrixPack = false;
+  // Uses of #pragma pack_matrix change the default pack.
+  bool DefaultMatrixPackRowMajor = false;
   // HLSL Change End.
 
   enum PragmaVtorDispKind {
diff --git a/tools/clang/lib/AST/ASTContext.cpp b/tools/clang/lib/AST/ASTContext.cpp
index a695e9da5..af88bfa50 100644
--- a/tools/clang/lib/AST/ASTContext.cpp
+++ b/tools/clang/lib/AST/ASTContext.cpp
@@ -1568,7 +1568,6 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     // Vector align to its element.
     if (getLangOpts().HLSL) {
       Align = EltInfo.Align;
-      Width = Align * VT->getNumElements();
     }
     // HLSL Change Ends.
     // If the alignment is not a power of 2, round up to the next power of 2.
diff --git a/tools/clang/lib/AST/ExprConstant.cpp b/tools/clang/lib/AST/ExprConstant.cpp
index 1be578887..c78cdb582 100644
--- a/tools/clang/lib/AST/ExprConstant.cpp
+++ b/tools/clang/lib/AST/ExprConstant.cpp
@@ -7670,6 +7670,7 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
   case CK_CopyAndAutoreleaseBlockObject:
   case CK_HLSLVectorToScalarCast:   // HLSL Change
   case CK_HLSLMatrixToScalarCast:   // HLSL Change
+  case CK_FlatConversion: // HLSL Change
     return Error(E);
 
   case CK_UserDefinedConversion:
diff --git a/tools/clang/lib/AST/HlslTypes.cpp b/tools/clang/lib/AST/HlslTypes.cpp
index bfd2e2644..f38173410 100644
--- a/tools/clang/lib/AST/HlslTypes.cpp
+++ b/tools/clang/lib/AST/HlslTypes.cpp
@@ -125,6 +125,15 @@ bool IsHLSLNumericUserDefinedType(clang::QualType type) {
   return false;
 }
 
+bool IsHLSLAggregateType(clang::ASTContext& context, clang::QualType type) {
+  // Aggregate types are arrays and user-defined structs (not vectors, matrices,
+  // resources or other template specializations). getAs<> sees through typedefs.
+  if (context.getAsArrayType(type) != nullptr) return true;
+  const RecordType *Record = type->getAs<RecordType>();
+  return Record != nullptr && !IsHLSLVecMatType(type) && !IsHLSLResourceType(type)
+    && !isa<ClassTemplateSpecializationDecl>(Record->getDecl());
+}
+
 clang::QualType GetElementTypeOrType(clang::QualType type) {
   if (const RecordType *RT = type->getAs<RecordType>()) {
     if (const ClassTemplateSpecializationDecl *templateDecl =
@@ -160,6 +169,26 @@ bool HasHLSLMatOrientation(clang::QualType type, bool *pIsRowMajor) {
   return false;
 }
 
+bool IsHLSLMatRowMajor(clang::QualType type, bool defaultValue) {
+  bool result = defaultValue;
+  HasHLSLMatOrientation(type, &result);
+  return result;
+}
+
+bool IsHLSLUnsigned(clang::QualType type) {
+  if (type->getAs<clang::BuiltinType>() == nullptr) {
+    type = type.getCanonicalType().getNonReferenceType();
+
+    if (IsHLSLVecMatType(type))
+      type = GetElementTypeOrType(type);
+
+    if (type->isExtVectorType())
+      type = type->getAs<clang::ExtVectorType>()->getElementType();
+  }
+
+  return type->isUnsignedIntegerType();
+}
+
 bool HasHLSLUNormSNorm(clang::QualType type, bool *pIsSNorm) {
   // snorm/unorm can be on outer vector/matrix as well as element type
   // in the template form.  Outer-most type attribute wins.
diff --git a/tools/clang/lib/Basic/DiagnosticIDs.cpp b/tools/clang/lib/Basic/DiagnosticIDs.cpp
index ad7f52682..66fae253c 100644
--- a/tools/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/tools/clang/lib/Basic/DiagnosticIDs.cpp
@@ -278,7 +278,12 @@ namespace clang {
 
       unsigned getOrCreateDiagID(DiagnosticIDs::Level L, StringRef Message,
                                  DiagnosticIDs &Diags) {
-        DiagDesc D(L, Message);
+        // HLSL Change Starts
+        // ".str()" is a workaround for a bug in VC++'s STL where std::pair<T,U>::pair<T2,U2>(T2&&,U2&&)
+        // may cause a conversion operator from U2 to U to be invoked within a noexcept function.
+        // This would cause a call to std::terminate if we ran out of memory and threw std::bad_alloc.
+        DiagDesc D(L, Message.str());
+        // HLSL Change Ends
         // Check to see if it already exists.
         std::map<DiagDesc, unsigned>::iterator I = DiagIDs.lower_bound(D);
         if (I != DiagIDs.end() && I->first == D)
diff --git a/tools/clang/lib/Basic/Targets.cpp b/tools/clang/lib/Basic/Targets.cpp
index fb9af0323..71e08a4e7 100644
--- a/tools/clang/lib/Basic/Targets.cpp
+++ b/tools/clang/lib/Basic/Targets.cpp
@@ -6991,9 +6991,7 @@ public:
     LongWidth = LongAlign = 32;
     LongDoubleWidth = LongDoubleAlign = 64;
     LongDoubleFormat = &llvm::APFloat::IEEEdouble;
-    BoolWidth = 32;
-    // To avoid member for alignment.
-    BoolAlign = 8;
+    BoolWidth = BoolAlign = 32;
 
     // using the Microsoft ABI.
     TheCXXABI.set(TargetCXXABI::Microsoft);
diff --git a/tools/clang/lib/CodeGen/BackendUtil.cpp b/tools/clang/lib/CodeGen/BackendUtil.cpp
index 204faa576..7ffd1975a 100644
--- a/tools/clang/lib/CodeGen/BackendUtil.cpp
+++ b/tools/clang/lib/CodeGen/BackendUtil.cpp
@@ -324,6 +324,7 @@ void EmitAssemblyHelper::CreatePasses() {
   PMBuilder.LoopVectorize = CodeGenOpts.VectorizeLoop;
   PMBuilder.HLSLHighLevel = CodeGenOpts.HLSLHighLevel; // HLSL Change
   PMBuilder.HLSLExtensionsCodeGen = CodeGenOpts.HLSLExtensionsCodegen.get(); // HLSL Change
+  PMBuilder.HLSLResMayAlias = CodeGenOpts.HLSLResMayAlias; // HLSL Change
 
   PMBuilder.DisableUnitAtATime = !CodeGenOpts.UnitAtATime;
   PMBuilder.DisableUnrollLoops = !CodeGenOpts.UnrollLoops;
diff --git a/tools/clang/lib/CodeGen/CGDecl.cpp b/tools/clang/lib/CodeGen/CGDecl.cpp
index 9867f6c37..e9096be80 100644
--- a/tools/clang/lib/CodeGen/CGDecl.cpp
+++ b/tools/clang/lib/CodeGen/CGDecl.cpp
@@ -1865,11 +1865,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, llvm::Value *Arg,
   if (CGDebugInfo *DI = getDebugInfo()) {
     if (CGM.getCodeGenOpts().getDebugInfo()
           >= CodeGenOptions::LimitedDebugInfo) {
-      // HLSL Change Begins.
-      // Use the Arg directly instead of DeclPtr for HLSL.
-      // The DeclPtr will be promoted in later pass.
-      DI->EmitDeclareOfArgVariable(&D, Arg, ArgNo, Builder);
-      // HLSL Change Ends.
+      DI->EmitDeclareOfArgVariable(&D, DeclPtr, ArgNo, Builder);
     }
   }
 
diff --git a/tools/clang/lib/CodeGen/CGExpr.cpp b/tools/clang/lib/CodeGen/CGExpr.cpp
index f23aa37f0..5b612bb0e 100644
--- a/tools/clang/lib/CodeGen/CGExpr.cpp
+++ b/tools/clang/lib/CodeGen/CGExpr.cpp
@@ -1082,6 +1082,14 @@ static bool hasBooleanRepresentation(QualType Ty) {
   return false;
 }
 
+// HLSL Change Begin.
+static bool hasBooleanScalarOrVectorRepresentation(QualType Ty) {
+  if (hlsl::IsHLSLVecType(Ty))
+    return hasBooleanRepresentation(hlsl::GetElementTypeOrType(Ty));
+  return hasBooleanRepresentation(Ty);
+}
+// HLSL Change End.
+
 static bool getRangeForType(CodeGenFunction &CGF, QualType Ty,
                             llvm::APInt &Min, llvm::APInt &End,
                             bool StrictEnums) {
@@ -1233,30 +1241,26 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(llvm::Value *Addr, bool Volatile,
 }
 
 llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) {
-  // Bool has a different representation in memory than in registers.
-  if (hasBooleanRepresentation(Ty)) {
-    // This should really always be an i1, but sometimes it's already
-    // an i8, and it's awkward to track those cases down.
-    if (Value->getType()->isIntegerTy(1))
+  // HLSL Change Begin.
+  // Bool scalars and vectors have a different representation in memory than in registers.
+  if (hasBooleanScalarOrVectorRepresentation(Ty)) {
+    if (Value->getType()->getScalarType()->isIntegerTy(1))
       return Builder.CreateZExt(Value, ConvertTypeForMem(Ty), "frombool");
-    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
-           "wrong value rep of bool");
   }
+  // HLSL Change End.
 
   return Value;
 }
 
 llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) {
-  // Bool has a different representation in memory than in registers.
-  if (hasBooleanRepresentation(Ty)) {
-    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
-           "wrong value rep of bool");
-    // HLSL Change Begin.
+  // HLSL Change Begin.
+  // Bool scalars and vectors have a different representation in memory than in registers.
+  if (hasBooleanScalarOrVectorRepresentation(Ty)) {
     // Use ne v, 0 to convert to i1 instead of trunc.
     return Builder.CreateICmpNE(
-        Value, llvm::ConstantInt::get(Value->getType(), 0), "tobool");
-    // HLSL Change End.
+        Value, llvm::ConstantVector::getNullValue(Value->getType()), "tobool");
   }
+  // HLSL Change End.
 
   return Value;
 }
@@ -1392,6 +1396,20 @@ RValue CodeGenFunction::EmitLoadOfLValue(LValue LV, SourceLocation Loc) {
         return RValue::get(V);
       }
     }
+
+    if (hlsl::IsHLSLAggregateType(getContext(), LV.getType())) {
+      // We cannot load the value because we don't expect to ever have
+      // user-defined struct or array-typed llvm registers, only pointers to them.
+      // To preserve the snapshot semantics of LValue loads, we copy the
+      // value to a temporary and return a pointer to it.
+      llvm::Value *Alloca = CreateMemTemp(LV.getType(), "rval");
+      auto CharSizeAlignPair = getContext().getTypeInfoInChars(LV.getType());
+      Builder.CreateMemCpy(Alloca, LV.getAddress(),
+        static_cast<uint64_t>(CharSizeAlignPair.first.getQuantity()),
+        static_cast<unsigned>(CharSizeAlignPair.second.getQuantity()));
+
+      return RValue::get(Alloca);
+    }
     // HLSL Change End.
 
     // Everything needs a load.
@@ -1475,6 +1493,8 @@ RValue CodeGenFunction::EmitLoadOfExtVectorElementLValue(LValue LV) {
   Load->setAlignment(LV.getAlignment().getQuantity());
   llvm::Value *Vec = Load;
 
+  Vec = EmitFromMemory(Vec, LV.getType()); // HLSL Change
+
   const llvm::Constant *Elts = LV.getExtVectorElts();
 
   // If the result of the expression is a non-vector type, we must be extracting
@@ -1748,7 +1768,10 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src,
   const llvm::Constant *Elts = Dst.getExtVectorElts();
 
   llvm::Value *SrcVal = Src.getScalarVal();
+
   // HLSL Change Starts
+  SrcVal = EmitToMemory(SrcVal, Dst.getType());
+
   const VectorType *VTy = Dst.getType()->getAs<VectorType>();
   if (VTy == nullptr && getContext().getLangOpts().HLSL)
     VTy =
@@ -2918,11 +2941,12 @@ CodeGenFunction::EmitHLSLVectorElementExpr(const HLSLVectorElementExpr *E) {
     assert(hlsl::IsHLSLVecType(E->getBase()->getType()) &&
            "Result must be a vector");
     llvm::Value *Vec = EmitScalarExpr(E->getBase());
+    Vec = EmitToMemory(Vec, E->getBase()->getType());
 
     // Store the vector to memory (because LValue wants an address).
-    llvm::Value *VecMem = CreateMemTemp(E->getBase()->getType());
-    Builder.CreateStore(Vec, VecMem);
-    Base = MakeAddrLValue(VecMem, E->getBase()->getType());
+    llvm::Value *VecMemPtr = CreateMemTemp(E->getBase()->getType());
+    Builder.CreateStore(Vec, VecMemPtr);
+    Base = MakeAddrLValue(VecMemPtr, E->getBase()->getType());
   }
 
   QualType type =
diff --git a/tools/clang/lib/CodeGen/CGExprAgg.cpp b/tools/clang/lib/CodeGen/CGExprAgg.cpp
index 61ddc898a..c987fbc8f 100644
--- a/tools/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/tools/clang/lib/CodeGen/CGExprAgg.cpp
@@ -714,12 +714,12 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
 
     if (IntegerLiteral *IL = dyn_cast<IntegerLiteral>(E->getSubExpr())) {
       llvm::Value *SrcVal = llvm::ConstantInt::get(CGF.getLLVMContext(), IL->getValue());
-      CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionToAggregate(
+      CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversion(
           CGF, SrcVal, DestPtr, E->getType(), Ty);
     } else if (FloatingLiteral *FL =
                    dyn_cast<FloatingLiteral>(E->getSubExpr())) {
       llvm::Value *SrcVal = llvm::ConstantFP::get(CGF.getLLVMContext(), FL->getValue());
-      CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionToAggregate(
+      CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversion(
           CGF, SrcVal, DestPtr, E->getType(), Ty);
     } else {
       Expr *Src = E->getSubExpr();
@@ -744,7 +744,7 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
       } break;
       case TEK_Scalar: {
         llvm::Value *SrcVal = CGF.EmitScalarExpr(Src);
-        CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionToAggregate(
+        CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversion(
           CGF, SrcVal, DestPtr, E->getType(), Ty);
       } break;
       default:
diff --git a/tools/clang/lib/CodeGen/CGExprConstant.cpp b/tools/clang/lib/CodeGen/CGExprConstant.cpp
index 512707b1d..bbf110452 100644
--- a/tools/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/tools/clang/lib/CodeGen/CGExprConstant.cpp
@@ -1232,7 +1232,7 @@ llvm::Constant *CodeGenModule::EmitConstantInit(const VarDecl &D,
   assert(E && "No initializer to emit");
 
   llvm::Constant* C = ConstExprEmitter(*this, CGF).Visit(const_cast<Expr*>(E));
-  if (C && C->getType()->isIntegerTy(1)) {
+  if (C && C->getType()->getScalarType()->isIntegerTy(1)) { // HLSL Change
     llvm::Type *BoolTy = getTypes().ConvertTypeForMem(E->getType());
     C = llvm::ConstantExpr::getZExt(C, BoolTy);
   }
@@ -1257,7 +1257,7 @@ llvm::Constant *CodeGenModule::EmitConstantExpr(const Expr *E,
   else
     C = ConstExprEmitter(*this, CGF).Visit(const_cast<Expr*>(E));
 
-  if (C && C->getType()->isIntegerTy(1)) {
+  if (C && C->getType()->getScalarType()->isIntegerTy(1)) { // HLSL Change
     llvm::Type *BoolTy = getTypes().ConvertTypeForMem(E->getType());
     C = llvm::ConstantExpr::getZExt(C, BoolTy);
   }
@@ -1472,7 +1472,7 @@ CodeGenModule::EmitConstantValueForMemory(const APValue &Value,
                                           QualType DestType,
                                           CodeGenFunction *CGF) {
   llvm::Constant *C = EmitConstantValue(Value, DestType, CGF);
-  if (C->getType()->isIntegerTy(1)) {
+  if (C->getType()->getScalarType()->isIntegerTy(1)) { // HLSL Change
     llvm::Type *BoolTy = getTypes().ConvertTypeForMem(DestType);
     C = llvm::ConstantExpr::getZExt(C, BoolTy);
   }
diff --git a/tools/clang/lib/CodeGen/CGExprScalar.cpp b/tools/clang/lib/CodeGen/CGExprScalar.cpp
index 27025432f..f14ee177c 100644
--- a/tools/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/tools/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1507,7 +1507,8 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
   QualType DestTy = CE->getType();
   CastKind Kind = CE->getCastKind();
   // HLSL Change Begins
-  if (hlsl::IsHLSLMatType(E->getType()) || hlsl::IsHLSLMatType(CE->getType())) {
+  if ((hlsl::IsHLSLMatType(E->getType()) || hlsl::IsHLSLMatType(CE->getType()))
+    && Kind != CastKind::CK_FlatConversion) {
     llvm::Value *V = CGF.EmitScalarExpr(E);
     llvm::Type *RetTy = CGF.ConvertType(DestTy);
 
@@ -1817,9 +1818,37 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     return Builder.CreateExtractElement(Visit(E), (uint64_t)0);
   }
   case CK_FlatConversion: {
-      llvm::Value *val = Visit(E);
-      llvm::Value *elem = Builder.CreateExtractValue(val, (uint64_t)0);
-      return EmitScalarConversion(elem, E->getType(), DestTy);
+    llvm::Value *Src = Visit(E);
+
+    // We should have an aggregate type (struct or array) on one side,
+    // and a numeric type (scalar, vector or matrix) on the other.
+    // If the aggregate type is the cast source, it should be a pointer.
+    // Aggregate to aggregate casts are handled in CGExprAgg.cpp
+    auto areCompoundAndNumeric = [this](QualType lhs, QualType rhs) {
+      return hlsl::IsHLSLAggregateType(CGF.getContext(), lhs)
+        && (rhs->isBuiltinType() || hlsl::IsHLSLVecMatType(rhs));
+    };
+    assert(Src->getType()->isPointerTy()
+      ? areCompoundAndNumeric(E->getType(), DestTy)
+      : areCompoundAndNumeric(DestTy, E->getType()));
+    (void)areCompoundAndNumeric;
+
+    llvm::Value *DstPtr = CGF.CreateMemTemp(DestTy, "flatconv");
+    CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversion(
+      CGF, Src, DstPtr, DestTy, E->getType());
+    
+    // Return an rvalue
+    // Matrices must be loaded with the special function
+    if (hlsl::IsHLSLMatType(DestTy))
+      return CGF.CGM.getHLSLRuntime().EmitHLSLMatrixLoad(CGF, DstPtr, DestTy);
+    
+    // Structs/arrays are pointers to temporaries
+    if (hlsl::IsHLSLAggregateType(CGF.getContext(), DestTy))
+      return DstPtr;
+    
+    // Scalars/vectors are loaded regularly
+    llvm::Value *Result = Builder.CreateLoad(DstPtr);
+    return Result = CGF.EmitFromMemory(Result, DestTy);
   }
   case CK_HLSLCC_IntegralToBoolean:
     return EmitIntToBoolConversion(Visit(E));
diff --git a/tools/clang/lib/CodeGen/CGHLSLMS.cpp b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
index 72a67e8d1..4250d3968 100644
--- a/tools/clang/lib/CodeGen/CGHLSLMS.cpp
+++ b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
@@ -180,13 +180,12 @@ private:
                                     clang::QualType Type, llvm::Type *Ty,
                                     SmallVector<Value *, 4> &GepList,
                                     SmallVector<QualType, 4> &EltTyList);
-  void LoadFlattenedGepList(CodeGenFunction &CGF, ArrayRef<Value *> GepList,
-                            ArrayRef<QualType> EltTyList,
-                            SmallVector<Value *, 4> &EltList);
-  void StoreFlattenedGepList(CodeGenFunction &CGF, ArrayRef<Value *> GepList,
-                             ArrayRef<QualType> GepTyList,
-                             ArrayRef<Value *> EltValList,
-                             ArrayRef<QualType> SrcTyList);
+  void LoadElements(CodeGenFunction &CGF,
+    ArrayRef<Value *> Ptrs, ArrayRef<QualType> QualTys,
+    SmallVector<Value *, 4> &Vals);
+  void ConvertAndStoreElements(CodeGenFunction &CGF,
+    ArrayRef<Value *> SrcVals, ArrayRef<QualType> SrcQualTys,
+    ArrayRef<Value *> DstPtrs, ArrayRef<QualType> DstQualTys);
 
   void EmitHLSLAggregateCopy(CodeGenFunction &CGF, llvm::Value *SrcPtr,
                                    llvm::Value *DestPtr,
@@ -195,11 +194,11 @@ private:
                                    clang::QualType DestType,
                                    llvm::Type *Ty);
 
-  void EmitHLSLFlatConversionToAggregate(CodeGenFunction &CGF, Value *SrcVal,
-                                         llvm::Value *DestPtr,
-                                         SmallVector<Value *, 4> &idxList,
-                                         QualType Type, QualType SrcType,
-                                         llvm::Type *Ty);
+  void EmitHLSLFlatConversion(CodeGenFunction &CGF, Value *SrcVal,
+                              llvm::Value *DestPtr,
+                              SmallVector<Value *, 4> &idxList,
+                              QualType Type, QualType SrcType,
+                              llvm::Type *Ty);
 
   void EmitHLSLRootSignature(CodeGenFunction &CGF, HLSLRootSignatureAttr *RSA,
                              llvm::Function *Fn) override;
@@ -294,10 +293,10 @@ public:
                                    llvm::Value *DestPtr,
                                    clang::QualType Ty) override;
 
-  void EmitHLSLFlatConversionToAggregate(CodeGenFunction &CGF, Value *Val,
-                                         Value *DestPtr,
-                                         QualType Ty,
-                                         QualType SrcTy) override;
+  void EmitHLSLFlatConversion(CodeGenFunction &CGF, Value *Val,
+                              Value *DestPtr,
+                              QualType Ty,
+                              QualType SrcTy) override;
   Value *EmitHLSLLiteralCast(CodeGenFunction &CGF, Value *Src, QualType SrcType,
                              QualType DstType) override;
 
@@ -587,7 +586,7 @@ static unsigned AlignBaseOffset(unsigned baseOffset, unsigned size,
   unsigned scalarSizeInBytes = 4;
   const clang::BuiltinType *BT = Ty->getAs<clang::BuiltinType>();
   if (hlsl::IsHLSLVecMatType(Ty)) {
-    BT = CGHLSLRuntime::GetHLSLVecMatElementType(Ty)->getAs<clang::BuiltinType>();
+    BT = hlsl::GetElementTypeOrType(Ty)->getAs<clang::BuiltinType>();
   }
   if (BT) {
     if (BT->getKind() == clang::BuiltinType::Kind::Double ||
@@ -1107,11 +1106,27 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
   // Add hlsl intrinsic attr
   unsigned intrinsicOpcode;
   StringRef intrinsicGroup;
+  llvm::FunctionType *FT = F->getFunctionType();
+
+  auto AddResourceMetadata = [&](QualType qTy, llvm::Type *Ty) {
+    hlsl::DxilResourceBase::Class resClass = TypeToClass(qTy);
+    if (resClass != hlsl::DxilResourceBase::Class::Invalid) {
+      if (!resMetadataMap.count(Ty)) {
+        MDNode *Meta = GetOrAddResTypeMD(qTy);
+        DXASSERT(Meta, "else invalid resource type");
+        resMetadataMap[Ty] = Meta;
+      }
+    }
+  };
+
   if (hlsl::GetIntrinsicOp(FD, intrinsicOpcode, intrinsicGroup)) {
     AddHLSLIntrinsicOpcodeToFunction(F, intrinsicOpcode);
     F->addFnAttr(hlsl::HLPrefix, intrinsicGroup);
+    unsigned iParamOffset = 0; // offset to skip the implicit 'this' parameter on the llvm function (1 for methods)
+
     // Save resource type annotation.
     if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD)) {
+      iParamOffset = 1;
       const CXXRecordDecl *RD = MD->getParent();
       // For nested case like sample_slice_type.
       if (const CXXRecordDecl *PRD =
@@ -1120,43 +1135,20 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
       }
 
       QualType recordTy = MD->getASTContext().getRecordType(RD);
-      hlsl::DxilResourceBase::Class resClass = TypeToClass(recordTy);
       llvm::Type *Ty = CGM.getTypes().ConvertType(recordTy);
-      llvm::FunctionType *FT = F->getFunctionType();
-      // Save resource type metadata.
-      switch (resClass) {
-      case DXIL::ResourceClass::UAV: {
-        MDNode *MD = GetOrAddResTypeMD(recordTy);
-        DXASSERT(MD, "else invalid resource type");
-        resMetadataMap[Ty] = MD;
-      } break;
-      case DXIL::ResourceClass::SRV: {
-        MDNode *Meta = GetOrAddResTypeMD(recordTy);
-        DXASSERT(Meta, "else invalid resource type");
-        resMetadataMap[Ty] = Meta;
-        if (FT->getNumParams() > 1) {
-          QualType paramTy = MD->getParamDecl(0)->getType();
-          // Add sampler type.
-          if (TypeToClass(paramTy) == DXIL::ResourceClass::Sampler) {
-            llvm::Type *Ty = FT->getParamType(1)->getPointerElementType();
-            MDNode *MD = GetOrAddResTypeMD(paramTy);
-            DXASSERT(MD, "else invalid resource type");
-            resMetadataMap[Ty] = MD;
-          }
-        }
-      } break;
-      default:
-        // Skip OutputStream for GS.
-        break;
-      }
+      AddResourceMetadata(recordTy, Ty);
     }
-    if (intrinsicOpcode == (unsigned)IntrinsicOp::IOP_TraceRay) {
-      QualType recordTy = FD->getParamDecl(0)->getType();
-      llvm::Type *Ty = CGM.getTypes().ConvertType(recordTy);
-      MDNode *MD = GetOrAddResTypeMD(recordTy);
-      DXASSERT(MD, "else invalid resource type");
-      resMetadataMap[Ty] = MD;
+
+    // Add metadata for any resources found in parameters
+    for (unsigned iParam = 0; iParam < FD->getNumParams(); iParam++) {
+      llvm::Type *Ty = FT->getParamType(iParam + iParamOffset);
+      if (!Ty->isPointerTy())
+        continue; // not a resource
+      Ty = Ty->getPointerElementType();
+      QualType paramTy = FD->getParamDecl(iParam)->getType();
+      AddResourceMetadata(paramTy, Ty);
     }
+
     StringRef lower;
     if (hlsl::GetIntrinsicLowering(FD, lower))
       hlsl::SetHLLowerStrategy(F, lower);
@@ -2624,13 +2616,27 @@ bool CGMSHLSLRuntime::SetUAVSRV(SourceLocation loc,
   if (kind == hlsl::DxilResource::Kind::Texture2DMS ||
       kind == hlsl::DxilResource::Kind::Texture2DMSArray) {
     const ClassTemplateSpecializationDecl *templateDecl =
-        dyn_cast<ClassTemplateSpecializationDecl>(RD);
+        cast<ClassTemplateSpecializationDecl>(RD);
     const clang::TemplateArgument &sampleCountArg =
         templateDecl->getTemplateArgs()[1];
     uint32_t sampleCount = sampleCountArg.getAsIntegral().getLimitedValue();
     hlslRes->SetSampleCount(sampleCount);
   }
 
+  if (hlsl::DxilResource::IsAnyTexture(kind)) {
+    const ClassTemplateSpecializationDecl *templateDecl = cast<ClassTemplateSpecializationDecl>(RD);
+    const clang::TemplateArgument &texelTyArg = templateDecl->getTemplateArgs()[0];
+    llvm::Type *texelTy = CGM.getTypes().ConvertType(texelTyArg.getAsType());
+    if (!texelTy->isFloatingPointTy() && !texelTy->isIntegerTy()
+      && !hlsl::IsHLSLVecType(texelTyArg.getAsType())) {
+      DiagnosticsEngine &Diags = CGM.getDiags();
+      unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+        "texture resource texel type must be scalar or vector");
+      Diags.Report(loc, DiagID);
+      return false;
+    }
+  }
+
   if (kind != hlsl::DxilResource::Kind::StructuredBuffer) {
     QualType Ty = resultTy;
     QualType EltTy = Ty;
@@ -2697,7 +2703,7 @@ bool CGMSHLSLRuntime::SetUAVSRV(SourceLocation loc,
   if (kind == hlsl::DxilResource::Kind::TypedBuffer ||
       kind == hlsl::DxilResource::Kind::StructuredBuffer) {
     const ClassTemplateSpecializationDecl *templateDecl =
-        dyn_cast<ClassTemplateSpecializationDecl>(RD);
+        cast<ClassTemplateSpecializationDecl>(RD);
 
     const clang::TemplateArgument &retTyArg =
         templateDecl->getTemplateArgs()[0];
@@ -3824,6 +3830,15 @@ static Value *CastLdValue(Value *Ptr, llvm::Type *FromTy, llvm::Type *ToTy, IRBu
       // Change scalar into vec1.
       Value *Vec1 = UndefValue::get(ToTy);
       return Builder.CreateInsertElement(Vec1, V, (uint64_t)0);
+    } else if (vecSize == 1 && FromTy->isIntegerTy()
+      && ToTy->getVectorElementType()->isIntegerTy(1)) {
+      // load(bitcast i32* to <1 x i1>*)
+      // Rewrite to
+      // insertelement(icmp ne (load i32*), 0)
+      Value *IntV = Builder.CreateLoad(Ptr);
+      Value *BoolV = Builder.CreateICmpNE(IntV, ConstantInt::get(IntV->getType(), 0), "tobool");
+      Value *Vec1 = UndefValue::get(ToTy);
+      return Builder.CreateInsertElement(Vec1, BoolV, (uint64_t)0);
     } else if (FromTy->isVectorTy() && vecSize == 1) {
       Value *V = Builder.CreateLoad(Ptr);
       // VectorTrunc
@@ -4205,11 +4220,17 @@ static Value * TryEvalIntrinsic(CallInst *CI, IntrinsicOp intriOp) {
   case IntrinsicOp::IOP_max: {
     auto maxF = [](float a, float b) -> float { return a > b ? a:b; };
     auto maxD = [](double a, double b) -> double { return a > b ? a:b; };
+    // Handled in DXIL constant folding
+    if (CI->getArgOperand(0)->getType()->getScalarType()->isIntegerTy())
+      return nullptr;
     return EvalBinaryIntrinsic(CI, maxF, maxD);
   } break;
   case IntrinsicOp::IOP_min: {
     auto minF = [](float a, float b) -> float { return a < b ? a:b; };
     auto minD = [](double a, double b) -> double { return a < b ? a:b; };
+    // Handled in DXIL constant folding
+    if (CI->getArgOperand(0)->getType()->getScalarType()->isIntegerTy())
+      return nullptr;
     return EvalBinaryIntrinsic(CI, minF, minD);
   } break;
   case IntrinsicOp::IOP_rcp: {
@@ -4267,7 +4288,7 @@ static void SimpleTransformForHLDXIR(Instruction *I,
   } break;
   case Instruction::Load: {
     LoadInst *ldInst = cast<LoadInst>(I);
-    DXASSERT(!HLMatrixLower::IsMatrixType(ldInst->getType()),
+    DXASSERT(!dxilutil::IsHLSLMatrixType(ldInst->getType()),
                       "matrix load should use HL LdStMatrix");
     Value *Ptr = ldInst->getPointerOperand();
     if (ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(Ptr)) {
@@ -4279,7 +4300,7 @@ static void SimpleTransformForHLDXIR(Instruction *I,
   case Instruction::Store: {
     StoreInst *stInst = cast<StoreInst>(I);
     Value *V = stInst->getValueOperand();
-    DXASSERT_LOCALVAR(V, !HLMatrixLower::IsMatrixType(V->getType()),
+    DXASSERT_LOCALVAR(V, !dxilutil::IsHLSLMatrixType(V->getType()),
                       "matrix store should use HL LdStMatrix");
     Value *Ptr = stInst->getPointerOperand();
     if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
@@ -5132,24 +5153,6 @@ static const HLUnaryOpcode UnaryOperatorKindMap[] = {
     HLUnaryOpcode::Invalid, // Extension
 };
 
-static bool IsRowMajorMatrix(QualType Ty, bool bDefaultRowMajor) {
-  bool bRowMajor = bDefaultRowMajor;
-  HasHLSLMatOrientation(Ty, &bRowMajor);
-  return bRowMajor;
-}
-
-static bool IsUnsigned(QualType Ty) {
-  Ty = Ty.getCanonicalType().getNonReferenceType();
-
-  if (hlsl::IsHLSLVecMatType(Ty))
-    Ty = CGHLSLRuntime::GetHLSLVecMatElementType(Ty);
-
-  if (Ty->isExtVectorType())
-    Ty = Ty->getAs<clang::ExtVectorType>()->getElementType();
-
-  return Ty->isUnsignedIntegerType();
-}
-
 static unsigned GetHLOpcode(const Expr *E) {
   switch (E->getStmtClass()) {
   case Stmt::CompoundAssignOperatorClass:
@@ -5157,7 +5160,7 @@ static unsigned GetHLOpcode(const Expr *E) {
     const clang::BinaryOperator *binOp = cast<clang::BinaryOperator>(E);
     HLBinaryOpcode binOpcode = BinaryOperatorKindMap[binOp->getOpcode()];
     if (HasUnsignedOpcode(binOpcode)) {
-      if (IsUnsigned(binOp->getLHS()->getType())) {
+      if (hlsl::IsHLSLUnsigned(binOp->getLHS()->getType())) {
         binOpcode = GetUnsignedOpcode(binOpcode);
       }
     }
@@ -5171,8 +5174,8 @@ static unsigned GetHLOpcode(const Expr *E) {
   case Stmt::ImplicitCastExprClass:
   case Stmt::CStyleCastExprClass: {
     const CastExpr *CE = cast<CastExpr>(E);
-    bool toUnsigned = IsUnsigned(E->getType());
-    bool fromUnsigned = IsUnsigned(CE->getSubExpr()->getType());
+    bool toUnsigned = hlsl::IsHLSLUnsigned(E->getType());
+    bool fromUnsigned = hlsl::IsHLSLUnsigned(CE->getSubExpr()->getType());
     if (toUnsigned && fromUnsigned)
       return static_cast<unsigned>(HLCastOpcode::UnsignedUnsignedCast);
     else if (toUnsigned)
@@ -5276,7 +5279,7 @@ void CGMSHLSLRuntime::FlattenValToInitList(CodeGenFunction &CGF, SmallVector<Val
         valEltTy->isSingleValueType()) {
       Value *ldVal = Builder.CreateLoad(val);
       FlattenValToInitList(CGF, elts, eltTys, Ty, ldVal);
-    } else if (HLMatrixLower::IsMatrixType(valEltTy)) {
+    } else if (dxilutil::IsHLSLMatrixType(valEltTy)) {
       Value *ldVal = EmitHLSLMatrixLoad(Builder, val, Ty);
       FlattenValToInitList(CGF, elts, eltTys, Ty, ldVal);
     } else {
@@ -5328,14 +5331,14 @@ void CGMSHLSLRuntime::FlattenValToInitList(CodeGenFunction &CGF, SmallVector<Val
       }
     }
   } else {
-    if (HLMatrixLower::IsMatrixType(valTy)) {
+    if (dxilutil::IsHLSLMatrixType(valTy)) {
       unsigned col, row;
       llvm::Type *EltTy = HLMatrixLower::GetMatrixInfo(valTy, col, row);
       // All matrix Value should be row major.
       // Init list is row major in scalar.
       // So the order is match here, just cast to vector.
       unsigned matSize = col * row;
-      bool isRowMajor = IsRowMajorMatrix(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
+      bool isRowMajor = hlsl::IsHLSLMatRowMajor(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
 
       HLCastOpcode opcode = isRowMajor ? HLCastOpcode::RowMatrixToVecCast
                                        : HLCastOpcode::ColMatrixToVecCast;
@@ -5348,7 +5351,7 @@ void CGMSHLSLRuntime::FlattenValToInitList(CodeGenFunction &CGF, SmallVector<Val
     }
 
     if (valTy->isVectorTy()) {
-      QualType EltTy = GetHLSLVecMatElementType(Ty);
+      QualType EltTy = hlsl::GetElementTypeOrType(Ty);
       unsigned vecSize = valTy->getVectorNumElements();
       for (unsigned i = 0; i < vecSize; i++) {
         Value *Elt = Builder.CreateExtractElement(val, i);
@@ -5363,22 +5366,39 @@ void CGMSHLSLRuntime::FlattenValToInitList(CodeGenFunction &CGF, SmallVector<Val
   }  
 }
 
-static bool IsBooleanType(llvm::Type *ty) {
-  return (ty->isIntegerTy() && ty->getIntegerBitWidth() == 1);
+static Value* ConvertScalarOrVector(CGBuilderTy& Builder, CodeGenTypes &Types,
+  Value *Val, QualType SrcQualTy, QualType DstQualTy) {
+  llvm::Type *SrcTy = Val->getType();
+  llvm::Type *DstTy = Types.ConvertType(DstQualTy);
+
+  DXASSERT(Val->getType() == Types.ConvertType(SrcQualTy), "QualType/Value mismatch!");
+  DXASSERT((SrcTy->isIntOrIntVectorTy() || SrcTy->isFPOrFPVectorTy())
+    && (DstTy->isIntOrIntVectorTy() || DstTy->isFPOrFPVectorTy()),
+    "EmitNumericConversion can only be used with int/float scalars/vectors.");
+
+  if (SrcTy == DstTy) return Val; // Valid no-op, including uint to int / int to uint
+  DXASSERT(SrcTy->isVectorTy()
+    ? (DstTy->isVectorTy() && SrcTy->getVectorNumElements() == DstTy->getVectorNumElements())
+    : !DstTy->isVectorTy(),
+    "EmitNumericConversion can only cast between scalars or vectors of matching sizes");
+
+  // Conversions to bools are comparisons
+  if (DstTy->getScalarSizeInBits() == 1) {
+    // fcmp une is what regular clang uses in C++ for (bool)f;
+    return SrcTy->isIntOrIntVectorTy()
+      ? Builder.CreateICmpNE(Val, llvm::Constant::getNullValue(SrcTy), "tobool")
+      : Builder.CreateFCmpUNE(Val, llvm::Constant::getNullValue(SrcTy), "tobool");
+  }
+
+  // Cast necessary
+  auto CastOp = static_cast<Instruction::CastOps>(HLModule::GetNumericCastOp(
+    SrcTy, hlsl::IsHLSLUnsigned(SrcQualTy), DstTy, hlsl::IsHLSLUnsigned(DstQualTy)));
+  return Builder.CreateCast(CastOp, Val, DstTy);
 }
 
-static Value *CreateCastforBoolDestType(CGBuilderTy &Builder, Value *srcVal) {
-  llvm::Type *srcTy = srcVal->getType();
-  if (srcTy->isFloatingPointTy()) {
-    return Builder.CreateFCmp(FCmpInst::FCMP_UNE, srcVal,
-                              ConstantFP::get(srcTy, 0));
-  } else {
-    // must be an integer type here
-    DXASSERT(srcTy->isIntegerTy() && srcTy->getIntegerBitWidth() > 1,
-             "must be a non-boolean integer type.");
-    return Builder.CreateICmp(ICmpInst::ICMP_NE, srcVal,
-                              ConstantInt::get(srcTy, 0));
-  }
+static Value* ConvertScalarOrVector(CodeGenFunction &CGF,
+  Value *Val, QualType SrcQualTy, QualType DstQualTy) {
+  return ConvertScalarOrVector(CGF.Builder, CGF.getTypes(), Val, SrcQualTy, DstQualTy);
 }
 
 // Cast elements in initlist if not match the target type.
@@ -5435,40 +5455,30 @@ static void AddMissingCastOpsInInitList(SmallVector<Value *, 4> &elts, SmallVect
   }
   else {
     // Basic type.
-    Value *val = elts[idx];
-    llvm::Type *srcTy = val->getType();
-    llvm::Type *dstTy = CGF.ConvertType(Ty);
-    if (srcTy != dstTy) {
-      if (IsBooleanType(dstTy)) {
-        elts[idx] = CreateCastforBoolDestType(CGF.Builder, val);
-      } else {
-        Instruction::CastOps castOp =
-          static_cast<Instruction::CastOps>(HLModule::FindCastOp(
-            IsUnsigned(eltTys[idx]), IsUnsigned(Ty), srcTy, dstTy));
-        elts[idx] = CGF.Builder.CreateCast(castOp, val, dstTy);
-      }
-    }
+    elts[idx] = ConvertScalarOrVector(CGF, elts[idx], eltTys[idx], Ty);
     idx++;
   }
 }
 
 static void StoreInitListToDestPtr(Value *DestPtr,
                                    SmallVector<Value *, 4> &elts, unsigned &idx,
-                                   QualType Type, CodeGenTypes &Types, bool bDefaultRowMajor,
-                                   CGBuilderTy &Builder, llvm::Module &M) {
+                                   QualType Type, bool bDefaultRowMajor,
+                                   CodeGenFunction &CGF, llvm::Module &M) {
+  CodeGenTypes &Types = CGF.getTypes();
+  CGBuilderTy &Builder = CGF.Builder;
+
   llvm::Type *Ty = DestPtr->getType()->getPointerElementType();
-  llvm::Type *i32Ty = llvm::Type::getInt32Ty(Ty->getContext());
 
   if (Ty->isVectorTy()) {
-    Value *Result = UndefValue::get(Ty);
-    for (unsigned i = 0; i < Ty->getVectorNumElements(); i++)
+    llvm::Type *RegTy = CGF.ConvertType(Type);
+    Value *Result = UndefValue::get(RegTy);
+    for (unsigned i = 0; i < RegTy->getVectorNumElements(); i++)
       Result = Builder.CreateInsertElement(Result, elts[idx + i], i);
+    Result = CGF.EmitToMemory(Result, Type);
     Builder.CreateStore(Result, DestPtr);
     idx += Ty->getVectorNumElements();
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
-    bool isRowMajor =
-        IsRowMajorMatrix(Type, bDefaultRowMajor);
-
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
+    bool isRowMajor = hlsl::IsHLSLMatRowMajor(Type, bDefaultRowMajor);
     unsigned row, col;
     HLMatrixLower::GetMatrixInfo(Ty, col, row);
     std::vector<Value *> matInitList(col * row);
@@ -5507,7 +5517,7 @@ static void StoreInitListToDestPtr(Value *DestPtr,
       Builder.CreateStore(elts[idx], DestPtr);
       idx++;
     } else {
-      Constant *zero = ConstantInt::get(i32Ty, 0);
+      Constant *zero = Builder.getInt32(0);
 
       const RecordType *RT = Type->getAsStructureType();
       // For CXXRecord.
@@ -5525,29 +5535,29 @@ static void StoreInitListToDestPtr(Value *DestPtr,
               continue;
             QualType parentTy = QualType(BaseDecl->getTypeForDecl(), 0);
             unsigned i = RL.getNonVirtualBaseLLVMFieldNo(BaseDecl);
-            Constant *gepIdx = ConstantInt::get(i32Ty, i);
+            Constant *gepIdx = Builder.getInt32(i);
             Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-            StoreInitListToDestPtr(GEP, elts, idx, parentTy, Types,
-                                   bDefaultRowMajor, Builder, M);
+            StoreInitListToDestPtr(GEP, elts, idx, parentTy,
+                                   bDefaultRowMajor, CGF, M);
           }
         }
       }
       for (FieldDecl *field : RD->fields()) {
         unsigned i = RL.getLLVMFieldNo(field);
-        Constant *gepIdx = ConstantInt::get(i32Ty, i);
+        Constant *gepIdx = Builder.getInt32(i);
         Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-        StoreInitListToDestPtr(GEP, elts, idx, field->getType(), Types,
-                               bDefaultRowMajor, Builder, M);
+        StoreInitListToDestPtr(GEP, elts, idx, field->getType(),
+                               bDefaultRowMajor, CGF, M);
       }
     }
   } else if (Ty->isArrayTy()) {
-    Constant *zero = ConstantInt::get(i32Ty, 0);
+    Constant *zero = Builder.getInt32(0);
     QualType EltType = Type->getAsArrayTypeUnsafe()->getElementType();
     for (unsigned i = 0; i < Ty->getArrayNumElements(); i++) {
-      Constant *gepIdx = ConstantInt::get(i32Ty, i);
+      Constant *gepIdx = Builder.getInt32(i);
       Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-      StoreInitListToDestPtr(GEP, elts, idx, EltType, Types, bDefaultRowMajor,
-                             Builder, M);
+      StoreInitListToDestPtr(GEP, elts, idx, EltType, bDefaultRowMajor,
+                             CGF, M);
     }
   } else {
     DXASSERT(Ty->isSingleValueType(), "invalid type");
@@ -5727,8 +5737,8 @@ Value *CGMSHLSLRuntime::EmitHLSLInitListExpr(CodeGenFunction &CGF, InitListExpr
     ParamList.append(EltValList.begin(), EltValList.end());
     idx = 0;
     bool bDefaultRowMajor = m_pHLModule->GetHLOptions().bDefaultRowMajor;
-    StoreInitListToDestPtr(DestPtr, EltValList, idx, ResultTy, CGF.getTypes(),
-                           bDefaultRowMajor, CGF.Builder, TheModule);
+    StoreInitListToDestPtr(DestPtr, EltValList, idx, ResultTy,
+                           bDefaultRowMajor, CGF, TheModule);
     return nullptr;
   }
 
@@ -5746,106 +5756,131 @@ Value *CGMSHLSLRuntime::EmitHLSLInitListExpr(CodeGenFunction &CGF, InitListExpr
   }
 }
 
-static void FlatConstToList(Constant *C, SmallVector<Constant *, 4> &EltValList,
-                            QualType Type, CodeGenTypes &Types,
-                            bool bDefaultRowMajor) {
+static void FlatConstToList(CodeGenTypes &Types, bool bDefaultRowMajor,
+    Constant *C, QualType QualTy,
+    SmallVectorImpl<Constant *> &EltVals, SmallVectorImpl<QualType> &EltQualTys) {
   llvm::Type *Ty = C->getType();
-  if (llvm::VectorType *VT = dyn_cast<llvm::VectorType>(Ty)) {
-    // Type is only for matrix. Keep use Type to next level.
-    for (unsigned i = 0; i < VT->getNumElements(); i++) {
-      FlatConstToList(C->getAggregateElement(i), EltValList, Type, Types,
-                      bDefaultRowMajor);
+  DXASSERT(Types.ConvertTypeForMem(QualTy) == Ty, "QualType/Type mismatch!");
+
+  if (llvm::VectorType *VecTy = dyn_cast<llvm::VectorType>(Ty)) {
+    DXASSERT(hlsl::IsHLSLVecType(QualTy), "QualType/Type mismatch!");
+    QualType VecElemQualTy = hlsl::GetHLSLVecElementType(QualTy);
+    for (unsigned i = 0; i < VecTy->getNumElements(); i++) {
+      EltVals.emplace_back(C->getAggregateElement(i));
+      EltQualTys.emplace_back(VecElemQualTy);
     }
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
-    bool isRowMajor = IsRowMajorMatrix(Type, bDefaultRowMajor);
-    // matrix type is struct { vector<Ty, row> [col] };
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
+    DXASSERT(hlsl::IsHLSLMatType(QualTy), "QualType/Type mismatch!");
+    // matrix type is struct { [rowcount x <colcount x T>] };
     // Strip the struct level here.
-    Constant *matVal = C->getAggregateElement((unsigned)0);
-    const RecordType *RT = Type->getAs<RecordType>();
-    RecordDecl *RD = RT->getDecl();
-    QualType EltTy = RD->field_begin()->getType();
-    // When scan, init list scalars is row major.
-    if (isRowMajor) {
-      // Don't change the major for row major value.
-      FlatConstToList(matVal, EltValList, EltTy, Types, bDefaultRowMajor);
-    } else {
-      // Save to tmp list.
-      SmallVector<Constant *, 4> matEltList;
-      FlatConstToList(matVal, matEltList, EltTy, Types, bDefaultRowMajor);
-      unsigned row, col;
-      HLMatrixLower::GetMatrixInfo(Ty, col, row);
-      // Change col major value to row major.
-      for (unsigned r = 0; r < row; r++)
-        for (unsigned c = 0; c < col; c++) {
-          unsigned colMajorIdx = c * row + r;
-          EltValList.emplace_back(matEltList[colMajorIdx]);
+    Constant *RowArrayVal = C->getAggregateElement((unsigned)0);
+    QualType MatEltQualTy = hlsl::GetHLSLMatElementType(QualTy);
+
+    unsigned RowCount, ColCount;
+    hlsl::GetHLSLMatRowColCount(QualTy, RowCount, ColCount);
+
+    // Get all the elements from the array of row vectors.
+    // Matrix elements are kept in their register representation (bools as i1), so bools are zero-extended to their memory type below.
+    SmallVector<Constant *, 16> MatElts;
+    for (unsigned r = 0; r < RowCount; ++r) {
+      Constant *RowVec = RowArrayVal->getAggregateElement(r);
+      for (unsigned c = 0; c < ColCount; ++c) {
+        Constant *MatElt = RowVec->getAggregateElement(c);
+        if (MatEltQualTy->isBooleanType()) {
+          DXASSERT(MatElt->getType()->isIntegerTy(1),
+            "Matrix elements should be in their register representation.");
+          MatElt = llvm::ConstantExpr::getZExt(MatElt, Types.ConvertTypeForMem(MatEltQualTy));
         }
+        MatElts.emplace_back(MatElt);
+      }
     }
-  } else if (llvm::ArrayType *AT = dyn_cast<llvm::ArrayType>(Ty)) {
-    QualType EltTy = Type->getAsArrayTypeUnsafe()->getElementType();
-    for (unsigned i = 0; i < AT->getNumElements(); i++) {
-      FlatConstToList(C->getAggregateElement(i), EltValList, EltTy, Types,
-                      bDefaultRowMajor);
+
+    // Append the elements in (row, column) iteration order, indexing MatElts according to the matrix orientation.
+    // Constant initializers are used as the initial value for static variables,
+    // which live in memory. This is why they have to respect memory packing order.
+    bool IsRowMajor = hlsl::IsHLSLMatRowMajor(QualTy, bDefaultRowMajor);
+    for (unsigned r = 0; r < RowCount; ++r) {
+      for (unsigned c = 0; c < ColCount; ++c) {
+        unsigned Idx = IsRowMajor ? (r * ColCount + c) : (c * RowCount + r);
+        EltVals.emplace_back(MatElts[Idx]);
+        EltQualTys.emplace_back(MatEltQualTy);
+      }
     }
-  } else if (dyn_cast<llvm::StructType>(Ty)) {
-    RecordDecl *RD = Type->getAsStructureType()->getDecl();
-    const CGRecordLayout &RL = Types.getCGRecordLayout(RD);
+  }
+  else if (const clang::ConstantArrayType *ClangArrayTy = Types.getContext().getAsConstantArrayType(QualTy)) {
+    QualType ArrayEltQualTy = ClangArrayTy->getElementType();
+    uint64_t ArraySize = ClangArrayTy->getSize().getLimitedValue();
+    DXASSERT(cast<llvm::ArrayType>(Ty)->getArrayNumElements() == ArraySize, "QualType/Type mismatch!");
+    for (unsigned i = 0; i < ArraySize; i++) {
+      FlatConstToList(Types, bDefaultRowMajor, C->getAggregateElement(i), ArrayEltQualTy, 
+        EltVals, EltQualTys);
+    }
+  }
+  else if (const clang::RecordType* RecordTy = QualTy->getAs<clang::RecordType>()) {
+    DXASSERT(dyn_cast<llvm::StructType>(Ty) != nullptr, "QualType/Type mismatch!");
+    RecordDecl *RecordDecl = RecordTy->getDecl();
+    const CGRecordLayout &RL = Types.getCGRecordLayout(RecordDecl);
     // Take care base.
-    if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+    if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RecordDecl)) {
       if (CXXRD->getNumBases()) {
         for (const auto &I : CXXRD->bases()) {
           const CXXRecordDecl *BaseDecl =
               cast<CXXRecordDecl>(I.getType()->castAs<RecordType>()->getDecl());
           if (BaseDecl->field_empty())
             continue;
-          QualType parentTy = QualType(BaseDecl->getTypeForDecl(), 0);
-          unsigned i = RL.getNonVirtualBaseLLVMFieldNo(BaseDecl);
-          FlatConstToList(C->getAggregateElement(i), EltValList, parentTy,
-                          Types, bDefaultRowMajor);
+          QualType BaseQualTy = QualType(BaseDecl->getTypeForDecl(), 0);
+          unsigned BaseFieldIdx = RL.getNonVirtualBaseLLVMFieldNo(BaseDecl);
+          FlatConstToList(Types, bDefaultRowMajor,
+            C->getAggregateElement(BaseFieldIdx), BaseQualTy, EltVals, EltQualTys);
         }
       }
     }
 
-    for (auto fieldIter = RD->field_begin(), fieldEnd = RD->field_end();
-         fieldIter != fieldEnd; ++fieldIter) {
-      unsigned i = RL.getLLVMFieldNo(*fieldIter);
+    for (auto FieldIt = RecordDecl->field_begin(), fieldEnd = RecordDecl->field_end();
+      FieldIt != fieldEnd; ++FieldIt) {
+      unsigned FieldIndex = RL.getLLVMFieldNo(*FieldIt);
 
-      FlatConstToList(C->getAggregateElement(i), EltValList,
-                      fieldIter->getType(), Types, bDefaultRowMajor);
+      FlatConstToList(Types, bDefaultRowMajor,
+        C->getAggregateElement(FieldIndex), FieldIt->getType(), EltVals, EltQualTys);
     }
-  } else {
-    EltValList.emplace_back(C);
+  }
+  else {
+    // Only builtin scalars should reach this point, and they should already be in their memory representation.
+    DXASSERT_NOMSG(QualTy->isBuiltinType());
+    EltVals.emplace_back(C);
+    EltQualTys.emplace_back(QualTy);
   }
 }
 
-static bool ScanConstInitList(CodeGenModule &CGM, InitListExpr *E,
-                              SmallVector<Constant *, 4> &EltValList,
-                              CodeGenTypes &Types, bool bDefaultRowMajor) {
-  unsigned NumInitElements = E->getNumInits();
+static bool ScanConstInitList(CodeGenModule &CGM, bool bDefaultRowMajor, 
+                              InitListExpr *InitList,
+                              SmallVectorImpl<Constant *> &EltVals,
+                              SmallVectorImpl<QualType> &EltQualTys) {
+  unsigned NumInitElements = InitList->getNumInits();
   for (unsigned i = 0; i != NumInitElements; ++i) {
-    Expr *init = E->getInit(i);
-    QualType iType = init->getType();
-    if (InitListExpr *initList = dyn_cast<InitListExpr>(init)) {
-      if (!ScanConstInitList(CGM, initList, EltValList, Types,
-                             bDefaultRowMajor))
+    Expr *InitExpr = InitList->getInit(i);
+    QualType InitQualTy = InitExpr->getType();
+    if (InitListExpr *SubInitList = dyn_cast<InitListExpr>(InitExpr)) {
+      if (!ScanConstInitList(CGM, bDefaultRowMajor, SubInitList, EltVals, EltQualTys))
         return false;
-    } else if (DeclRefExpr *ref = dyn_cast<DeclRefExpr>(init)) {
-      if (VarDecl *D = dyn_cast<VarDecl>(ref->getDecl())) {
-        if (!D->hasInit())
+    } else if (DeclRefExpr *DeclRef = dyn_cast<DeclRefExpr>(InitExpr)) {
+      if (VarDecl *Var = dyn_cast<VarDecl>(DeclRef->getDecl())) {
+        if (!Var->hasInit())
           return false;
-        if (Constant *initVal = CGM.EmitConstantInit(*D)) {
-          FlatConstToList(initVal, EltValList, iType, Types, bDefaultRowMajor);
+        if (Constant *InitVal = CGM.EmitConstantInit(*Var)) {
+          FlatConstToList(CGM.getTypes(), bDefaultRowMajor,
+            InitVal, InitQualTy, EltVals, EltQualTys);
         } else {
           return false;
         }
       } else {
         return false;
       }
-    } else if (hlsl::IsHLSLMatType(iType)) {
+    } else if (hlsl::IsHLSLMatType(InitQualTy)) {
       return false;
-    } else if (CodeGenFunction::hasScalarEvaluationKind(iType)) {
-      if (Constant *initVal = CGM.EmitConstantExpr(init, iType)) {
-        FlatConstToList(initVal, EltValList, iType, Types, bDefaultRowMajor);
+    } else if (CodeGenFunction::hasScalarEvaluationKind(InitQualTy)) {
+      if (Constant *InitVal = CGM.EmitConstantExpr(InitExpr, InitQualTy)) {
+        FlatConstToList(CGM.getTypes(), bDefaultRowMajor, InitVal, InitQualTy, EltVals, EltQualTys);
       } else {
         return false;
       }
@@ -5856,163 +5891,164 @@ static bool ScanConstInitList(CodeGenModule &CGM, InitListExpr *E,
   return true;
 }
 
-static Constant *BuildConstInitializer(QualType Type, unsigned &offset,
-                                       SmallVector<Constant *, 4> &EltValList,
-                                       CodeGenTypes &Types,
-                                       bool bDefaultRowMajor);
+static Constant *BuildConstInitializer(CodeGenTypes &Types, bool bDefaultRowMajor,
+  QualType QualTy, bool MemRepr,
+  SmallVectorImpl<Constant *> &EltVals, SmallVectorImpl<QualType> &EltQualTys, unsigned &EltIdx);
 
-static Constant *BuildConstVector(llvm::VectorType *VT, unsigned &offset,
-                                  SmallVector<Constant *, 4> &EltValList,
-                                  QualType Type, CodeGenTypes &Types) {
-  SmallVector<Constant *, 4> Elts;
-  QualType EltTy = hlsl::GetHLSLVecElementType(Type);
-  for (unsigned i = 0; i < VT->getNumElements(); i++) {
-    Elts.emplace_back(BuildConstInitializer(EltTy, offset, EltValList, Types,
-                                            // Vector don't need major.
-                                            /*bDefaultRowMajor*/ false));
-  }
-  return llvm::ConstantVector::get(Elts);
-}
+static Constant *BuildConstMatrix(CodeGenTypes &Types, bool bDefaultRowMajor, QualType QualTy,
+    SmallVectorImpl<Constant *> &EltVals, SmallVectorImpl<QualType> &EltQualTys, unsigned &EltIdx) {
+  QualType MatEltTy = hlsl::GetHLSLMatElementType(QualTy);
+  unsigned RowCount, ColCount;
+  hlsl::GetHLSLMatRowColCount(QualTy, RowCount, ColCount);
+  bool IsRowMajor = hlsl::IsHLSLMatRowMajor(QualTy, bDefaultRowMajor);
 
-static Constant *BuildConstMatrix(llvm::Type *Ty, unsigned &offset,
-                                  SmallVector<Constant *, 4> &EltValList,
-                                  QualType Type, CodeGenTypes &Types,
-                                  bool bDefaultRowMajor) {
-  QualType EltTy = hlsl::GetHLSLMatElementType(Type);
-  unsigned col, row;
-  HLMatrixLower::GetMatrixInfo(Ty, col, row);
-  llvm::ArrayType *AT = cast<llvm::ArrayType>(Ty->getStructElementType(0));
   // Save initializer elements first.
   // Matrix initializer is row major.
-  SmallVector<Constant *, 16> elts;
-  for (unsigned i = 0; i < col * row; i++) {
-    elts.emplace_back(BuildConstInitializer(EltTy, offset, EltValList, Types,
-                                            bDefaultRowMajor));
+  SmallVector<Constant *, 16> RowMajorMatElts;
+  for (unsigned i = 0; i < RowCount * ColCount; i++) {
+    // Matrix elements are never in their memory representation,
+    // to preserve type information for later lowering.
+    bool MemRepr = false; 
+    RowMajorMatElts.emplace_back(BuildConstInitializer(
+      Types, bDefaultRowMajor, MatEltTy, MemRepr,
+      EltVals, EltQualTys, EltIdx));
   }
 
-  bool isRowMajor = IsRowMajorMatrix(Type, bDefaultRowMajor);
-
-  SmallVector<Constant *, 16> majorElts(elts.begin(), elts.end());
-  if (!isRowMajor) {
-    // cast row major to col major.
-    for (unsigned c = 0; c < col; c++) {
-      SmallVector<Constant *, 4> rows;
-      for (unsigned r = 0; r < row; r++) {
-        unsigned rowMajorIdx = r * col + c;
-        unsigned colMajorIdx = c * row + r;
-        majorElts[colMajorIdx] = elts[rowMajorIdx];
+  SmallVector<Constant *, 16> FinalMatElts;
+  if (IsRowMajor) {
+    FinalMatElts = RowMajorMatElts;
+  }
+  else {
+    // Reorder the row-major element list into column-major order.
+    for (unsigned c = 0; c < ColCount; c++) {
+      for (unsigned r = 0; r < RowCount; r++) {
+        FinalMatElts.emplace_back(RowMajorMatElts[r * ColCount + c]);
       }
     }
   }
   // The type is vector<element, col>[row].
-  SmallVector<Constant *, 4> rows;
+  SmallVector<Constant *, 4> Rows;
   unsigned idx = 0;
-  for (unsigned r = 0; r < row; r++) {
-    SmallVector<Constant *, 4> cols;
-    for (unsigned c = 0; c < col; c++) {
-      cols.emplace_back(majorElts[idx++]);
+  for (unsigned r = 0; r < RowCount; r++) {
+    SmallVector<Constant *, 4> RowElts;
+    for (unsigned c = 0; c < ColCount; c++) {
+      RowElts.emplace_back(FinalMatElts[idx++]);
     }
-    rows.emplace_back(llvm::ConstantVector::get(cols));
+    Rows.emplace_back(llvm::ConstantVector::get(RowElts));
   }
-  Constant *mat = llvm::ConstantArray::get(AT, rows);
-  return llvm::ConstantStruct::get(cast<llvm::StructType>(Ty), mat);
+
+  Constant *RowArray = llvm::ConstantArray::get(
+    llvm::ArrayType::get(Rows[0]->getType(), Rows.size()), Rows);
+  return llvm::ConstantStruct::get(cast<llvm::StructType>(Types.ConvertType(QualTy)), RowArray);
 }
 
-static Constant *BuildConstArray(llvm::ArrayType *AT, unsigned &offset,
-                                 SmallVector<Constant *, 4> &EltValList,
-                                 QualType Type, CodeGenTypes &Types,
-                                 bool bDefaultRowMajor) {
-  SmallVector<Constant *, 4> Elts;
-  QualType EltType = QualType(Type->getArrayElementTypeNoTypeQual(), 0);
-  for (unsigned i = 0; i < AT->getNumElements(); i++) {
-    Elts.emplace_back(BuildConstInitializer(EltType, offset, EltValList, Types,
-                                            bDefaultRowMajor));
-  }
-  return llvm::ConstantArray::get(AT, Elts);
-}
-
-static Constant *BuildConstStruct(llvm::StructType *ST, unsigned &offset,
-                                  SmallVector<Constant *, 4> &EltValList,
-                                  QualType Type, CodeGenTypes &Types,
-                                  bool bDefaultRowMajor) {
-  SmallVector<Constant *, 4> Elts;
-
-  const RecordType *RT = Type->getAsStructureType();
-  if (!RT)
-    RT = Type->getAs<RecordType>();
-  const RecordDecl *RD = RT->getDecl();
-
-  if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
-    if (CXXRD->getNumBases()) {
+static Constant *BuildConstStruct(CodeGenTypes &Types, bool bDefaultRowMajor, QualType QualTy,
+    SmallVectorImpl<Constant *> &EltVals, SmallVectorImpl<QualType> &EltQualTys, unsigned &EltIdx) {
+  const RecordDecl *Record = QualTy->castAs<RecordType>()->getDecl();
+  bool MemRepr = true; // Structs are always in their memory representation
+  SmallVector<Constant *, 4> FieldVals;
+  if (const CXXRecordDecl *CXXRecord = dyn_cast<CXXRecordDecl>(Record)) {
+    if (CXXRecord->getNumBases()) {
       // Add base as field.
-      for (const auto &I : CXXRD->bases()) {
+      for (const auto &BaseSpec : CXXRecord->bases()) {
         const CXXRecordDecl *BaseDecl =
-            cast<CXXRecordDecl>(I.getType()->castAs<RecordType>()->getDecl());
+            cast<CXXRecordDecl>(BaseSpec.getType()->castAs<RecordType>()->getDecl());
         // Skip empty struct.
         if (BaseDecl->field_empty())
           continue;
 
         // Add base as a whole constant. Not as element.
-        Elts.emplace_back(BuildConstInitializer(I.getType(), offset, EltValList,
-                                                Types, bDefaultRowMajor));
+        FieldVals.emplace_back(BuildConstInitializer(Types, bDefaultRowMajor,
+          BaseSpec.getType(), MemRepr, EltVals, EltQualTys, EltIdx));
       }
     }
   }
 
-  for (auto fieldIter = RD->field_begin(), fieldEnd = RD->field_end();
-       fieldIter != fieldEnd; ++fieldIter) {
-    Elts.emplace_back(BuildConstInitializer(
-        fieldIter->getType(), offset, EltValList, Types, bDefaultRowMajor));
+  for (auto FieldIt = Record->field_begin(), FieldEnd = Record->field_end();
+      FieldIt != FieldEnd; ++FieldIt) {
+    FieldVals.emplace_back(BuildConstInitializer(Types, bDefaultRowMajor,
+      FieldIt->getType(), MemRepr, EltVals, EltQualTys, EltIdx));
   }
 
-  return llvm::ConstantStruct::get(ST, Elts);
+  return llvm::ConstantStruct::get(cast<llvm::StructType>(Types.ConvertTypeForMem(QualTy)), FieldVals);
 }
 
-static Constant *BuildConstInitializer(QualType Type, unsigned &offset,
-                                       SmallVector<Constant *, 4> &EltValList,
-                                       CodeGenTypes &Types,
-                                       bool bDefaultRowMajor) {
-  llvm::Type *Ty = Types.ConvertType(Type);
-  if (llvm::VectorType *VT = dyn_cast<llvm::VectorType>(Ty)) {
-    return BuildConstVector(VT, offset, EltValList, Type, Types);
-  } else if (llvm::ArrayType *AT = dyn_cast<llvm::ArrayType>(Ty)) {
-    return BuildConstArray(AT, offset, EltValList, Type, Types,
-                           bDefaultRowMajor);
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
-    return BuildConstMatrix(Ty, offset, EltValList, Type, Types,
-                            bDefaultRowMajor);
-  } else if (StructType *ST = dyn_cast<llvm::StructType>(Ty)) {
-    return BuildConstStruct(ST, offset, EltValList, Type, Types,
-                            bDefaultRowMajor);
-  } else {
-    // Scalar basic types.
-    Constant *Val = EltValList[offset++];
-    if (Val->getType() == Ty) {
-      return Val;
-    } else {
-      IRBuilder<> Builder(Ty->getContext());
-      // Don't cast int to bool. bool only for scalar.
-      if (Ty == Builder.getInt1Ty() && Val->getType() == Builder.getInt32Ty())
-        return Val;
-      Instruction::CastOps castOp =
-          static_cast<Instruction::CastOps>(HLModule::FindCastOp(
-              IsUnsigned(Type), IsUnsigned(Type), Val->getType(), Ty));
-      return cast<Constant>(Builder.CreateCast(castOp, Val, Ty));
+static Constant *BuildConstInitializer(CodeGenTypes &Types, bool bDefaultRowMajor,
+    QualType QualTy, bool MemRepr,
+    SmallVectorImpl<Constant *> &EltVals, SmallVectorImpl<QualType> &EltQualTys, unsigned &EltIdx) {
+  if (hlsl::IsHLSLVecType(QualTy)) {
+    QualType VecEltQualTy = hlsl::GetHLSLVecElementType(QualTy);
+    unsigned VecSize = hlsl::GetHLSLVecSize(QualTy);
+    SmallVector<Constant *, 4> VecElts;
+    for (unsigned i = 0; i < VecSize; i++) {
+      VecElts.emplace_back(BuildConstInitializer(Types, bDefaultRowMajor,
+        VecEltQualTy, MemRepr,
+        EltVals, EltQualTys, EltIdx));
     }
+    return llvm::ConstantVector::get(VecElts);
+  }
+  else if (const clang::ConstantArrayType *ArrayTy = Types.getContext().getAsConstantArrayType(QualTy)) {
+    QualType ArrayEltQualTy = QualType(ArrayTy->getArrayElementTypeNoTypeQual(), 0);
+    uint64_t ArraySize = ArrayTy->getSize().getLimitedValue();
+    SmallVector<Constant *, 4> ArrayElts;
+    for (unsigned i = 0; i < ArraySize; i++) {
+      ArrayElts.emplace_back(BuildConstInitializer(Types, bDefaultRowMajor,
+        ArrayEltQualTy, true, // Array elements must be in their memory representation
+        EltVals, EltQualTys, EltIdx));
+    }
+    return llvm::ConstantArray::get(
+      cast<llvm::ArrayType>(Types.ConvertTypeForMem(QualTy)), ArrayElts);
+  }
+  else if (hlsl::IsHLSLMatType(QualTy)) {
+    return BuildConstMatrix(Types, bDefaultRowMajor, QualTy,
+      EltVals, EltQualTys, EltIdx);
+  }
+  else if (QualTy->getAs<clang::RecordType>() != nullptr) {
+    return BuildConstStruct(Types, bDefaultRowMajor, QualTy,
+      EltVals, EltQualTys, EltIdx);
+  } else {
+    DXASSERT_NOMSG(QualTy->isBuiltinType());
+    Constant *EltVal = EltVals[EltIdx];
+    QualType EltQualTy = EltQualTys[EltIdx];
+    EltIdx++;
+
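+    // Flattened initializer constants are already in their memory representation, so an element of the exact requested type needs no conversion when the memory representation is wanted.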
+    if (EltQualTy == QualTy && MemRepr) return EltVal;
+
+    CGBuilderTy Builder(EltVal->getContext());
+    if (EltQualTy->isBooleanType()) {
+      // Convert to register representation
+      // We don't have access to CodeGenFunction::EmitFromMemory here, so compare against zero manually instead.
+      DXASSERT_NOMSG(!EltVal->getType()->isIntegerTy(1));
+      EltVal = cast<Constant>(Builder.CreateICmpNE(EltVal, Constant::getNullValue(EltVal->getType())));
+    }
+
+    Constant *Result = cast<Constant>(ConvertScalarOrVector(Builder, Types, EltVal, EltQualTy, QualTy));
+
+    if (QualTy->isBooleanType() && MemRepr) {
+      // Convert back to the memory representation
+      // We don't have access to CodeGenFunction::EmitToMemory here, so zero-extend to the memory type manually instead.
+      DXASSERT_NOMSG(Result->getType()->isIntegerTy(1));
+      Result = cast<Constant>(Builder.CreateZExt(Result, Types.ConvertTypeForMem(QualTy)));
+    }
+
+    return Result;
   }
 }
 
 Constant *CGMSHLSLRuntime::EmitHLSLConstInitListExpr(CodeGenModule &CGM,
                                                      InitListExpr *E) {
   bool bDefaultRowMajor = m_pHLModule->GetHLOptions().bDefaultRowMajor;
-  SmallVector<Constant *, 4> EltValList;
-  if (!ScanConstInitList(CGM, E, EltValList, CGM.getTypes(), bDefaultRowMajor))
+  SmallVector<Constant *, 4> EltVals;
+  SmallVector<QualType, 4> EltQualTys;
+  if (!ScanConstInitList(CGM, bDefaultRowMajor, E, EltVals, EltQualTys))
     return nullptr;
 
-  QualType Type = E->getType();
-  unsigned offset = 0;
-  return BuildConstInitializer(Type, offset, EltValList, CGM.getTypes(),
-                               bDefaultRowMajor);
+  QualType QualTy = E->getType();
+  unsigned EltIdx = 0;
+  bool MemRepr = true;
+  return BuildConstInitializer(CGM.getTypes(), bDefaultRowMajor,
+    QualTy, MemRepr, EltVals, EltQualTys, EltIdx);
 }
 
 Value *CGMSHLSLRuntime::EmitHLSLMatrixOperationCall(
@@ -6260,7 +6296,7 @@ Value *CGMSHLSLRuntime::EmitHLSLMatrixSubscript(CodeGenFunction &CGF,
                                                 llvm::Value *Idx,
                                                 clang::QualType Ty) {
   bool isRowMajor =
-      IsRowMajorMatrix(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
+      hlsl::IsHLSLMatRowMajor(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
   unsigned opcode =
       isRowMajor ? static_cast<unsigned>(HLSubscriptOpcode::RowMatSubscript)
                  : static_cast<unsigned>(HLSubscriptOpcode::ColMatSubscript);
@@ -6306,7 +6342,7 @@ Value *CGMSHLSLRuntime::EmitHLSLMatrixElement(CodeGenFunction &CGF,
                                               ArrayRef<Value *> paramList,
                                               QualType Ty) {
   bool isRowMajor =
-      IsRowMajorMatrix(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
+    hlsl::IsHLSLMatRowMajor(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
   unsigned opcode =
       isRowMajor ? static_cast<unsigned>(HLSubscriptOpcode::RowMatElement)
                  : static_cast<unsigned>(HLSubscriptOpcode::ColMatElement);
@@ -6358,7 +6394,7 @@ Value *CGMSHLSLRuntime::EmitHLSLMatrixElement(CodeGenFunction &CGF,
 Value *CGMSHLSLRuntime::EmitHLSLMatrixLoad(CGBuilderTy &Builder, Value *Ptr,
                                            QualType Ty) {
   bool isRowMajor =
-      IsRowMajorMatrix(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
+    hlsl::IsHLSLMatRowMajor(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
   unsigned opcode =
       isRowMajor
           ? static_cast<unsigned>(HLMatLoadStoreOpcode::RowMatLoad)
@@ -6381,7 +6417,7 @@ Value *CGMSHLSLRuntime::EmitHLSLMatrixLoad(CGBuilderTy &Builder, Value *Ptr,
 void CGMSHLSLRuntime::EmitHLSLMatrixStore(CGBuilderTy &Builder, Value *Val,
                                           Value *DestPtr, QualType Ty) {
   bool isRowMajor =
-      IsRowMajorMatrix(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
+    hlsl::IsHLSLMatRowMajor(Ty, m_pHLModule->GetHLOptions().bDefaultRowMajor);
   unsigned opcode =
       isRowMajor
           ? static_cast<unsigned>(HLMatLoadStoreOpcode::RowMatStore)
@@ -6482,7 +6518,7 @@ void CGMSHLSLRuntime::FlattenAggregatePtrToGepList(
                                  GepList, EltTyList);
 
     idxList.pop_back();
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
     // Use matLd/St for matrix.
     unsigned col, row;
     llvm::Type *EltTy = HLMatrixLower::GetMatrixInfo(Ty, col, row);
@@ -6592,37 +6628,33 @@ void CGMSHLSLRuntime::FlattenAggregatePtrToGepList(
   }
 }
 
-void CGMSHLSLRuntime::LoadFlattenedGepList(CodeGenFunction &CGF,
-                                           ArrayRef<Value *> GepList,
-                                           ArrayRef<QualType> EltTyList,
-                                           SmallVector<Value *, 4> &EltList) {
-  unsigned eltSize = GepList.size();
-  for (unsigned i = 0; i < eltSize; i++) {
-    Value *Ptr = GepList[i];
-    // Everying is element type.
-    EltList.push_back(CGF.Builder.CreateLoad(Ptr));
+void CGMSHLSLRuntime::LoadElements(CodeGenFunction &CGF,
+    ArrayRef<Value *> Ptrs, ArrayRef<QualType> QualTys,
+    SmallVector<Value *, 4> &Vals) {
+  for (size_t i = 0, e = Ptrs.size(); i < e; i++) {
+    Value *Ptr = Ptrs[i];
+    llvm::Type *Ty = Ptr->getType()->getPointerElementType();
+    DXASSERT_LOCALVAR(Ty, Ty->isIntegerTy() || Ty->isFloatingPointTy(), "Expected only element types.");
+    Value *Val = CGF.Builder.CreateLoad(Ptr);
+    Val = CGF.EmitFromMemory(Val, QualTys[i]);
+    Vals.push_back(Val);
   }
 }
 
-void CGMSHLSLRuntime::StoreFlattenedGepList(CodeGenFunction &CGF, ArrayRef<Value *> GepList,
-    ArrayRef<QualType> GepTyList, ArrayRef<Value *> EltValList, ArrayRef<QualType> SrcTyList) {
-  unsigned eltSize = GepList.size();
-  for (unsigned i = 0; i < eltSize; i++) {
-    Value *Ptr = GepList[i];
-    QualType DestType = GepTyList[i];
-    Value *Val = EltValList[i];
-    QualType SrcType = SrcTyList[i];
+void CGMSHLSLRuntime::ConvertAndStoreElements(CodeGenFunction &CGF,
+    ArrayRef<Value *> SrcVals, ArrayRef<QualType> SrcQualTys,
+    ArrayRef<Value *> DstPtrs, ArrayRef<QualType> DstQualTys) {
+  for (size_t i = 0, e = DstPtrs.size(); i < e; i++) {
+    Value *DstPtr = DstPtrs[i];
+    QualType DstQualTy = DstQualTys[i];
+    Value *SrcVal = SrcVals[i];
+    QualType SrcQualTy = SrcQualTys[i];
+    DXASSERT(SrcVal->getType()->isIntegerTy() || SrcVal->getType()->isFloatingPointTy(),
+      "Expected only element types.");
 
-    llvm::Type *Ty = Ptr->getType()->getPointerElementType();
-    // Everything is element type.
-    if (Ty != Val->getType()) {
-      Instruction::CastOps castOp =
-          static_cast<Instruction::CastOps>(HLModule::FindCastOp(
-              IsUnsigned(SrcType), IsUnsigned(DestType), Val->getType(), Ty));
-
-      Val = CGF.Builder.CreateCast(castOp, Val, Ty);
-    }
-    CGF.Builder.CreateStore(Val, Ptr);
+    llvm::Value *Result = ConvertScalarOrVector(CGF, SrcVal, SrcQualTy, DstQualTy);
+    Result = CGF.EmitToMemory(Result, DstQualTy);
+    CGF.Builder.CreateStore(Result, DstPtr);
   }
 }
 
@@ -6644,7 +6676,7 @@ void CGMSHLSLRuntime::EmitHLSLAggregateCopy(
                           PT->getElementType());
 
     idxList.pop_back();
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
     // Use matLd/St for matrix.
     Value *srcGEP = CGF.Builder.CreateInBoundsGEP(SrcPtr, idxList);
     Value *dstGEP = CGF.Builder.CreateInBoundsGEP(DestPtr, idxList);
@@ -6773,26 +6805,25 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF,
     }
   }
 
-  // It is possiable to implement EmitHLSLAggregateCopy, EmitHLSLAggregateStore
+  // It is possible to implement EmitHLSLAggregateCopy, EmitHLSLAggregateStore
   // the same way. But split value to scalar will generate many instruction when
   // src type is same as dest type.
-  SmallVector<Value *, 4> idxList;
-  SmallVector<Value *, 4> SrcGEPList;
-  SmallVector<QualType, 4> SrcEltTyList;
-  FlattenAggregatePtrToGepList(CGF, SrcPtr, idxList, SrcTy, SrcPtr->getType(),
-                               SrcGEPList, SrcEltTyList);
+  SmallVector<Value *, 4> GEPIdxStack;
+  SmallVector<Value *, 4> SrcPtrs;
+  SmallVector<QualType, 4> SrcQualTys;
+  FlattenAggregatePtrToGepList(CGF, SrcPtr, GEPIdxStack, SrcTy, SrcPtr->getType(),
+                               SrcPtrs, SrcQualTys);
 
-  SmallVector<Value *, 4> LdEltList;
-  LoadFlattenedGepList(CGF, SrcGEPList, SrcEltTyList, LdEltList);
+  SmallVector<Value *, 4> SrcVals;
+  LoadElements(CGF, SrcPtrs, SrcQualTys, SrcVals);
 
-  idxList.clear();
-  SmallVector<Value *, 4> DestGEPList;
-  SmallVector<QualType, 4> DestEltTyList;
-  FlattenAggregatePtrToGepList(CGF, DestPtr, idxList, DestTy,
-                               DestPtr->getType(), DestGEPList, DestEltTyList);
+  GEPIdxStack.clear();
+  SmallVector<Value *, 4> DstPtrs;
+  SmallVector<QualType, 4> DstQualTys;
+  FlattenAggregatePtrToGepList(CGF, DestPtr, GEPIdxStack, DestTy,
+                               DestPtr->getType(), DstPtrs, DstQualTys);
 
-  StoreFlattenedGepList(CGF, DestGEPList, DestEltTyList, LdEltList,
-                        SrcEltTyList);
+  ConvertAndStoreElements(CGF, SrcVals, SrcQualTys, DstPtrs, DstQualTys);
 }
 
 void CGMSHLSLRuntime::EmitHLSLAggregateStore(CodeGenFunction &CGF, llvm::Value *SrcVal,
@@ -6801,68 +6832,58 @@ void CGMSHLSLRuntime::EmitHLSLAggregateStore(CodeGenFunction &CGF, llvm::Value *
     DXASSERT(0, "aggregate return type will use SRet, no aggregate store should exist");
 }
 
-static void SimpleFlatValCopy(Value *DestPtr, Value *SrcVal, QualType Ty,
-                              QualType SrcTy, ArrayRef<Value *> idxList,
-                              CGBuilderTy &Builder) {
-  Value *DestGEP = Builder.CreateInBoundsGEP(DestPtr, idxList);
-  llvm::Type *ToTy = DestGEP->getType()->getPointerElementType();
+// Converts a scalar and either stores it to a scalar destination or splats it across a vector destination
+static void SimpleFlatValCopy(CodeGenFunction &CGF, 
+    Value *SrcVal, QualType SrcQualTy, Value *DstPtr, QualType DstQualTy) {
+  DXASSERT(SrcVal->getType() == CGF.ConvertType(SrcQualTy), "QualType/Type mismatch!");
+  
+  llvm::Type *DstTy = DstPtr->getType()->getPointerElementType();
+  DXASSERT(DstTy == CGF.ConvertTypeForMem(DstQualTy), "QualType/Type mismatch!");
 
-  llvm::Type *EltToTy = ToTy;
-  if (llvm::VectorType *VT = dyn_cast<llvm::VectorType>(ToTy)) {
-    EltToTy = VT->getElementType();
+  llvm::VectorType *DstVecTy = dyn_cast<llvm::VectorType>(DstTy);
+  QualType DstScalarQualTy = DstQualTy;
+  if (DstVecTy) {
+    DstScalarQualTy = hlsl::GetHLSLVecElementType(DstQualTy);
   }
 
-  if (EltToTy != SrcVal->getType()) {
-    Instruction::CastOps castOp =
-        static_cast<Instruction::CastOps>(HLModule::FindCastOp(
-            IsUnsigned(SrcTy), IsUnsigned(Ty), SrcVal->getType(), ToTy));
+  Value *ResultScalar = ConvertScalarOrVector(CGF, SrcVal, SrcQualTy, DstScalarQualTy);
+  ResultScalar = CGF.EmitToMemory(ResultScalar, DstScalarQualTy);
 
-    SrcVal = Builder.CreateCast(castOp, SrcVal, EltToTy);
-  }
-
-  if (llvm::VectorType *VT = dyn_cast<llvm::VectorType>(ToTy)) {
-    llvm::VectorType *VT1 = llvm::VectorType::get(EltToTy, 1);
-    Value *V1 =
-        Builder.CreateInsertElement(UndefValue::get(VT1), SrcVal, (uint64_t)0);
-    std::vector<int> shufIdx(VT->getNumElements(), 0);
-    Value *Vec = Builder.CreateShuffleVector(V1, V1, shufIdx);
-    Builder.CreateStore(Vec, DestGEP);
+  if (DstVecTy) {
+    llvm::VectorType *DstScalarVecTy = llvm::VectorType::get(ResultScalar->getType(), 1);
+    Value *ResultScalarVec = CGF.Builder.CreateInsertElement(
+      UndefValue::get(DstScalarVecTy), ResultScalar, (uint64_t)0);
+    std::vector<int> ShufIdx(DstVecTy->getNumElements(), 0);
+    Value *ResultVec = CGF.Builder.CreateShuffleVector(ResultScalarVec, ResultScalarVec, ShufIdx);
+    CGF.Builder.CreateStore(ResultVec, DstPtr);
   } else
-    Builder.CreateStore(SrcVal, DestGEP);
+    CGF.Builder.CreateStore(ResultScalar, DstPtr);
 }
 
-void CGMSHLSLRuntime::EmitHLSLFlatConversionToAggregate(
+void CGMSHLSLRuntime::EmitHLSLFlatConversion(
     CodeGenFunction &CGF, Value *SrcVal, llvm::Value *DestPtr,
     SmallVector<Value *, 4> &idxList, QualType Type, QualType SrcType,
     llvm::Type *Ty) {
   if (llvm::PointerType *PT = dyn_cast<llvm::PointerType>(Ty)) {
-    Constant *idx = Constant::getIntegerValue(
-        IntegerType::get(Ty->getContext(), 32), APInt(32, 0));
-    idxList.emplace_back(idx);
+    idxList.emplace_back(CGF.Builder.getInt32(0));
 
-    EmitHLSLFlatConversionToAggregate(CGF, SrcVal, DestPtr, idxList, Type,
+    EmitHLSLFlatConversion(CGF, SrcVal, DestPtr, idxList, Type,
                                       SrcType, PT->getElementType());
 
     idxList.pop_back();
-  } else if (HLMatrixLower::IsMatrixType(Ty)) {
+  } else if (dxilutil::IsHLSLMatrixType(Ty)) {
     // Use matLd/St for matrix.
     Value *dstGEP = CGF.Builder.CreateInBoundsGEP(DestPtr, idxList);
     unsigned row, col;
     llvm::Type *EltTy = HLMatrixLower::GetMatrixInfo(Ty, col, row);
 
     llvm::VectorType *VT1 = llvm::VectorType::get(EltTy, 1);
-    if (EltTy != SrcVal->getType()) {
-      Instruction::CastOps castOp =
-          static_cast<Instruction::CastOps>(HLModule::FindCastOp(
-              IsUnsigned(SrcType), IsUnsigned(Type), SrcVal->getType(), EltTy));
-
-      SrcVal = CGF.Builder.CreateCast(castOp, SrcVal, EltTy);
-    }
+    SrcVal = ConvertScalarOrVector(CGF, SrcVal, SrcType, hlsl::GetHLSLMatElementType(Type));
 
+    // Splat the value
     Value *V1 = CGF.Builder.CreateInsertElement(UndefValue::get(VT1), SrcVal,
                                                 (uint64_t)0);
     std::vector<int> shufIdx(col * row, 0);
-
     Value *VecMat = CGF.Builder.CreateShuffleVector(V1, V1, shufIdx);
     Value *MatInit = EmitHLSLMatrixOperationCallImp(
         CGF.Builder, HLOpcodeGroup::HLInit, 0, Ty, {VecMat}, TheModule);
@@ -6889,7 +6910,7 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionToAggregate(
           Constant *idx = llvm::Constant::getIntegerValue(
               IntegerType::get(Ty->getContext(), 32), APInt(32, i));
           idxList.emplace_back(idx);
-          EmitHLSLFlatConversionToAggregate(CGF, SrcVal, DestPtr, idxList,
+          EmitHLSLFlatConversion(CGF, SrcVal, DestPtr, idxList,
                                             parentTy, SrcType, ET);
           idxList.pop_back();
         }
@@ -6904,7 +6925,7 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionToAggregate(
           IntegerType::get(Ty->getContext(), 32), APInt(32, i));
       idxList.emplace_back(idx);
 
-      EmitHLSLFlatConversionToAggregate(CGF, SrcVal, DestPtr, idxList,
+      EmitHLSLFlatConversion(CGF, SrcVal, DestPtr, idxList,
                                         fieldIter->getType(), SrcType, ET);
 
       idxList.pop_back();
@@ -6920,43 +6941,42 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionToAggregate(
           IntegerType::get(Ty->getContext(), 32), APInt(32, i));
       idxList.emplace_back(idx);
 
-      EmitHLSLFlatConversionToAggregate(CGF, SrcVal, DestPtr, idxList, EltType,
+      EmitHLSLFlatConversion(CGF, SrcVal, DestPtr, idxList, EltType,
                                         SrcType, ET);
 
       idxList.pop_back();
     }
   } else {
-    SimpleFlatValCopy(DestPtr, SrcVal, Type, SrcType, idxList, CGF.Builder);
+    DestPtr = CGF.Builder.CreateInBoundsGEP(DestPtr, idxList);
+    SimpleFlatValCopy(CGF, SrcVal, SrcType, DestPtr, Type);
   }
 }
 
-void CGMSHLSLRuntime::EmitHLSLFlatConversionToAggregate(CodeGenFunction &CGF,
-                                                        Value *Val,
-                                                        Value *DestPtr,
-                                                        QualType Ty,
-                                                        QualType SrcTy) {
+void CGMSHLSLRuntime::EmitHLSLFlatConversion(CodeGenFunction &CGF,
+                                             Value *Val,
+                                             Value *DestPtr,
+                                             QualType Ty,
+                                             QualType SrcTy) {
   if (SrcTy->isBuiltinType()) {
     SmallVector<Value *, 4> idxList;
     // Add first 0 for DestPtr.
-    Constant *idx = Constant::getIntegerValue(
-        IntegerType::get(Val->getContext(), 32), APInt(32, 0));
-    idxList.emplace_back(idx);
+    idxList.emplace_back(CGF.Builder.getInt32(0));
 
-    EmitHLSLFlatConversionToAggregate(
+    EmitHLSLFlatConversion(
         CGF, Val, DestPtr, idxList, Ty, SrcTy,
         DestPtr->getType()->getPointerElementType());
   }
   else {
-    SmallVector<Value *, 4> idxList;
-    SmallVector<Value *, 4> DestGEPList;
-    SmallVector<QualType, 4> DestEltTyList;
-    FlattenAggregatePtrToGepList(CGF, DestPtr, idxList, Ty, DestPtr->getType(), DestGEPList, DestEltTyList);
+    SmallVector<Value *, 4> GEPIdxStack;
+    SmallVector<Value *, 4> DstPtrs;
+    SmallVector<QualType, 4> DstQualTys;
+    FlattenAggregatePtrToGepList(CGF, DestPtr, GEPIdxStack, Ty, DestPtr->getType(), DstPtrs, DstQualTys);
 
-    SmallVector<Value *, 4> EltList;
-    SmallVector<QualType, 4> EltTyList;
-    FlattenValToInitList(CGF, EltList, EltTyList, SrcTy, Val);
+    SmallVector<Value *, 4> SrcVals;
+    SmallVector<QualType, 4> SrcQualTys;
+    FlattenValToInitList(CGF, SrcVals, SrcQualTys, SrcTy, Val);
 
-    StoreFlattenedGepList(CGF, DestGEPList, DestEltTyList, EltList, EltTyList);
+    ConvertAndStoreElements(CGF, SrcVals, SrcQualTys, DstPtrs, DstQualTys);
   }
 }
 
@@ -7063,15 +7083,10 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionInit(
     BasicBlock *InsertBlock = CGF.Builder.GetInsertBlock();
     Function *F = InsertBlock->getParent();
 
-    if (ParamTy->isBooleanType()) {
-      // Create i32 for bool.
-      ParamTy = CGM.getContext().IntTy;
-    }
     // Make sure the alloca is in entry block to stop inline create stacksave.
     IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(F));
-    tmpArgAddr = AllocaBuilder.CreateAlloca(CGF.ConvertType(ParamTy));
+    tmpArgAddr = AllocaBuilder.CreateAlloca(CGF.ConvertTypeForMem(ParamTy));
 
-      
     // add it to local decl map
     TmpArgMap(tmpArg, tmpArgAddr);
 
@@ -7093,9 +7108,8 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionInit(
         !isObject) {
       QualType ArgTy = Arg->getType();
       Value *outVal = nullptr;
-      bool isAggrageteTy = ParamTy->isAggregateType();
-      isAggrageteTy &= !IsHLSLVecMatType(ParamTy);
-      if (!isAggrageteTy) {
+      bool isAggregateTy = ParamTy->isAggregateType() && !IsHLSLVecMatType(ParamTy);
+      if (!isAggregateTy) {
         if (!IsHLSLMatType(ParamTy)) {
           RValue outRVal = CGF.EmitLoadOfLValue(argLV, SourceLocation());
           outVal = outRVal.getScalarVal();
@@ -7105,16 +7119,15 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionInit(
         }
 
         llvm::Type *ToTy = tmpArgAddr->getType()->getPointerElementType();
-        Instruction::CastOps castOp =
-            static_cast<Instruction::CastOps>(HLModule::FindCastOp(
-                IsUnsigned(argLV.getType()), IsUnsigned(tmpLV.getType()),
-                outVal->getType(), ToTy));
-
-        Value *castVal = CGF.Builder.CreateCast(castOp, outVal, ToTy);
-        if (!HLMatrixLower::IsMatrixType(ToTy))
-          CGF.Builder.CreateStore(castVal, tmpArgAddr);
-        else
+        if (dxilutil::IsHLSLMatrixType(ToTy)) {
+          Value *castVal = CGF.Builder.CreateBitCast(outVal, ToTy);
           EmitHLSLMatrixStore(CGF, castVal, tmpArgAddr, ParamTy);
+        }
+        else {
+          Value *castVal = ConvertScalarOrVector(CGF, outVal, argLV.getType(), tmpLV.getType());
+          castVal = CGF.EmitToMemory(castVal, tmpLV.getType());
+          CGF.Builder.CreateStore(castVal, tmpArgAddr);
+        }
       } else {
         SmallVector<Value *, 4> idxList;
         EmitHLSLAggregateCopy(CGF, argLV.getAddress(), tmpLV.getAddress(),
@@ -7150,6 +7163,8 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionCopyBack(
         else
           outVal = EmitHLSLMatrixLoad(CGF, tmpArgAddr, ParamTy);
 
+        outVal = CGF.EmitFromMemory(outVal, ParamTy);
+
         llvm::Type *ToTy = CGF.ConvertType(ArgTy);
         llvm::Type *FromTy = outVal->getType();
         Value *castVal = outVal;
@@ -7170,14 +7185,10 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionCopyBack(
                 CGF.Builder.CreateInsertElement(castVal, outVal, (uint64_t)0);
           }
         } else {
-          Instruction::CastOps castOp =
-              static_cast<Instruction::CastOps>(HLModule::FindCastOp(
-                  IsUnsigned(tmpLV.getType()), IsUnsigned(argLV.getType()),
-                  outVal->getType(), ToTy));
-
-          castVal = CGF.Builder.CreateCast(castOp, outVal, ToTy);
+          castVal = ConvertScalarOrVector(CGF,
+            outVal, tmpLV.getType(), argLV.getType());
         }
-        if (!HLMatrixLower::IsMatrixType(ToTy))
+        if (!dxilutil::IsHLSLMatrixType(ToTy))
           CGF.EmitStoreThroughLValue(RValue::get(castVal), argLV);
         else {
           Value *destPtr = argLV.getAddress();
diff --git a/tools/clang/lib/CodeGen/CGHLSLRuntime.cpp b/tools/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 63244301b..f65eeede0 100644
--- a/tools/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/tools/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -21,23 +21,3 @@ using namespace clang;
 using namespace CodeGen;
 
 CGHLSLRuntime::~CGHLSLRuntime() {}
-
-bool CGHLSLRuntime::IsHLSLVecMatType(clang::QualType &type) {
-  return hlsl::IsHLSLVecMatType(type);
-}
-
-const clang::ExtVectorType *CGHLSLRuntime::ConvertHLSLVecMatTypeToExtVectorType(
-    const clang::ASTContext &context, clang::QualType &type) {
-  return hlsl::ConvertHLSLVecMatTypeToExtVectorType(context, type);
-}
-
-QualType CGHLSLRuntime::GetHLSLVecMatElementType(QualType type) {
-  const Type *Ty = type.getCanonicalType().getTypePtr();
-  // TODO: check isVecMatrix
-  const RecordType *RT = cast<RecordType>(Ty);
-  const ClassTemplateSpecializationDecl *templateDecl =
-            cast<ClassTemplateSpecializationDecl>(RT->getDecl());
-  const TemplateArgumentList &argList = templateDecl->getTemplateArgs();
-  const TemplateArgument &arg0 = argList[0];
-  return arg0.getAsType();
-}
diff --git a/tools/clang/lib/CodeGen/CGHLSLRuntime.h b/tools/clang/lib/CodeGen/CGHLSLRuntime.h
index 7f84e58fa..cf0848c41 100644
--- a/tools/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/tools/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -102,7 +102,7 @@ public:
   virtual void EmitHLSLAggregateStore(CodeGenFunction &CGF, llvm::Value *Val,
                                    llvm::Value *DestPtr,
                                    clang::QualType Ty) = 0;
-  virtual void EmitHLSLFlatConversionToAggregate(CodeGenFunction &CGF, llvm::Value *Val,
+  virtual void EmitHLSLFlatConversion(CodeGenFunction &CGF, llvm::Value *Val,
                                    llvm::Value *DestPtr,
                                    clang::QualType Ty, clang::QualType SrcTy) = 0;
   virtual void EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF, llvm::Value *SrcPtr,
@@ -122,11 +122,6 @@ public:
   virtual void AddControlFlowHint(CodeGenFunction &CGF, const Stmt &S, llvm::TerminatorInst *TI, llvm::ArrayRef<const Attr *> Attrs) = 0;
 
   virtual void FinishAutoVar(CodeGenFunction &CGF, const VarDecl &D, llvm::Value *V) = 0;
-  static const clang::ExtVectorType *
-  ConvertHLSLVecMatTypeToExtVectorType(const clang::ASTContext &context,
-                                       clang::QualType &type);
-  static bool IsHLSLVecMatType(clang::QualType &type);
-  static clang::QualType GetHLSLVecMatElementType(clang::QualType type);
 };
 
 /// Create an instance of a HLSL runtime class.
diff --git a/tools/clang/lib/CodeGen/CGStmt.cpp b/tools/clang/lib/CodeGen/CGStmt.cpp
index c5eaf3122..5b238d9a2 100644
--- a/tools/clang/lib/CodeGen/CGStmt.cpp
+++ b/tools/clang/lib/CodeGen/CGStmt.cpp
@@ -1122,7 +1122,7 @@ void CodeGenFunction::EmitReturnStmt(const ReturnStmt &S) {
     case TEK_Scalar:
       // HLSL Change Begins.
       if (hlsl::IsHLSLMatType(RV->getType())) {
-        CGM.getHLSLRuntime().EmitHLSLMatrixStore(*this, EmitScalarExpr(RV), ReturnValue, RV->getType());
+        CGM.getHLSLRuntime().EmitHLSLMatrixStore(*this, EmitScalarExpr(RV), ReturnValue, FnRetTy);
       } else
         // HLSL Change Ends.
         Builder.CreateStore(EmitScalarExpr(RV), ReturnValue);
diff --git a/tools/clang/lib/CodeGen/CodeGenModule.cpp b/tools/clang/lib/CodeGen/CodeGenModule.cpp
index a6d41bc77..6ce712b1a 100644
--- a/tools/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/tools/clang/lib/CodeGen/CodeGenModule.cpp
@@ -612,11 +612,14 @@ StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
 
   const auto *ND = cast<NamedDecl>(GD.getDecl());
   // HLSL Change Starts
+  // Entry point doesn't get mangled
   if (ND->getKind() == Decl::Function &&
-      ND->getNameAsString() == CodeGenOpts.HLSLEntryFunction) {
+    ND->getDeclContext()->getDeclKind() == Decl::Kind::TranslationUnit &&
+    ND->getNameAsString() == CodeGenOpts.HLSLEntryFunction) {
     return CodeGenOpts.HLSLEntryFunction;
   }
   // HLSL Change Ends
+
   SmallString<256> Buffer;
   StringRef Str;
   if (getCXXABI().getMangleContext().shouldMangleDeclName(ND)) {
diff --git a/tools/clang/lib/CodeGen/CodeGenTypes.cpp b/tools/clang/lib/CodeGen/CodeGenTypes.cpp
index 425c35ff2..5641cfc8a 100644
--- a/tools/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/tools/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -109,15 +109,26 @@ void CodeGenTypes::addRecordTypeName(const RecordDecl *RD,
 /// a type.  For example, the scalar representation for _Bool is i1, but the
 /// memory representation is usually i8 or i32, depending on the target.
 llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) {
+  // HLSL Change Starts
+  if (hlsl::IsHLSLVecType(T)) {
+    // Vectors of bools in memory should become vectors of
+    // the memory representation of their elements.
+    // Clang doesn't do this for plain VectorTypes, which is fine:
+    // otherwise a bool1x1 matrix would become [n x <m x i32>],
+    // since array elements always have memory representation.
+    QualType ElemT = hlsl::GetElementTypeOrType(T);
+    return llvm::VectorType::get(ConvertTypeForMem(ElemT), hlsl::GetHLSLVecSize(T));
+  }
+
   llvm::Type *R = ConvertType(T);
 
-  // If this is a non-bool type, don't map it.
-  if (!R->isIntegerTy(1))
-    return R;
+  if (R->isIntegerTy(1)) {
+    // Bools have a different representation in memory
+    return llvm::IntegerType::get(getLLVMContext(), (unsigned)Context.getTypeSize(T));
+  }
 
-  // Otherwise, return an integer of the target-specified size.
-  return llvm::IntegerType::get(getLLVMContext(),
-                                (unsigned)Context.getTypeSize(T));
+  return R;
+  // HLSL Change Ends
 }
 
 
diff --git a/tools/clang/lib/Frontend/ASTUnit.cpp b/tools/clang/lib/Frontend/ASTUnit.cpp
index 253027036..66bc2571d 100644
--- a/tools/clang/lib/Frontend/ASTUnit.cpp
+++ b/tools/clang/lib/Frontend/ASTUnit.cpp
@@ -2054,14 +2054,10 @@ ASTUnit *ASTUnit::LoadFromCommandLine(
   AST.reset(new ASTUnit(false));
   // HLSL Change Starts
   AST->HlslLangExtensions = HlslLangExtensions;
-  // Enable -verify on the libclang initialization path.
-  bool VerifyDiagnostics = false;
-  for (const char** Arg = ArgBegin; Arg != ArgEnd; ++Arg) {
-    if (strcmp(*Arg, "-verify") == 0) {
-      VerifyDiagnostics = true;
-      break;
-    }
-  }
+  // Enable -verify and -verify-ignore-unexpected on the libclang initialization path.
+  bool VerifyDiagnostics = CI->getDiagnosticOpts().VerifyDiagnostics;
+  Diags->getDiagnosticOptions().setVerifyIgnoreUnexpected(
+	  CI->getDiagnosticOpts().getVerifyIgnoreUnexpected());
   // HLSL Change Ends
   ConfigureDiags(Diags, *AST, CaptureDiagnostics, VerifyDiagnostics); // HLSL Change
   AST->Diagnostics = Diags;
diff --git a/tools/clang/lib/Parse/ParseDecl.cpp b/tools/clang/lib/Parse/ParseDecl.cpp
index 294d7eeea..7477e8daa 100644
--- a/tools/clang/lib/Parse/ParseDecl.cpp
+++ b/tools/clang/lib/Parse/ParseDecl.cpp
@@ -3478,6 +3478,13 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
       if (DS.hasTypeSpecifier() && DS.hasTagDefinition())
         goto DoneWithDeclSpec;
 
+      // HLSL Change Starts
+      // Remember the current state of the default matrix orientation,
+      // since it can change between any two tokens with #pragma pack_matrix
+      if (Parser::Actions.HasDefaultMatrixPack)
+        DS.SetDefaultMatrixPackRowMajor(Parser::Actions.DefaultMatrixPackRowMajor);
+      // HLSL Change Ends
+
       if (Tok.getAnnotationValue()) {
         ParsedType T = getTypeAnnotation(Tok);
         isInvalid = DS.SetTypeSpecType(DeclSpec::TST_typename, Loc, PrevSpec,
@@ -3588,12 +3595,20 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
           Actions.isCurrentClassName(*Tok.getIdentifierInfo(), getCurScope()) &&
           isConstructorDeclarator(/*Unqualified*/true))
         goto DoneWithDeclSpec;
-      // HLSL Change start - modify TypeRep for unsigned vectors/matrix
+
+      // HLSL Change Starts
+      // Modify TypeRep for unsigned vectors/matrix
       QualType qt = TypeRep.get();
       QualType newType = ApplyTypeSpecSignToParsedType(&Actions, qt, DS.getTypeSpecSign(), Loc);
       isInvalid = DS.SetTypeSpecType(DeclSpec::TST_typename, Loc, PrevSpec,
                                      DiagID, ParsedType::make(newType), Policy);
-      // HLSL Change end
+
+      // Remember the current state of the default matrix orientation,
+      // since it can change between any two tokens with #pragma pack_matrix
+      if (Parser::Actions.HasDefaultMatrixPack)
+        DS.SetDefaultMatrixPackRowMajor(Parser::Actions.DefaultMatrixPackRowMajor);
+      // HLSL Change Ends
+
       if (isInvalid)
         break;
 
diff --git a/tools/clang/lib/SPIRV/InstBuilderAuto.cpp b/tools/clang/lib/SPIRV/InstBuilderAuto.cpp
index abc283546..db4f0c191 100644
--- a/tools/clang/lib/SPIRV/InstBuilderAuto.cpp
+++ b/tools/clang/lib/SPIRV/InstBuilderAuto.cpp
@@ -16,7 +16,7 @@
 namespace clang {
 namespace spirv {
 
-static_assert(spv::Version == 0x00010300 && spv::Revision == 1,
+static_assert(spv::Version == 0x00010300 && spv::Revision == 6,
               "Needs to regenerate outdated InstBuilder");
 
 namespace {
diff --git a/tools/clang/lib/SPIRV/ModuleBuilder.cpp b/tools/clang/lib/SPIRV/ModuleBuilder.cpp
index b35457f26..adac762d8 100644
--- a/tools/clang/lib/SPIRV/ModuleBuilder.cpp
+++ b/tools/clang/lib/SPIRV/ModuleBuilder.cpp
@@ -893,6 +893,11 @@ void ModuleBuilder::decorateNonUniformEXT(uint32_t targetId) {
   theModule.addDecoration(d, targetId);
 }
 
+void ModuleBuilder::decorateNoContraction(uint32_t targetId) {
+  const Decoration *d = Decoration::getNoContraction(theContext);
+  theModule.addDecoration(d, targetId);
+}
+
 #define IMPL_GET_PRIMITIVE_TYPE(ty)                                            \
                                                                                \
   uint32_t ModuleBuilder::get##ty##Type() {                                    \
diff --git a/tools/clang/lib/SPIRV/SPIRVEmitter.cpp b/tools/clang/lib/SPIRV/SPIRVEmitter.cpp
index 02dd8fec0..466305307 100644
--- a/tools/clang/lib/SPIRV/SPIRVEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SPIRVEmitter.cpp
@@ -3455,12 +3455,14 @@ bool SPIRVEmitter::tryToAssignCounterVar(const DeclaratorDecl *dstDecl,
   const auto *srcFields = getIntermediateACSBufferCounter(srcExpr, &srcIndices);
 
   if (dstFields && srcFields) {
-    if (!dstFields->assign(*srcFields, theBuilder, typeTranslator)) {
-      emitFatalError("cannot handle associated counter variable assignment",
-                     srcExpr->getExprLoc());
-      return false;
-    }
-    return true;
+    // The destination is a struct whose fields are directly alias resources.
+    // But that's not necessarily true for the source, which can be deeply
+    // nested structs. That means they will have different index "prefixes"
+    // for all their fields, while the "prefix" for the destination is
+    // effectively an empty list (since it is not nested in other structs).
+    // We need to strip the index prefix from the source.
+    return dstFields->assign(*srcFields, /*dstIndices=*/{}, srcIndices,
+                             theBuilder, typeTranslator);
   }
 
   // AssocCounter#2 and AssocCounter#4 for the lhs cannot happen since the lhs
@@ -6574,6 +6576,9 @@ SpirvEvalInfo SPIRVEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
   case hlsl::IntrinsicOp::IOP_lit:
     retVal = processIntrinsicLit(callExpr);
     break;
+  case hlsl::IntrinsicOp::IOP_mad:
+    retVal = processIntrinsicMad(callExpr);
+    break;
   case hlsl::IntrinsicOp::IOP_modf:
     retVal = processIntrinsicModf(callExpr);
     break;
@@ -6747,7 +6752,6 @@ SpirvEvalInfo SPIRVEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
     INTRINSIC_OP_CASE(lerp, FMix, true);
     INTRINSIC_OP_CASE(log, Log, true);
     INTRINSIC_OP_CASE(log2, Log2, true);
-    INTRINSIC_OP_CASE(mad, Fma, true);
     INTRINSIC_OP_CASE_SINT_UINT_FLOAT(max, SMax, UMax, FMax, true);
     INTRINSIC_OP_CASE(umax, UMax, true);
     INTRINSIC_OP_CASE_SINT_UINT_FLOAT(min, SMin, UMin, FMin, true);
@@ -7425,6 +7429,105 @@ uint32_t SPIRVEmitter::processIntrinsicModf(const CallExpr *callExpr) {
   return 0;
 }
 
+uint32_t SPIRVEmitter::processIntrinsicMad(const CallExpr *callExpr) {
+  // Signature is: ret mad(a,b,c)
+  // All of the above must be a scalar, vector, or matrix with the same
+  // component types. Component types can be float or int.
+  // The return value is equal to "a * b + c"
+
+  // In the case of float arguments, we can use the GLSL extended instruction
+  // set's Fma instruction with NoContraction decoration. In the case of integer
+  // arguments, we'll have to manually perform an OpIMul followed by an OpIAdd
+  // (We should also apply NoContraction decoration to these two instructions to
+  // get precise arithmetic).
+
+  // TODO: We currently don't propagate the NoContraction decoration.
+
+  const Expr *arg0 = callExpr->getArg(0);
+  const Expr *arg1 = callExpr->getArg(1);
+  const Expr *arg2 = callExpr->getArg(2);
+  // All arguments and the return type are the same.
+  const auto argType = arg0->getType();
+  const auto argTypeId = typeTranslator.translateType(argType);
+  const uint32_t arg0Id = doExpr(arg0);
+  const uint32_t arg1Id = doExpr(arg1);
+  const uint32_t arg2Id = doExpr(arg2);
+
+  // For floating point arguments, we can use the extended instruction set's Fma
+  // instruction. Sadly we can't simply call processIntrinsicUsingGLSLInst
+  // because we need to specifically decorate the Fma instruction with
+  // NoContraction decoration.
+  if (isFloatOrVecMatOfFloatType(argType)) {
+    const auto opcode = GLSLstd450::GLSLstd450Fma;
+    const uint32_t glslInstSetId = theBuilder.getGLSLExtInstSet();
+    // For matrix cases, operate on each row of the matrix.
+    if (isMxNMatrix(arg0->getType())) {
+      const auto actOnEachVec = [this, glslInstSetId, opcode, arg1Id,
+                                 arg2Id](uint32_t index, uint32_t vecType,
+                                         uint32_t arg0RowId) {
+        const uint32_t arg1RowId =
+            theBuilder.createCompositeExtract(vecType, arg1Id, {index});
+        const uint32_t arg2RowId =
+            theBuilder.createCompositeExtract(vecType, arg2Id, {index});
+        const uint32_t fma = theBuilder.createExtInst(
+            vecType, glslInstSetId, opcode, {arg0RowId, arg1RowId, arg2RowId});
+        theBuilder.decorateNoContraction(fma);
+        return fma;
+      };
+      return processEachVectorInMatrix(arg0, arg0Id, actOnEachVec);
+    }
+    // Non-matrix cases
+    const uint32_t fma = theBuilder.createExtInst(
+        argTypeId, glslInstSetId, opcode, {arg0Id, arg1Id, arg2Id});
+    theBuilder.decorateNoContraction(fma);
+    return fma;
+  }
+
+  // For scalar and vector argument types.
+  {
+    if (isScalarType(argType) || isVectorType(argType)) {
+      const auto mul =
+          theBuilder.createBinaryOp(spv::Op::OpIMul, argTypeId, arg0Id, arg1Id);
+      const auto add =
+          theBuilder.createBinaryOp(spv::Op::OpIAdd, argTypeId, mul, arg2Id);
+      theBuilder.decorateNoContraction(mul);
+      theBuilder.decorateNoContraction(add);
+      return add;
+    }
+  }
+
+  // For matrix argument types.
+  {
+    uint32_t rowCount = 0, colCount = 0;
+    QualType elemType = {};
+    if (isMxNMatrix(argType, &elemType, &rowCount, &colCount)) {
+      const auto elemTypeId = typeTranslator.translateType(elemType);
+      const auto colTypeId = theBuilder.getVecType(elemTypeId, colCount);
+      llvm::SmallVector<uint32_t, 4> resultRows;
+      for (uint32_t i = 0; i < rowCount; ++i) {
+        const auto rowArg0 =
+            theBuilder.createCompositeExtract(colTypeId, arg0Id, {i});
+        const auto rowArg1 =
+            theBuilder.createCompositeExtract(colTypeId, arg1Id, {i});
+        const auto rowArg2 =
+            theBuilder.createCompositeExtract(colTypeId, arg2Id, {i});
+        const auto mul = theBuilder.createBinaryOp(spv::Op::OpIMul, colTypeId,
+                                                   rowArg0, rowArg1);
+        const auto add =
+            theBuilder.createBinaryOp(spv::Op::OpIAdd, colTypeId, mul, rowArg2);
+        theBuilder.decorateNoContraction(mul);
+        theBuilder.decorateNoContraction(add);
+        resultRows.push_back(add);
+      }
+      return theBuilder.createCompositeConstruct(argTypeId, resultRows);
+    }
+  }
+
+  emitError("invalid argument type passed to mad intrinsic function",
+            callExpr->getExprLoc());
+  return 0;
+}
+
 uint32_t SPIRVEmitter::processIntrinsicLit(const CallExpr *callExpr) {
   // Signature is: float4 lit(float n_dot_l, float n_dot_h, float m)
   //
@@ -7763,7 +7866,6 @@ uint32_t SPIRVEmitter::processIntrinsicMemoryBarrier(const CallExpr *callExpr,
       spv::MemorySemanticsMask::ImageMemory |
       spv::MemorySemanticsMask::UniformMemory |
       spv::MemorySemanticsMask::WorkgroupMemory |
-      spv::MemorySemanticsMask::AtomicCounterMemory |
       spv::MemorySemanticsMask::AcquireRelease;
 
   // Get <result-id> for execution scope.
diff --git a/tools/clang/lib/SPIRV/SPIRVEmitter.h b/tools/clang/lib/SPIRV/SPIRVEmitter.h
index 7a0184304..3e3ac1765 100644
--- a/tools/clang/lib/SPIRV/SPIRVEmitter.h
+++ b/tools/clang/lib/SPIRV/SPIRVEmitter.h
@@ -359,6 +359,9 @@ private:
   uint32_t processIntrinsicMemoryBarrier(const CallExpr *, bool isDevice,
                                          bool groupSync, bool isAllBarrier);
 
+  /// Processes the 'mad' intrinsic function.
+  uint32_t processIntrinsicMad(const CallExpr *);
+
   /// Processes the 'modf' intrinsic function.
   uint32_t processIntrinsicModf(const CallExpr *);
 
diff --git a/tools/clang/lib/Sema/SemaAttr.cpp b/tools/clang/lib/Sema/SemaAttr.cpp
index 8409b9a33..252006180 100644
--- a/tools/clang/lib/Sema/SemaAttr.cpp
+++ b/tools/clang/lib/Sema/SemaAttr.cpp
@@ -266,13 +266,10 @@ void Sema::ActOnPragmaPack(PragmaPackKind Kind, IdentifierInfo *Name,
 }
 
 void Sema::ActOnPragmaPackMatrix(bool bRowMajor, SourceLocation PragmaLoc) {
-  if (bRowMajor) {
-    PackMatrixRowMajorPragmaOn = true;
-    PackMatrixColMajorPragmaOn = false;
-  } else {
-    PackMatrixRowMajorPragmaOn = false;
-    PackMatrixColMajorPragmaOn = true;
-  }
+  // Once we've encountered one #pragma pack_matrix, we have a well-defined
+  // default orientation for the rest of the program, even if we're rewriting.
+  HasDefaultMatrixPack = true;
+  DefaultMatrixPackRowMajor = bRowMajor;
 }
 
 void Sema::ActOnPragmaMSStruct(PragmaMSStructKind Kind) { 
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index e1b7bf24d..3ef13faf7 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -185,7 +185,7 @@ enum ArBasicKind {
   AR_OBJECT_WAVE,
 
   AR_OBJECT_RAY_DESC,
-  AR_OBJECT_ACCELARATION_STRUCT,
+  AR_OBJECT_ACCELERATION_STRUCT,
   AR_OBJECT_USER_DEFINED_TYPE,
   AR_OBJECT_TRIANGLE_INTERSECTION_ATTRIBUTES,
 
@@ -462,7 +462,7 @@ const UINT g_uBasicKindProps[] =
   BPROP_OBJECT,   // AR_OBJECT_WAVE
 
   LICOMPTYPE_RAYDESC,               // AR_OBJECT_RAY_DESC
-  LICOMPTYPE_ACCELERATION_STRUCT,   // AR_OBJECT_ACCELARATION_STRUCT
+  LICOMPTYPE_ACCELERATION_STRUCT,   // AR_OBJECT_ACCELERATION_STRUCT
   LICOMPTYPE_USER_DEFINED_TYPE,      // AR_OBJECT_USER_DEFINED_TYPE
   0,      // AR_OBJECT_TRIANGLE_INTERSECTION_ATTRIBUTES
 
@@ -1098,9 +1098,9 @@ static const ArBasicKind g_RayDescCT[] =
   AR_BASIC_UNKNOWN
 };
 
-static const ArBasicKind g_AccelarationStructCT[] =
+static const ArBasicKind g_AccelerationStructCT[] =
 {
-  AR_OBJECT_ACCELARATION_STRUCT,
+  AR_OBJECT_ACCELERATION_STRUCT,
   AR_BASIC_UNKNOWN
 };
 
@@ -1201,7 +1201,7 @@ const ArBasicKind* g_LegalIntrinsicCompTypes[] =
   g_UInt16CT,           // LICOMPTYPE_UINT16
   g_Numeric16OnlyCT,    // LICOMPTYPE_NUMERIC16_ONLY
   g_RayDescCT,          // LICOMPTYPE_RAYDESC
-  g_AccelarationStructCT,   // LICOMPTYPE_ACCELERATION_STRUCT,
+  g_AccelerationStructCT,   // LICOMPTYPE_ACCELERATION_STRUCT,
   g_UDTCT,              // LICOMPTYPE_USER_DEFINED_TYPE
 };
 C_ASSERT(ARRAYSIZE(g_LegalIntrinsicCompTypes) == LICOMPTYPE_COUNT);
@@ -1275,7 +1275,7 @@ const ArBasicKind g_ArBasicKindsAsTypes[] =
 
   AR_OBJECT_WAVE,
   AR_OBJECT_RAY_DESC,
-  AR_OBJECT_ACCELARATION_STRUCT,
+  AR_OBJECT_ACCELERATION_STRUCT,
   AR_OBJECT_TRIANGLE_INTERSECTION_ATTRIBUTES,
 
   // subobjects
@@ -1355,7 +1355,7 @@ const uint8_t g_ArBasicKindsTemplateCount[] =
   0, // AR_OBJECT_LEGACY_EFFECT   // Used for all unsupported but ignored legacy effect types
   0, // AR_OBJECT_WAVE
   0, // AR_OBJECT_RAY_DESC
-  0, // AR_OBJECT_ACCELARATION_STRUCT
+  0, // AR_OBJECT_ACCELERATION_STRUCT
   0, // AR_OBJECT_TRIANGLE_INTERSECTION_ATTRIBUTES
 
   0, // AR_OBJECT_STATE_OBJECT_CONFIG,
@@ -1444,7 +1444,7 @@ const SubscriptOperatorRecord g_ArBasicKindsSubscripts[] =
   { 0, MipsFalse, SampleFalse }, // AR_OBJECT_LEGACY_EFFECT (legacy effect objects)
   { 0, MipsFalse, SampleFalse },  // AR_OBJECT_WAVE
   { 0, MipsFalse, SampleFalse },  // AR_OBJECT_RAY_DESC
-  { 0, MipsFalse, SampleFalse },  // AR_OBJECT_ACCELARATION_STRUCT
+  { 0, MipsFalse, SampleFalse },  // AR_OBJECT_ACCELERATION_STRUCT
   { 0, MipsFalse, SampleFalse },  // AR_OBJECT_TRIANGLE_INTERSECTION_ATTRIBUTES
 
   { 0, MipsFalse, SampleFalse },  // AR_OBJECT_STATE_OBJECT_CONFIG,
@@ -1573,24 +1573,26 @@ const char* g_ArBasicTypeNames[] =
 
 C_ASSERT(_countof(g_ArBasicTypeNames) == AR_BASIC_MAXIMUM_COUNT);
 
+static bool IsValidBasicKind(ArBasicKind kind) {
+  return kind != AR_BASIC_COUNT &&
+    kind != AR_BASIC_NONE &&
+    kind != AR_BASIC_UNKNOWN &&
+    kind != AR_BASIC_NOCAST &&
+    kind != AR_BASIC_POINTER &&
+    kind != AR_OBJECT_RENDERTARGETVIEW &&
+    kind != AR_OBJECT_DEPTHSTENCILVIEW &&
+    kind != AR_OBJECT_COMPUTESHADER &&
+    kind != AR_OBJECT_DOMAINSHADER &&
+    kind != AR_OBJECT_GEOMETRYSHADER &&
+    kind != AR_OBJECT_HULLSHADER &&
+    kind != AR_OBJECT_PIXELSHADER &&
+    kind != AR_OBJECT_VERTEXSHADER &&
+    kind != AR_OBJECT_PIXELFRAGMENT &&
+    kind != AR_OBJECT_VERTEXFRAGMENT;
+}
 // kind should never be a flag value or effects framework type - we simply do not expect to deal with these
 #define DXASSERT_VALIDBASICKIND(kind) \
-  DXASSERT(\
-  kind != AR_BASIC_COUNT && \
-  kind != AR_BASIC_NONE && \
-  kind != AR_BASIC_UNKNOWN && \
-  kind != AR_BASIC_NOCAST && \
-  kind != AR_BASIC_POINTER && \
-  kind != AR_OBJECT_RENDERTARGETVIEW && \
-  kind != AR_OBJECT_DEPTHSTENCILVIEW && \
-  kind != AR_OBJECT_COMPUTESHADER && \
-  kind != AR_OBJECT_DOMAINSHADER && \
-  kind != AR_OBJECT_GEOMETRYSHADER && \
-  kind != AR_OBJECT_HULLSHADER && \
-  kind != AR_OBJECT_PIXELSHADER && \
-  kind != AR_OBJECT_VERTEXSHADER && \
-  kind != AR_OBJECT_PIXELFRAGMENT && \
-  kind != AR_OBJECT_VERTEXFRAGMENT, "otherwise caller is using a special flag or an unsupported kind value");
+  DXASSERT(IsValidBasicKind(kind), "otherwise caller is using a special flag or an unsupported kind value");
 
 static
 const char* g_DeprecatedEffectObjectNames[] =
@@ -3901,7 +3903,7 @@ public:
     case AR_OBJECT_APPEND_STRUCTURED_BUFFER:
     case AR_OBJECT_CONSUME_STRUCTURED_BUFFER:
     case AR_OBJECT_WAVE:
-    case AR_OBJECT_ACCELARATION_STRUCT:
+    case AR_OBJECT_ACCELERATION_STRUCT:
     case AR_OBJECT_RAY_DESC:
     case AR_OBJECT_TRIANGLE_INTERSECTION_ATTRIBUTES:
     {
@@ -4067,6 +4069,12 @@ public:
   {
     DXASSERT_NOMSG(ULE != nullptr);
 
+    // Intrinsics live in the global namespace, so references to their names
+    // should be either unqualified or '::'-prefixed.
+    if (ULE->getQualifier() && ULE->getQualifier()->getKind() != NestedNameSpecifier::Global) {
+      return false;
+    }
+
     const DeclarationNameInfo declName = ULE->getNameInfo();
     IdentifierInfo* idInfo = declName.getName().getAsIdentifierInfo();
     if (idInfo == nullptr)
@@ -5582,7 +5590,10 @@ bool HLSLExternalSource::MatchArguments(
           return false;
         }
         pEltType = GetTypeElementKind(objectElement);
-        DXASSERT_VALIDBASICKIND(pEltType);
+        if (!IsValidBasicKind(pEltType)) {
+          // This can happen with Texture2D<Struct> or other invalid declarations
+          return false;
+        }
       }
       else {
         pEltType = ComponentType[pArgument->uComponentTypeId];
@@ -8041,10 +8052,11 @@ bool HLSLExternalSource::CanConvert(
   // Cannot cast function type.
   if (source->isFunctionType())
     return false;
-  // Convert to an r-value to begin with.
-  bool needsLValueToRValue = sourceExpr->isLValue() &&
-    !target->isLValueReferenceType() && 
-    IsConversionToLessOrEqualElements(source, target, explicitConversion);
+
+  // Convert to an r-value to begin with, with an exception for strings
+  // since they are not first-class values and we want to preserve them as literals.
+  bool needsLValueToRValue = sourceExpr->isLValue() && !target->isLValueReferenceType()
+    && sourceExpr->getStmtClass() != Expr::StringLiteralClass;
 
   bool targetRef = target->isReferenceType();
 
@@ -8978,11 +8990,12 @@ Sema::TemplateDeductionResult HLSLExternalSource::DeduceTemplateArgumentsForHLSL
   }
 
   // Find the table of intrinsics based on the object type.
-  const HLSL_INTRINSIC* intrinsics;
-  size_t intrinsicCount;
-  const char* objectName;
+  const HLSL_INTRINSIC* intrinsics = nullptr;
+  size_t intrinsicCount = 0;
+  const char* objectName = nullptr;
   FindIntrinsicTable(FunctionTemplate->getDeclContext(), &objectName, &intrinsics, &intrinsicCount);
-  DXASSERT(intrinsics != nullptr,
+  DXASSERT(objectName != nullptr &&
+    (intrinsics != nullptr || m_intrinsicTables.size() > 0),
     "otherwise FindIntrinsicTable failed to lookup a valid object, "
     "or the parser let a user-defined template object through");
 
@@ -9021,16 +9034,13 @@ Sema::TemplateDeductionResult HLSLExternalSource::DeduceTemplateArgumentsForHLSL
           !IsBABLoad
               ? diag::err_hlsl_intrinsic_template_arg_unsupported
               : !Is2018 ? diag::err_hlsl_intrinsic_template_arg_requires_2018
-                        : diag::err_hlsl_intrinsic_template_arg_requires_2018;
+                        : diag::err_hlsl_intrinsic_template_arg_scalar_vector;
       if (IsBABLoad && Is2018 && ExplicitTemplateArgs->size() == 1) {
         Loc = (*ExplicitTemplateArgs)[0].getLocation();
         QualType explicitType = (*ExplicitTemplateArgs)[0].getArgument().getAsType();
         ArTypeObjectKind explicitKind = GetTypeObjectKind(explicitType);
         if (explicitKind == AR_TOBJ_BASIC || explicitKind == AR_TOBJ_VECTOR) {
-          isLegalTemplate = GET_BASIC_BITS(GetTypeElementKind(explicitType)) != BPROP_BITS64 ||
-            GetNumElements(explicitType) <= 2;
-        }
-        if (isLegalTemplate) {
+          isLegalTemplate = true;
           argTypes[0] = explicitType;
         }
       }
@@ -9050,15 +9060,6 @@ Sema::TemplateDeductionResult HLSLExternalSource::DeduceTemplateArgumentsForHLSL
         }
         argTypes[2] = getSema()->getASTContext().getIntTypeForBitwidth(
             32, /*signed*/ false);
-      } else {
-        // not supporting types > 16 bytes yet.
-        if (GET_BASIC_BITS(GetTypeElementKind(argTypes[2])) == BPROP_BITS64 &&
-            GetNumElements(argTypes[2]) > 2) {
-          getSema()->Diag(Args[1]->getLocStart(),
-                          diag::err_ovl_no_viable_member_function_in_call)
-              << intrinsicName;
-          return Sema::TemplateDeductionResult::TDK_Invalid;
-        }
       }
     }
     Specialization = AddHLSLIntrinsicMethod(cursor.GetTableName(), cursor.GetLoweringStrategy(), *cursor, FunctionTemplate, Args, argTypes, argCount);
@@ -9942,7 +9943,8 @@ FlattenedTypeIterator::FlattenedTypeIterator(SourceLocation loc, QualType type,
   m_source(source), m_draining(false), m_springLoaded(false), m_incompleteCount(0), m_typeDepth(0), m_loc(loc)
 {
   if (pushTrackerForType(type, nullptr)) {
-    considerLeaf();
+    while (!m_typeTrackers.empty() && !considerLeaf())
+      consumeLeaf();
   }
 }
 
@@ -10060,17 +10062,11 @@ bool FlattenedTypeIterator::considerLeaf()
   case FlattenedIterKind::FK_Fields:
     if (pushTrackerForType(tracker.CurrentField->getType(), nullptr)) {
       result = considerLeaf();
-    } else {
-      // Pop empty struct.
-      m_typeTrackers.pop_back();
     }
     break;
   case FlattenedIterKind::FK_Bases:
     if (pushTrackerForType(tracker.CurrentBase->getType(), nullptr)) {
       result = considerLeaf();
-    } else {
-      // Pop empty base.
-      m_typeTrackers.pop_back();
     }
     break;
   case FlattenedIterKind::FK_IncompleteArray:
@@ -11286,56 +11282,6 @@ void Sema::TransferUnusualAttributes(Declarator &D, NamedDecl *NewDecl) {
         D.UnusualAnnotations.size()));
     D.UnusualAnnotations.clear();
   }
-  // pragma pack_matrix.
-  // Do this for struct member also.
-  if (ValueDecl *VD = dyn_cast<ValueDecl>(NewDecl)) {
-    QualType Ty = VD->getType();
-    QualType EltTy = Ty;
-    while (EltTy->isArrayType()) {
-      EltTy = EltTy->getAsArrayTypeUnsafe()->getElementType();
-    }
-    if (hlsl::IsHLSLMatType(EltTy)) {
-      bool bRowMajor = false;
-      if (!hlsl::HasHLSLMatOrientation(EltTy, &bRowMajor)) {
-        if (PackMatrixColMajorPragmaOn || PackMatrixRowMajorPragmaOn) {
-          // Add major.
-          QualType NewEltTy = Context.getAttributedType(
-              PackMatrixRowMajorPragmaOn
-                  ? AttributedType::attr_hlsl_row_major
-                  : AttributedType::attr_hlsl_column_major,
-              EltTy, EltTy);
-
-          QualType NewTy = NewEltTy;
-          if (Ty->isArrayType()) {
-            // Build new array type.
-            SmallVector<const ArrayType *, 2> arrayTys;
-            while (EltTy->isArrayType()) {
-              const ArrayType *AT = EltTy->getAsArrayTypeUnsafe();
-              arrayTys.emplace_back(AT);
-            }
-            for (auto rit = arrayTys.rbegin(); rit != arrayTys.rend(); rit++) {
-              // Create array type with NewTy.
-              const ArrayType *AT = *rit;
-              if (const ConstantArrayType *CAT =
-                      dyn_cast<ConstantArrayType>(AT)) {
-                NewTy = Context.getConstantArrayType(
-                    NewTy, CAT->getSize(), CAT->getSizeModifier(),
-                    CAT->getIndexTypeCVRQualifiers());
-              } else if (const IncompleteArrayType *IAT =
-                             dyn_cast<IncompleteArrayType>(AT)) {
-                NewTy = Context.getIncompleteArrayType(NewTy, IAT->getSizeModifier(),
-                    IAT->getIndexTypeCVRQualifiers());
-              } else {
-                DXASSERT(false, "");
-              }
-            }
-          }
-          // Update Type.
-          VD->setType(NewTy);
-        }
-      }
-    }
-  }
 }
 
 /// Checks whether a usage attribute is compatible with those seen so far and
diff --git a/tools/clang/lib/Sema/SemaType.cpp b/tools/clang/lib/Sema/SemaType.cpp
index a99901b54..72119ce02 100644
--- a/tools/clang/lib/Sema/SemaType.cpp
+++ b/tools/clang/lib/Sema/SemaType.cpp
@@ -4319,6 +4319,23 @@ TypeSourceInfo *Sema::GetTypeForDeclarator(Declarator &D, Scope *S) {
   if (D.isPrototypeContext() && getLangOpts().ObjCAutoRefCount)
     inferARCWriteback(state, T);
 
+  // HLSL changes begin
+  // If a matrix type has no explicit pack orientation, but a file-level default
+  // orientation has been set by #pragma pack_matrix, apply it here.
+  // There is no default if rewriting (in the absence of #pragma pack_matrix), since
+  // it is agnostic to default orientation and we want to preserve the lack of annotation.
+  // For codegen, it'd be nice to annotate everything here, but it causes error
+  // messages to have pack orientation added to types, so we handle it through
+  // the codegen option's default packing orientation flag.
+  bool defaultRowMajor;
+  if (getLangOpts().HLSL && hlsl::IsHLSLMatType(T) && !hlsl::HasHLSLMatOrientation(T)
+    && D.getDeclSpec().TryGetDefaultMatrixPackRowMajor(defaultRowMajor)) {
+    AttributedType::Kind AttributeKind = defaultRowMajor
+      ? AttributedType::attr_hlsl_row_major : AttributedType::attr_hlsl_column_major;
+    T = Context.getAttributedType(AttributeKind, T, T);
+  }
+  // HLSL changes end
+
   return GetFullTypeForDeclarator(state, T, ReturnTypeInfo);
 }
 
@@ -4511,6 +4528,14 @@ static AttributeList::Kind getAttrListKind(AttributedType::Kind kind) {
 static void fillAttributedTypeLoc(AttributedTypeLoc TL,
                                   const AttributeList *attrs,
                                   const AttributeList *DeclAttrs = nullptr) {
+
+  // HLSL changes begin
+  // Don't fill the location info for matrix orientation attributes
+  if (TL.getAttrKind() == AttributedType::attr_hlsl_row_major ||
+      TL.getAttrKind() == AttributedType::attr_hlsl_column_major)
+    return;
+  // HLSL changes end
+
   // DeclAttrs and attrs cannot be both empty.
   assert((attrs || DeclAttrs) &&
          "no type attributes in the expected location!");
diff --git a/tools/clang/lib/Sema/gen_intrin_main_tables_15.h b/tools/clang/lib/Sema/gen_intrin_main_tables_15.h
index ad0136b20..3be77c6f7 100644
--- a/tools/clang/lib/Sema/gen_intrin_main_tables_15.h
+++ b/tools/clang/lib/Sema/gen_intrin_main_tables_15.h
@@ -1585,7 +1585,7 @@ static const HLSL_INTRINSIC g_Intrinsics[] =
     {(UINT)hlsl::IntrinsicOp::IOP_round, false, true, -1, 2, g_Intrinsics_Args170},
     {(UINT)hlsl::IntrinsicOp::IOP_rsqrt, false, true, -1, 2, g_Intrinsics_Args171},
     {(UINT)hlsl::IntrinsicOp::IOP_saturate, false, true, -1, 2, g_Intrinsics_Args172},
-    {(UINT)hlsl::IntrinsicOp::IOP_sign, false, true, -1, 2, g_Intrinsics_Args173},
+    {(UINT)hlsl::IntrinsicOp::IOP_sign, false, true, 0, 2, g_Intrinsics_Args173},
     {(UINT)hlsl::IntrinsicOp::IOP_sin, false, true, -1, 2, g_Intrinsics_Args174},
     {(UINT)hlsl::IntrinsicOp::IOP_sincos, false, false, -1, 4, g_Intrinsics_Args175},
     {(UINT)hlsl::IntrinsicOp::IOP_sinh, false, true, -1, 2, g_Intrinsics_Args176},
diff --git a/tools/clang/test/CodeGenHLSL/Include.hlsl b/tools/clang/test/CodeGenHLSL/Include.hlsl
index 38d049729..45b8d1def 100644
--- a/tools/clang/test/CodeGenHLSL/Include.hlsl
+++ b/tools/clang/test/CodeGenHLSL/Include.hlsl
@@ -1,4 +1,4 @@
-// RUN: %dxc -E main -T ps_6_0 -Vi -I inc %s | StdErrCheck %s
+// RUN: %dxc -E main -T ps_6_0 -Vi -I inc %s | FileCheck -input=stderr %s
 
 
 
diff --git a/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl b/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
index 231a55cb8..51432e370 100644
--- a/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
+++ b/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
@@ -5,7 +5,7 @@
 // CHECK: i32 5)
 // CHECK: extractvalue
 // CHECK: , 2
-// CHECK: icmp eq
+// CHECK: icmp ne
 // CHECK 0
 
 // For (x4 < 3)[1]
@@ -47,7 +47,7 @@
 // CHECK: fcmp fast oeq
 // CHECK: fcmp fast oeq
 // CHECK: fcmp fast oeq
-// CHECK: alloca [16 x i1]
+// CHECK: alloca [16 x i32]
 
 
 float4x4 xt;
diff --git a/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS.hlsl b/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS.hlsl
index 91a37553a..4d7ecc8ad 100644
--- a/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS.hlsl
+++ b/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS.hlsl
@@ -146,7 +146,7 @@ void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_Grou
 	for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
 	{
 		// Reset temporary particle intersection masks.  There are two words (64-bits) per thread.
-		[unroll]
+    // [unroll] // Disabled: the loop's start index (GI) is not known at compile time, so it cannot be unrolled.
 		for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
 			gs_IntersectionMasks[C] = 0;
 
@@ -239,4 +239,4 @@ void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_Grou
 			g_FastDrawPackets[NewPacketIndex] = Packet;
 		}
 	}
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS_fail_unroll.hlsl b/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS_fail_unroll.hlsl
new file mode 100644
index 000000000..3fb0ad00a
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS_fail_unroll.hlsl
@@ -0,0 +1,233 @@
+// RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
+
+// CHECK: Could not unroll loop.
+
+// Copied from the original ParticleTileCullingCS.hlsl
+// The loop on line 141 cannot be unrolled because
+// the starting index is not known at compile time.
+
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+// Developed by Minigraph
+//
+// Author(s):   James Stanard 
+//              Julia Careaga
+//
+
+#include "ParticleUtility.hlsli"
+
+StructuredBuffer<uint> g_BinParticles : register(t0);
+StructuredBuffer<uint> g_BinCounters : register(t1);
+Texture2D<uint> g_DepthBounds : register(t2);
+StructuredBuffer<ParticleScreenData> g_VisibleParticles : register(t3);
+
+RWStructuredBuffer<uint> g_SortedParticles : register(u0);
+RWByteAddressBuffer g_TileHitMasks : register(u1);
+RWStructuredBuffer<uint> g_DrawPackets : register(u2);
+RWStructuredBuffer<uint> g_FastDrawPackets : register(u3);
+RWByteAddressBuffer g_DrawPacketCount : register(u4);
+
+#if TILES_PER_BIN < 64
+#define GROUP_THREAD_COUNT 64
+#else
+#define GROUP_THREAD_COUNT TILES_PER_BIN
+#endif
+#define GROUP_SIZE_X TILES_PER_BIN_X
+#define GROUP_SIZE_Y (GROUP_THREAD_COUNT / GROUP_SIZE_X)
+#define MASK_WORDS_PER_ITER (GROUP_THREAD_COUNT / 32)
+
+groupshared uint gs_SortKeys[MAX_PARTICLES_PER_BIN];
+groupshared uint gs_IntersectionMasks[TILES_PER_BIN * MASK_WORDS_PER_ITER];
+groupshared uint gs_TileParticleCounts[TILES_PER_BIN];
+groupshared uint gs_SlowTileParticleCounts[TILES_PER_BIN];
+groupshared uint gs_MinMaxDepth[TILES_PER_BIN];
+
+void BitonicSort(uint GI, uint NumElements, uint NextPow2, uint NumThreads)
+{
+	for (uint k = 2; k <= NextPow2; k *= 2)
+	{
+		// Align NumElements to the next multiple of k
+		NumElements = (NumElements + k - 1) & ~(k - 1);
+
+		for (uint j = k / 2; j > 0; j /= 2)
+		{
+			// Loop over all N/2 unique element pairs
+			for (uint i = GI; i < NumElements / 2; i += NumThreads)
+			{
+				uint Index1 = InsertZeroBit(i, j);
+				uint Index2 = Index1 | j;
+
+				uint A = gs_SortKeys[Index1];
+				uint B = gs_SortKeys[Index2];
+
+				if ((A < B) != ((Index1 & k) == 0))
+				{
+					gs_SortKeys[Index1] = B;
+					gs_SortKeys[Index2] = A;
+				}
+			}
+
+			GroupMemoryBarrierWithGroupSync();
+		}
+	}
+}
+
+uint ComputeMaskOffset( uint2 Gid, uint2 GTid )
+{
+	// Sometimes we have more threads than tiles per bin.
+	uint2 OutTileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + uint2(GTid.x, GTid.y % TILES_PER_BIN_Y);
+	uint OutTileIdx = OutTileCoord.x + OutTileCoord.y * gTileRowPitch;
+	return OutTileIdx * MAX_PARTICLES_PER_BIN / 8 + GTid.y / TILES_PER_BIN_Y * 4;
+}
+
+[RootSignature(Particle_RootSig)]
+[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
+void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_GroupThreadID )
+{
+	// Each group is assigned a bin
+	uint BinIndex = Gid.y * gBinsPerRow + Gid.x;
+
+	uint ParticleCountInBin = g_BinCounters[BinIndex];
+	if (ParticleCountInBin == 0)	
+		return;
+
+	// Get the start location for particles in this bin
+	uint BinStart = BinIndex * MAX_PARTICLES_PER_BIN;
+
+	// Each thread is assigned a tile
+	uint2 TileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + GTid.xy;
+
+	if (GI < TILES_PER_BIN)
+	{
+		gs_TileParticleCounts[GI] = 0;
+		gs_SlowTileParticleCounts[GI] = 0;
+		gs_MinMaxDepth[GI] = g_DepthBounds[TileCoord] << 2;
+	}
+
+	// Sometimes the counter value exceeds the actual storage size
+	ParticleCountInBin = min(MAX_PARTICLES_PER_BIN, ParticleCountInBin);
+
+	// Compute the next power of two for the bitonic sort
+	uint NextPow2 = countbits(ParticleCountInBin) <= 1 ? ParticleCountInBin : (2 << firstbithigh(ParticleCountInBin));
+
+	// Fill in the sort key array.  Each sort key has passenger data (in the least significant
+	// bits), so that as the sort keys are moved around, they retain a pointer to the particle
+	// they refer to.
+	for (uint k = GI; k < NextPow2; k += GROUP_THREAD_COUNT)
+		gs_SortKeys[k] = k < ParticleCountInBin ? g_BinParticles[BinStart + k] : 0xffffffff;
+
+	GroupMemoryBarrierWithGroupSync();
+
+	// Sort the particles from front to back.
+	BitonicSort(GI, ParticleCountInBin, NextPow2, GROUP_THREAD_COUNT);
+
+	// Upper-left tile coord and lower-right coord, clamped to the screen
+	const int2 StartTile = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
+
+	// Each thread writes the hit mask for one tile
+	uint OutOffsetInBytes = ComputeMaskOffset(Gid.xy, GTid.xy);
+
+	// Loop over all sorted particles, group-size count at a time
+	for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
+	{
+		// Reset temporary particle intersection masks.  There are two words (64-bits) per thread.
+		[unroll]
+		for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
+			gs_IntersectionMasks[C] = 0;
+
+		GroupMemoryBarrierWithGroupSync();
+
+		// The array index of the particle this thread will test
+		uint SortIdx = Iter + GI;
+
+		// Compute word and bit to set (from thread index)
+		uint WordOffset = GI >> 5;
+		uint BitOffset = GI & 31;
+
+		// Only do the loads and stores if this is a valid index (see constant number of iterations comment above)
+		if (SortIdx < ParticleCountInBin)
+		{
+			uint SortKey = gs_SortKeys[SortIdx];
+			uint GlobalIdx = SortKey & 0x3FFFF;
+
+			// After this phase, all we care about is its global index
+			g_SortedParticles[BinStart + SortIdx] = SortKey;
+
+			uint Bounds = g_VisibleParticles[GlobalIdx].Bounds;
+			int2 MinTile = uint2(Bounds >>  0, Bounds >>  8) & 0xFF;
+			int2 MaxTile = uint2(Bounds >> 16, Bounds >> 24) & 0xFF;
+			MinTile = max(MinTile - StartTile, 0);
+			MaxTile = min(MaxTile - StartTile, int2(TILES_PER_BIN_X, TILES_PER_BIN_Y) - 1);
+
+			for (int y = MinTile.y; y <= MaxTile.y; y++)
+			{
+				for (int x = MinTile.x; x <= MaxTile.x; x++)
+				{
+					uint TileIndex = y * TILES_PER_BIN_X + x;
+					uint TileMaxZ = gs_MinMaxDepth[TileIndex];
+					uint Inside = SortKey < TileMaxZ ? 1 : 0;
+					uint SlowPath = SortKey > (TileMaxZ << 16) ? Inside : 0;
+					InterlockedAdd(gs_SlowTileParticleCounts[TileIndex], SlowPath);
+					InterlockedOr(gs_IntersectionMasks[TileIndex * MASK_WORDS_PER_ITER + WordOffset], Inside << BitOffset);
+				}
+			}
+		}
+
+		GroupMemoryBarrierWithGroupSync();
+
+#if TILES_PER_BIN < GROUP_THREAD_COUNT
+		// Copy the hit masks from LDS to the output buffer.  Here, each thread copies a single word
+		if (GI < TILES_PER_BIN * MASK_WORDS_PER_ITER)
+		{
+			uint TileIndex = GI % TILES_PER_BIN;
+			uint Offset = TileIndex * MASK_WORDS_PER_ITER + (GI / TILES_PER_BIN);
+			uint Mask = gs_IntersectionMasks[Offset];
+			InterlockedAdd(gs_TileParticleCounts[TileIndex], countbits(Mask));
+			g_TileHitMasks.Store(OutOffsetInBytes, Mask);
+			OutOffsetInBytes += 8;
+		}
+#else
+		// Copy the hit masks from LDS to the output buffer.  Here, each thread is assigned a tile.
+		uint Offset = GI * MASK_WORDS_PER_ITER;
+		[unroll]
+		for (uint O = 0; O < MASK_WORDS_PER_ITER; O += 2)
+		{
+			uint Mask0 = gs_IntersectionMasks[Offset+O];
+			uint Mask1 = gs_IntersectionMasks[Offset+O+1];
+			InterlockedAdd(gs_TileParticleCounts[GI], countbits(Mask0) + countbits(Mask1));
+			g_TileHitMasks.Store2( OutOffsetInBytes, uint2(Mask0, Mask1) );
+			OutOffsetInBytes += 8;
+		}
+#endif
+
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	if (GI >= TILES_PER_BIN)
+		return;
+
+	uint ParticleCountInThisThreadsTile = gs_TileParticleCounts[GI];
+	if (ParticleCountInThisThreadsTile > 0)
+	{
+		uint SlowParticlesInThisThreadsTile = gs_SlowTileParticleCounts[GI];
+		uint Packet = TileCoord.x << 16 | TileCoord.y << 24 | ParticleCountInThisThreadsTile;
+
+		uint NewPacketIndex;
+		if (SlowParticlesInThisThreadsTile > 0)
+		{
+			g_DrawPacketCount.InterlockedAdd(0, 1, NewPacketIndex);
+			g_DrawPackets[NewPacketIndex] = Packet;
+		}
+		else
+		{
+			g_DrawPacketCount.InterlockedAdd(12, 1, NewPacketIndex);
+			g_FastDrawPackets[NewPacketIndex] = Packet;
+		}
+	}
+}
diff --git a/tools/clang/test/CodeGenHLSL/SimpleHs10.hlsl b/tools/clang/test/CodeGenHLSL/SimpleHs10.hlsl
index cccc63305..1fab16363 100644
--- a/tools/clang/test/CodeGenHLSL/SimpleHs10.hlsl
+++ b/tools/clang/test/CodeGenHLSL/SimpleHs10.hlsl
@@ -1,4 +1,4 @@
-// RUN: %dxc -E main -T hs_6_0  %s 2>&1 | StdErrCheck %s
+// RUN: %dxc -E main -T hs_6_0  %s 2>&1 | FileCheck -input=stderr %s
 
 // Same as SimpleHS11.hlsl, except that we only verify StdErr for the warning
 // message.
diff --git a/tools/clang/test/CodeGenHLSL/attributes_Mod.hlsl b/tools/clang/test/CodeGenHLSL/attributes_Mod.hlsl
index d72bb432d..316a9ed61 100644
--- a/tools/clang/test/CodeGenHLSL/attributes_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/attributes_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T ps_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T ps_5_1 attributes.hlsl
+// fxc.exe /T ps_5_1 attributes.hlsl
 
 // The following is a directive to override default behavior for "VerifyHelper.py fxc RunAttributes".  When this is specified, main shader must be defined manually.
 // :FXC_VERIFY_ARGUMENTS: /T ps_5_1 /E main
diff --git a/tools/clang/test/HLSL/constprop/Acos.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Acos.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Acos.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Acos.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Asin.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Asin.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Asin.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Asin.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Atan.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Atan.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Atan.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Atan.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Bfrev.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Bfrev.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Bfrev.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Bfrev.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Cos.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Cos.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Cos.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Cos.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Countbits.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Countbits.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Countbits.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Countbits.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Dot2.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Dot2.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Dot2.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Dot2.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Dot3.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Dot3.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Dot3.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Dot3.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Dot4.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Dot4.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Dot4.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Dot4.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Exp.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Exp.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Exp.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Exp.hlsl
diff --git a/tools/clang/test/HLSL/constprop/FAbs.hlsl b/tools/clang/test/CodeGenHLSL/constprop/FAbs.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/FAbs.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/FAbs.hlsl
diff --git a/tools/clang/test/HLSL/constprop/FMad.hlsl b/tools/clang/test/CodeGenHLSL/constprop/FMad.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/FMad.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/FMad.hlsl
diff --git a/tools/clang/test/HLSL/constprop/FMax.hlsl b/tools/clang/test/CodeGenHLSL/constprop/FMax.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/FMax.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/FMax.hlsl
diff --git a/tools/clang/test/HLSL/constprop/FMin.hlsl b/tools/clang/test/CodeGenHLSL/constprop/FMin.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/FMin.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/FMin.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Firstbithi.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Firstbithi.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Firstbithi.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Firstbithi.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Firstbitlo.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Firstbitlo.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Firstbitlo.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Firstbitlo.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Fma.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Fma.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Fma.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Fma.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Frc.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Frc.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Frc.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Frc.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Hcos.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Hcos.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Hcos.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Hcos.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Hsin.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Hsin.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Hsin.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Hsin.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Htan.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Htan.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Htan.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Htan.hlsl
diff --git a/tools/clang/test/HLSL/constprop/IMad.hlsl b/tools/clang/test/CodeGenHLSL/constprop/IMad.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/IMad.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/IMad.hlsl
diff --git a/tools/clang/test/HLSL/constprop/IMax.hlsl b/tools/clang/test/CodeGenHLSL/constprop/IMax.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/IMax.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/IMax.hlsl
diff --git a/tools/clang/test/HLSL/constprop/IMin.hlsl b/tools/clang/test/CodeGenHLSL/constprop/IMin.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/IMin.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/IMin.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Log.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Log.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Log.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Log.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Round_ne.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Round_ne.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Round_ne.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Round_ne.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Round_ni.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Round_ni.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Round_ni.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Round_ni.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Round_pi.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Round_pi.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Round_pi.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Round_pi.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Round_z.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Round_z.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Round_z.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Round_z.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Rsqrt.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Rsqrt.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Rsqrt.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Rsqrt.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Saturate_double.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Saturate_double.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Saturate_double.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Saturate_double.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Saturate_float.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Saturate_float.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Saturate_float.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Saturate_float.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Saturate_half.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Saturate_half.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Saturate_half.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Saturate_half.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Sin.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Sin.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Sin.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Sin.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Sqrt.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Sqrt.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Sqrt.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Sqrt.hlsl
diff --git a/tools/clang/test/HLSL/constprop/Tan.hlsl b/tools/clang/test/CodeGenHLSL/constprop/Tan.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/Tan.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/Tan.hlsl
diff --git a/tools/clang/test/HLSL/constprop/UMad.hlsl b/tools/clang/test/CodeGenHLSL/constprop/UMad.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/UMad.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/UMad.hlsl
diff --git a/tools/clang/test/HLSL/constprop/UMax.hlsl b/tools/clang/test/CodeGenHLSL/constprop/UMax.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/UMax.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/UMax.hlsl
diff --git a/tools/clang/test/HLSL/constprop/UMin.hlsl b/tools/clang/test/CodeGenHLSL/constprop/UMin.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/constprop/UMin.hlsl
rename to tools/clang/test/CodeGenHLSL/constprop/UMin.hlsl
diff --git a/tools/clang/test/HLSL/constprop/bfi.ll b/tools/clang/test/CodeGenHLSL/constprop/bfi.ll
similarity index 100%
rename from tools/clang/test/HLSL/constprop/bfi.ll
rename to tools/clang/test/CodeGenHLSL/constprop/bfi.ll
diff --git a/tools/clang/test/HLSL/constprop/ibfe.ll b/tools/clang/test/CodeGenHLSL/constprop/ibfe.ll
similarity index 100%
rename from tools/clang/test/HLSL/constprop/ibfe.ll
rename to tools/clang/test/CodeGenHLSL/constprop/ibfe.ll
diff --git a/tools/clang/test/HLSL/constprop/ubfe.ll b/tools/clang/test/CodeGenHLSL/constprop/ubfe.ll
similarity index 100%
rename from tools/clang/test/HLSL/constprop/ubfe.ll
rename to tools/clang/test/CodeGenHLSL/constprop/ubfe.ll
diff --git a/tools/clang/test/CodeGenHLSL/declarations/bool_representation/buffer_load_store.hlsl b/tools/clang/test/CodeGenHLSL/declarations/bool_representation/buffer_load_store.hlsl
new file mode 100644
index 000000000..607e3ad44
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/bool_representation/buffer_load_store.hlsl
@@ -0,0 +1,100 @@
+// RUN: %dxc -E main -T vs_6_0 -O0 %s | FileCheck %s
+
+// Ensure that bools are converted from/to their memory representation when loaded/stored in buffers
+
+struct AllTheBools
+{
+    bool2x2 m;
+    bool2 v;
+    bool s;
+    bool2x2 ma[2];
+    bool2 va[2];
+    bool sa[2];
+};
+
+ConstantBuffer<AllTheBools> cb;
+StructuredBuffer<AllTheBools> sb;
+RWStructuredBuffer<AllTheBools> rwsb;
+
+int main(int i : IN) : OUT
+{
+    int result = 0;
+
+    // Constant buffer loads
+    // CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+    // CHECK: extractvalue %dx.types.CBufRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+    // CHECK: extractvalue %dx.types.CBufRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+    // CHECK: extractvalue %dx.types.CBufRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+    // CHECK: extractvalue %dx.types.CBufRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+    // CHECK: extractvalue %dx.types.CBufRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+    // CHECK: extractvalue %dx.types.CBufRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    if (cb.m._22 && cb.v.y && cb.s
+        && cb.ma[1]._22 && cb.va[1].y && cb.sa[1])
+    {
+        result++;
+    }
+    
+    // Structured buffer loads
+    // CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+    // CHECK: extractvalue %dx.types.ResRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+    // CHECK: extractvalue %dx.types.ResRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+    // CHECK: extractvalue %dx.types.ResRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+    // CHECK: extractvalue %dx.types.ResRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+    // CHECK: extractvalue %dx.types.ResRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    // CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+    // CHECK: extractvalue %dx.types.ResRet.i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    if (sb[0].m._22 && sb[0].v.y && sb[0].s
+        && sb[0].ma[1]._22 && sb[0].va[1].y && sb[0].sa[1])
+    {
+        result++;
+    }
+
+    // Structured buffer stores
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: call void @dx.op.bufferStore.i32
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: call void @dx.op.bufferStore.i32
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: call void @dx.op.bufferStore.i32
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: call void @dx.op.bufferStore.i32
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: call void @dx.op.bufferStore.i32
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: call void @dx.op.bufferStore.i32
+    rwsb[0].m._22 = i == 42;
+    rwsb[0].v.y = i == 42;
+    rwsb[0].s = i == 42;
+    rwsb[0].ma[1]._22 = i == 42;
+    rwsb[0].va[1].y = i == 42;
+    rwsb[0].sa[1] = i == 42;
+
+    return result;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/bool_representation/cbuffer_load_whole_matrix.hlsl b/tools/clang/test/CodeGenHLSL/declarations/bool_representation/cbuffer_load_whole_matrix.hlsl
new file mode 100644
index 000000000..2b83fd0cf
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/bool_representation/cbuffer_load_whole_matrix.hlsl
@@ -0,0 +1,10 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: icmp ne i32 {{.*}}, 0
+
+struct Struct { bool2x2 mat; };
+ConstantBuffer<Struct> cb;
+bool2x2 main() : B { return cb.mat; }
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/bool_representation/const_init_list_no_crash.hlsl b/tools/clang/test/CodeGenHLSL/declarations/bool_representation/const_init_list_no_crash.hlsl
new file mode 100644
index 000000000..443bdffac
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/bool_representation/const_init_list_no_crash.hlsl
@@ -0,0 +1,40 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Regression test for bools in constant initialization lists crashing
+// due to a mismatch between register and memory representations (GitHub #1880)
+
+// CHECK: ret void
+
+bool main() : OUT
+{
+    // There are special cases around structs, arrays and matrices,
+    // so it's important to test all combinations:
+    // scalars/vectors can be in memory representation or not
+    // matrices always store elements in register representation (until lowering to vector)
+    // arrays/structs always store elements in memory representation.
+
+    // Test target types
+    static const bool b = { false };
+    static const bool2 v = { false, true };
+    static const bool2x2 m = { false, true, false, true };
+    static const bool ab[] = { false };
+    static const bool2 av[] = { false, true };
+    static const bool2x2 am[] = { false, true, false, true };
+    static const struct { bool x; } sb = { false };
+    static const struct { bool2 x; } sv = { false, true };
+    static const struct { bool2x2 x; } sm = { false, true, false, true };
+
+    // Test source types
+    static const bool ab_b[] = { false, b };
+    static const bool ab_v[] = { bool2(false, true), v };
+    static const bool ab_m[] = { bool2x2(false, true, false, true), m };
+    static const bool ab_a[] = { ab, av, am };
+    static const bool ab_s[] = { sb, sv, sm };
+
+    // Reference everything to ensure they get codegen'd
+    // Bool matrix accesses crash due to GitHub #1881
+    return b && v.x /* && m._11 */
+        && ab[0] && av[0].x /* && am[0]._11 */
+        && sb.x && sv.x /* && sm.x._11 */
+        && ab_b[0] && ab_v[0] && ab_m[0] && ab_a[0] && ab_s[0];
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/bool_representation/local_load_store.hlsl b/tools/clang/test/CodeGenHLSL/declarations/bool_representation/local_load_store.hlsl
new file mode 100644
index 000000000..7ea5ec50d
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/bool_representation/local_load_store.hlsl
@@ -0,0 +1,54 @@
+// RUN: %dxc -E main -T vs_6_0 -O0 %s | FileCheck %s
+
+// Ensure that bools are converted from/to their memory representation when loaded/stored
+// in local variables.
+
+// Local variables should never be i1s
+// CHECK-NOT: alloca {{.*}}i1
+
+int main(int i : I) : OUT
+{
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: store i32
+    bool s = i == 42;
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: store i32
+    bool1 v = i == 42;
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: store i32
+    bool1x1 m = i == 42;
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: store i32
+    bool sa[1] = { i == 42 };
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: store i32
+    bool1 va[1] = { i == 42 };
+    // CHECK: icmp eq i32 {{.*}}, 42
+    // CHECK: zext i1 {{.*}} to i32
+    // CHECK: store i32
+    bool1x1 ma[1] = { i == 42 };
+
+    // CHECK: load i32
+    // CHECK: icmp ne i32 {{.*}}, 0
+    return (s
+        // CHECK: load i32
+        // CHECK: icmp ne i32 {{.*}}, 0
+        && v.x
+        // CHECK: load i32
+        // CHECK: icmp ne i32 {{.*}}, 0
+        && m._11
+        // CHECK: load i32
+        // CHECK: icmp ne i32 {{.*}}, 0
+        && sa[0]
+        // CHECK: load i32
+        // CHECK: icmp ne i32 {{.*}}, 0
+        && va[0].x
+        // CHECK: load i32
+        // CHECK: icmp ne i32 {{.*}}, 0
+        && ma[0]._11) ? 1 : 2;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/constant_buffers/layout.hlsl b/tools/clang/test/CodeGenHLSL/declarations/constant_buffers/layout.hlsl
new file mode 100644
index 000000000..e0c9ab5e3
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/constant_buffers/layout.hlsl
@@ -0,0 +1,55 @@
+// RUN: %dxc -T vs_6_0 -E main %s | FileCheck %s
+
+// Tests the printed layout of constant and texture buffers.
+// We don't care in what order they get printed
+
+// CHECK: int2 a; ; Offset: 0
+// CHECK: int b[2]; ; Offset: 16
+// CHECK: int2 c; ; Offset: 36
+// CHECK: int2 d; ; Offset: 48
+// CHECK: int e; ; Offset: 56
+// CHECK: Size: 60
+
+// CHECK: int2 a; ; Offset: 0
+// CHECK: int b[2]; ; Offset: 16
+// CHECK: int2 c; ; Offset: 36
+// CHECK: int2 d; ; Offset: 48
+// CHECK: int e; ; Offset: 56
+// CHECK: Size: 60
+
+// CHECK: int2 a; ; Offset: 0
+// CHECK: int b[2]; ; Offset: 16
+// CHECK: int2 c; ; Offset: 36
+// CHECK: int2 d; ; Offset: 48
+// CHECK: int e; ; Offset: 56
+// CHECK: Size: 60
+
+// CHECK: int2 a; ; Offset: 0
+// CHECK: int b[2]; ; Offset: 16
+// CHECK: int2 c; ; Offset: 36
+// CHECK: int2 d; ; Offset: 48
+// CHECK: int e; ; Offset: 56
+// CHECK: Size: 60
+
+struct Struct
+{
+    int2 a;
+    struct
+    {
+        int b[2]; // Each element is int4-aligned
+        int2 c; // Fits in b[1].yz
+        int2 d; // Doesn't fit in b[1].w-, so gets its own int4
+    } s;
+    int e; // Fits in d.z
+};
+
+cbuffer _cbl { Struct cbl; };
+ConstantBuffer<Struct> cb;
+
+tbuffer _tbl { Struct tbl; };
+TextureBuffer<Struct> tb;
+
+int main() : OUT
+{
+    return cbl.e + cb.e + tbl.e + tb.e;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/intrinsic_overloading.hlsl b/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/intrinsic_overloading.hlsl
new file mode 100644
index 000000000..058deca5c
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/intrinsic_overloading.hlsl
@@ -0,0 +1,17 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Test that intrinsics can be overloaded without
+// shadowing the original definition.
+
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 42)
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 1)
+
+struct Struct { int x; };
+int abs(Struct s) { return 42; }
+
+int2 main() : OUT
+{
+    Struct s = { -1 };
+    return int2(abs(s), // Should call struct overload
+        abs(-1)); // Should call intrinsic
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/intrinsic_shadowing.hlsl b/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/intrinsic_shadowing.hlsl
new file mode 100644
index 000000000..4db682bde
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/intrinsic_shadowing.hlsl
@@ -0,0 +1,8 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s | XFail GitHub #1887
+
+// Test that global functions can shadow intrinsics.
+
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 42)
+
+int abs(int x) { return 42; }
+int main() : OUT { return abs(-1); }
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/namespace_qualified_no_intrinsic_candidate.hlsl b/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/namespace_qualified_no_intrinsic_candidate.hlsl
new file mode 100644
index 000000000..7ae2948e4
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/functions/overloading/namespace_qualified_no_intrinsic_candidate.hlsl
@@ -0,0 +1,20 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Regression test for GitHub #1884, where intrinsics were considered
+// valid overload candidates for overloaded functions of the same name in a namespace
+
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 42)
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 13)
+
+namespace foo
+{
+    int abs(int2 x) { return 42; }
+    int abs() { return 13; }
+}
+
+int2 main() : OUT
+{
+    // This should not consider the abs(int) intrinsic from the global namespace,
+    // regardless of the fact that it is a better match for the arguments.
+    return int2(foo::abs(0), foo::abs());
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/globals/no_initialization.hlsl b/tools/clang/test/CodeGenHLSL/declarations/globals/no_initialization.hlsl
new file mode 100644
index 000000000..24069eaff
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/globals/no_initialization.hlsl
@@ -0,0 +1,50 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Test that no variable initializers are emitted, especially for cbuffer globals.
+
+// CHECK-NOT: {{.*}} = constant
+// CHECK: define void @main()
+
+int var;
+int var_init = 1;
+const int const_var;
+const int const_var_init = 1;
+extern int extern_var;
+extern int extern_var_init = 1;
+extern const int extern_const_var;
+extern const int extern_const_var_init = 1;
+
+// Those get optimized away
+static int static_var;
+static int static_var_init = 1;
+static const int static_const_var;
+static const int static_const_var_init = 1;
+
+struct s
+{
+  // Those get optimized away
+  static int struct_static_var;
+  // static int struct_static_var_init = 1; // error: struct/class members cannot have default values
+  static const int struct_static_const_var;
+  static const int struct_static_const_var_init = 1;
+};
+
+int s::struct_static_var = 1;
+const int s::struct_static_const_var = 1;
+
+int main() : OUT {
+  static int func_static_var;
+  static int func_static_var_init = 1;
+  static const int func_static_const_var;
+  static const int func_static_const_var_init = 1;
+  return var + var_init
+    + const_var + const_var_init
+    + extern_var + extern_var_init
+    + extern_const_var + extern_const_var_init
+    + static_var + static_var_init
+    + static_const_var + static_const_var_init
+    + s::struct_static_var + /*s::struct_static_var_init*/
+    + s::struct_static_const_var + s::struct_static_const_var_init
+    + func_static_var + func_static_var_init
+    + func_static_const_var + func_static_const_var_init;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/cbuffer_load.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/cbuffer_load.hlsl
new file mode 100644
index 000000000..d0aac61c0
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/cbuffer_load.hlsl
@@ -0,0 +1,18 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test reading elements from constant buffer matrices
+// with both orientations.
+
+cbuffer cb
+{
+    row_major int4x4 r;
+    column_major int4x4 c;
+};
+int main() : OUT
+{
+    // CHECK: %[[r:.+]] = call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32(i32 59, %dx.types.Handle {{.*}}, i32 1)
+    // CHECK: extractvalue %dx.types.CBufRet.i32 %[[r]], 2
+    // CHECK: %[[c:.+]] = call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32(i32 59, %dx.types.Handle {{.*}}, i32 6)
+    // CHECK: extractvalue %dx.types.CBufRet.i32 %[[c]], 1
+    return r._23 + c._23;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/input_column_major.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/input_column_major.hlsl
new file mode 100644
index 000000000..53195e97d
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/input_column_major.hlsl
@@ -0,0 +1,6 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test reading an element of a column_major input matrix.
+
+// CHECK: call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 2, i8 1, i32 undef)
+int main(column_major int4x4 c : C) : OUT { return c._23; }
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/input_row_major.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/input_row_major.hlsl
new file mode 100644
index 000000000..2a21f26fb
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/input_row_major.hlsl
@@ -0,0 +1,6 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test reading an element of a row_major input matrix.
+
+// CHECK: call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 1, i8 2, i32 undef)
+int main(row_major int4x4 r : R) : OUT { return r._23; }
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/output_param.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/output_param.hlsl
new file mode 100644
index 000000000..af07f4cb6
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/output_param.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test that outputting a matrix value through an out param
+// correctly takes the parameter orientation into account.
+
+typedef row_major int2x2 rmi2x2;
+typedef column_major int2x2 cmi2x2;
+void main(out rmi2x2 mat : OUT)
+{
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 11)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 12)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 0, i32 21)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 1, i32 22)
+    mat = cmi2x2(11, 12, 21, 22);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/output_return.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/output_return.hlsl
new file mode 100644
index 000000000..401098f27
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/output_return.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test that outputting a matrix value through a return statement
+// correctly takes the return parameter orientation into account.
+
+typedef row_major int2x2 rmi2x2;
+typedef column_major int2x2 cmi2x2;
+rmi2x2 main() : OUT
+{
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 11)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 12)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 0, i32 21)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 1, i32 22)
+    return cmi2x2(11, 12, 21, 22);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/matrix_orientation_preserved_with_typedef.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/preserved_with_typedef.hlsl
similarity index 100%
rename from tools/clang/test/CodeGenHLSL/quick-test/matrix_orientation_preserved_with_typedef.hlsl
rename to tools/clang/test/CodeGenHLSL/declarations/matrix_pack/preserved_with_typedef.hlsl
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/static_const_init_list.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/static_const_init_list.hlsl
new file mode 100644
index 000000000..edfbe6642
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/static_const_init_list.hlsl
@@ -0,0 +1,37 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test that matrix packing order does not cause undesirable transposes
+// with constant initialization lists. Constant initializers should
+// be emitted in the target memory packing order and "fixed up"
+// when the static variable gets loaded.
+
+AppendStructuredBuffer<int4> buf;
+
+void main()
+{
+    // Test building matrix constants
+    // Matrices need to be hidden in structures because
+    // otherwise we do not consider them for constant initialization.
+    static const struct { row_major int2x2 mat; } r = { 11, 12, 21, 22 };
+    static const struct { column_major int2x2 mat; } c = { 11, 12, 21, 22 };
+    
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    buf.Append((int4)r.mat);
+    buf.Append((int4)c.mat);
+
+    // Convert between packing orders (i.e. test flattening matrix constants).
+    // Use two fields per variable so that constant init list logic is used.
+    // If there is a single initializer, it becomes a mere cast. 
+    static const struct { row_major int2x2 mat1, mat2; } r2 = { r, c };
+    static const struct { column_major int2x2 mat1, mat2; } c2 = { r, c };
+
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    buf.Append((int4)r2.mat1);
+    buf.Append((int4)r2.mat2);
+    buf.Append((int4)c2.mat1);
+    buf.Append((int4)c2.mat2);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_load_struct.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_load_struct.hlsl
new file mode 100644
index 000000000..564d65afb
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_load_struct.hlsl
@@ -0,0 +1,21 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test that reading matrices from structured buffers
+// respects the declared pack orientation.
+
+struct S
+{
+    row_major int4x4 rm;
+    column_major int4x4 cm; // Offset: 64 bytes
+};
+StructuredBuffer<S> b;
+
+int main() : OUT
+{
+    S s = b[0];
+    // CHECK: %[[row:.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 0, i32 16)
+    // CHECK: extractvalue %dx.types.ResRet.i32 %[[row]], 2
+    // CHECK: %[[col:.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 0, i32 96)
+    // CHECK: extractvalue %dx.types.ResRet.i32 %[[col]], 1
+    return s.rm._23 + s.cm._23;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_store.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_store.hlsl
new file mode 100644
index 000000000..b6e90c4fa
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_store.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s | XFail GitHub #1788
+
+// Test writing matrices to structured buffers
+// with every combination of source/dest orientations.
+
+typedef row_major int2x2 rmi2x2;
+typedef column_major int2x2 cmi2x2;
+
+RWStructuredBuffer<rmi2x2> r;
+RWStructuredBuffer<cmi2x2> c;
+
+void main()
+{
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    r[0] = rmi2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    r[1] = cmi2x2(11, 12, 21, 22);
+    
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    c[0] = rmi2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    c[1] = cmi2x2(11, 12, 21, 22);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_store_struct.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_store_struct.hlsl
new file mode 100644
index 000000000..4c6145832
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/structbuf_store_struct.hlsl
@@ -0,0 +1,24 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test writing matrix fields of structs in structured buffers
+// with every combination of source/dest orientations.
+
+typedef row_major int2x2 rmi2x2;
+typedef column_major int2x2 cmi2x2;
+struct R { rmi2x2 mat; };
+struct C { cmi2x2 mat; };
+RWStructuredBuffer<R> r;
+RWStructuredBuffer<C> c;
+
+void main()
+{
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    r[0].mat = rmi2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    r[1].mat = cmi2x2(11, 12, 21, 22);
+    
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    c[0].mat = rmi2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    c[1].mat = cmi2x2(11, 12, 21, 22);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/transpose_in_function.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/transpose_in_function.hlsl
new file mode 100644
index 000000000..748eef3b7
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/transpose_in_function.hlsl
@@ -0,0 +1,16 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Regression test for a bug where the transpose isn't performed,
+// or is performed twice, when wrapped in its own function.
+
+typedef row_major int2x2 rmi2x2;
+rmi2x2 DoTranspose(rmi2x2 mat) { return transpose(mat); }
+int4 main() : OUT
+{
+  rmi2x2 mat = DoTranspose(rmi2x2(11, 12, 21, 22)); // Transposed once: rows (11, 21) and (12, 22)
+  // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 11)
+  // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 21)
+  // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 2, i32 12)
+  // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 3, i32 22)
+  return int4(mat._11, mat._12, mat._21, mat._22);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/truncation_column_major.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/truncation_column_major.hlsl
new file mode 100644
index 000000000..49c8fc52e
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/truncation_column_major.hlsl
@@ -0,0 +1,12 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test that implicitly truncating a matrix keeps
+// the same matrix elements no matter the orientation of the matrices.
+
+void main(out column_major int1x2 result : OUT)
+{
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 11)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 0, i32 12)
+    column_major int2x2 value = int2x2(11, 12, 21, 22);
+    result = value;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/truncation_row_major.hlsl b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/truncation_row_major.hlsl
new file mode 100644
index 000000000..3df0e6d6f
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/matrix_pack/truncation_row_major.hlsl
@@ -0,0 +1,12 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test that implicitly truncating a matrix keeps
+// the same matrix elements no matter the orientation of the matrices.
+
+void main(out row_major int1x2 result : OUT)
+{
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 11)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 12)
+    row_major int2x2 value = int2x2(11, 12, 21, 22);
+    result = value;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/structs/anonymous.hlsl b/tools/clang/test/CodeGenHLSL/declarations/structs/anonymous.hlsl
new file mode 100644
index 000000000..6c48cc378
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/structs/anonymous.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Tests declarations and uses of anonymous structs.
+
+// CHECK: call i32 @dx.op.loadInput.i32
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: add nsw i32
+// CHECK: call void @dx.op.storeOutput.i32
+
+typedef struct { int x; } typedefed;
+
+struct { int x; } global;
+struct Outer
+{
+    struct { int x; } field;
+};
+
+int main(Outer input : IN) : OUT
+{
+    struct { int x; } local = input.field;
+    typedefed retval = local;
+    return retval.x + global.x;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bitfields_error.hlsl b/tools/clang/test/CodeGenHLSL/declarations/structs/bitfields_error.hlsl
similarity index 50%
rename from tools/clang/test/CodeGenHLSL/quick-test/bitfields_error.hlsl
rename to tools/clang/test/CodeGenHLSL/declarations/structs/bitfields_error.hlsl
index afccc92c9..8000f6dc4 100644
--- a/tools/clang/test/CodeGenHLSL/quick-test/bitfields_error.hlsl
+++ b/tools/clang/test/CodeGenHLSL/declarations/structs/bitfields_error.hlsl
@@ -1,6 +1,6 @@
-// RUN: %dxc /T ps_6_0 /E main %s | FileCheck %s
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
 
 // CHECK: error: bitfields are not supported in HLSL
 
 struct Struct { uint field : 1; };
-float main() : SV_Target { return 0; }
\ No newline at end of file
+void main() {}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/builtin_types_no_inheritance.hlsl b/tools/clang/test/CodeGenHLSL/declarations/structs/builtin_types_no_inheritance.hlsl
similarity index 87%
rename from tools/clang/test/CodeGenHLSL/quick-test/builtin_types_no_inheritance.hlsl
rename to tools/clang/test/CodeGenHLSL/declarations/structs/builtin_types_no_inheritance.hlsl
index 61dbb31e2..80cf09250 100644
--- a/tools/clang/test/CodeGenHLSL/quick-test/builtin_types_no_inheritance.hlsl
+++ b/tools/clang/test/CodeGenHLSL/declarations/structs/builtin_types_no_inheritance.hlsl
@@ -1,9 +1,10 @@
-// RUN: %dxc -T ps_6_0 -E main %s | FileCheck %s
+// RUN: %dxc -T vs_6_0 -E main %s | FileCheck %s
 
 // CHECK: error: base 'vector' is marked 'final'
 // CHECK: error: base 'matrix' is marked 'final'
 // CHECK: error: base 'Texture3D' is marked 'final'
 // CHECK: error: base 'ByteAddressBuffer' is marked 'final'
+// CHECK: error: base 'StructuredBuffer' is marked 'final'
 // CHECK: error: base 'SamplerState' is marked 'final'
 // CHECK: error: base 'TriangleStream' is marked 'final'
 // CHECK: error: base 'InputPatch' is marked 'final'
@@ -17,6 +18,7 @@ struct F2 : float2 {};
 struct F4x4 : float4x4 {};
 struct Tex3D : Texture3D<float> {};
 struct BABuf : ByteAddressBuffer {};
+struct StructBuf : StructuredBuffer<int> {};
 struct Samp : SamplerState {};
 
 struct Vertex { float3 pos : POSITION; };
@@ -29,4 +31,4 @@ struct BITIA : BuiltInTriangleIntersectionAttributes {};
 struct RTAS : RaytracingAccelerationStructure {};
 struct GRS : GlobalRootSignature {};
 
-float main() : SV_Target { return 0; }
\ No newline at end of file
+void main() {}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/structs/declaration_in_return_type.hlsl b/tools/clang/test/CodeGenHLSL/declarations/structs/declaration_in_return_type.hlsl
new file mode 100644
index 000000000..f20d4da82
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/structs/declaration_in_return_type.hlsl
@@ -0,0 +1,13 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Tests that a struct cannot be defined in the result type of a function.
+// Note that FXC allows this
+
+// CHECK: error: {{.*}} cannot be defined in the result type of a function
+
+struct Struct { int x; };
+struct { int x; } main() : OUT
+{
+    Struct result;
+    return result;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/structs/empty.hlsl b/tools/clang/test/CodeGenHLSL/declarations/structs/empty.hlsl
new file mode 100644
index 000000000..2b6e4d91b
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/structs/empty.hlsl
@@ -0,0 +1,25 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Test that (nested) empty structs compile away
+
+// CHECK: define void @main()
+// CHECK-NOT: %{{.*}} =
+// CHECK: ret void
+
+struct EmptyStruct {};
+struct OuterStruct { EmptyStruct empty; };
+
+OuterStruct global;
+static OuterStruct staticGlobal;
+cbuffer SomeCBuffer { OuterStruct cbufferField; };
+ConstantBuffer<OuterStruct> cb;
+StructuredBuffer<OuterStruct> sb;
+
+OuterStruct main(OuterStruct input,
+    out OuterStruct output)
+{
+    OuterStruct local = input;
+    staticGlobal = global;
+    output = cbufferField;
+    return local;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/declarations/structured_buffers/layout.hlsl b/tools/clang/test/CodeGenHLSL/declarations/structured_buffers/layout.hlsl
new file mode 100644
index 000000000..314ff4be1
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/declarations/structured_buffers/layout.hlsl
@@ -0,0 +1,37 @@
+// RUN: %dxc -T vs_6_0 -E main %s | FileCheck %s
+
+// Tests the printed layout of structured buffers.
+
+// CHECK: int2 a; ; Offset: 0
+// CHECK: int b[2]; ; Offset: 8
+// CHECK: int2 c; ; Offset: 16
+// CHECK: int2 d; ; Offset: 24
+// CHECK: int e; ; Offset: 32
+// CHECK: Size: 36
+
+// CHECK: int2 a; ; Offset: 0
+// CHECK: int b[2]; ; Offset: 8
+// CHECK: int2 c; ; Offset: 16
+// CHECK: int2 d; ; Offset: 24
+// CHECK: int e; ; Offset: 32
+// CHECK: Size: 36
+
+struct Struct
+{
+    int2 a;
+    struct
+    {
+        int b[2];
+        int2 c;
+        int2 d;
+    } s;
+    int e;
+};
+
+StructuredBuffer<Struct> sb;
+RWStructuredBuffer<Struct> rwsb;
+
+int main() : OUT
+{
+    return sb[0].e + rwsb[0].e;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/evalMat.hlsl b/tools/clang/test/CodeGenHLSL/evalMat.hlsl
index 8983c68d3..7fe2df74f 100644
--- a/tools/clang/test/CodeGenHLSL/evalMat.hlsl
+++ b/tools/clang/test/CodeGenHLSL/evalMat.hlsl
@@ -7,4 +7,4 @@ float4 main(float4x4 a : A) : SV_Target
   float4 r = EvaluateAttributeCentroid(a)[0];
 
   return r;
-}
\ No newline at end of file
+}
diff --git a/tools/clang/test/HLSL/expand_trig/acos.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/acos.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/acos.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/acos.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/acos_h.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/acos_h.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/acos_h.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/acos_h.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/asin.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/asin.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/asin.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/asin.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/asin_h.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/asin_h.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/asin_h.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/asin_h.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/atan.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/atan.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/atan.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/atan.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/atan_h.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/atan_h.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/atan_h.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/atan_h.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/hcos.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/hcos.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/hcos.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/hcos.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/hcos_h.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/hcos_h.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/hcos_h.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/hcos_h.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/hsin.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/hsin.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/hsin.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/hsin.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/hsin_h.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/hsin_h.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/hsin_h.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/hsin_h.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/htan.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/htan.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/htan.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/htan.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/htan_h.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/htan_h.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/htan_h.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/htan_h.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/keep_precise.0.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/keep_precise.0.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/keep_precise.0.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/keep_precise.0.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/keep_precise.1.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/keep_precise.1.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/keep_precise.1.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/keep_precise.1.hlsl
diff --git a/tools/clang/test/HLSL/expand_trig/tan.hlsl b/tools/clang/test/CodeGenHLSL/expand_trig/tan.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/expand_trig/tan.hlsl
rename to tools/clang/test/CodeGenHLSL/expand_trig/tan.hlsl
diff --git a/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/between_type_shapes.hlsl b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/between_type_shapes.hlsl
new file mode 100644
index 000000000..ea496ae2b
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/between_type_shapes.hlsl
@@ -0,0 +1,446 @@
+// RUN: %dxc -E main -T vs_6_0 -no-warnings %s | FileCheck -check-prefix=DXC %s
+
+// Tests all implicit conversions and explicit casts between type shapes
+// (scalars, vectors, matrices, arrays and structs).
+// A matching syntactic test confirms which conversions/casts are valid and which are not.
+// The codegen for all valid conversion/casts is exercised here.
+
+// We test using scalars, vectors, matrices, arrays and structs in sizes 1, 2 and 4.
+// Size 1 are for direct conversions to/from scalars
+// Size 2 is for most other conversions (avoiding potential special cases for single elements)
+// Size 4 is for conversions to/from matrices (avoiding potential special cases for single rows/columns)
+// As an exception, we use int3x3 as the source when truncating to int2x2 (avoiding potential special cases for single rows/columns)
+
+typedef int A1[1];
+typedef int A2[2];
+typedef int A4[4];
+typedef int A5[5];
+struct S1 { int a; };
+struct S2 { int a, b; };
+struct S4 { int a, b, c, d; };
+struct S5 { int a, b, c, d, e; };
+
+AppendStructuredBuffer<int4> buffer;
+// Avoid overloading since it plays into conversions
+// _i means scalar int, to avoid confusion with _s for structs
+void output_i(int i) { buffer.Append(int4(i, 0, 0, 0)); }
+void output_v1(int1 v) { buffer.Append(int4(v.x, 0, 0, 0)); }
+void output_v2(int2 v) { buffer.Append(int4(v.x, v.y, 0, 0)); }
+void output_v4(int4 v) { buffer.Append(v); }
+void output_m1x1(int1x1 m) { buffer.Append(int4(m._11, 0, 0, 0)); }
+void output_m1x2(int1x2 m) { buffer.Append(int4(m._11, m._12, 0, 0)); }
+void output_m2x1(int2x1 m) { buffer.Append(int4(m._11, m._21, 0, 0)); }
+void output_m2x2(int2x2 m) { buffer.Append(int4(m._11, m._12, m._21, m._22)); }
+void output_m3x3(int3x3 m)
+{
+    buffer.Append(int4(m._11, m._12, m._13, 0));
+    buffer.Append(int4(m._21, m._22, m._23, 0));
+    buffer.Append(int4(m._31, m._32, m._33, 0));
+}
+void output_a1(A1 a) { buffer.Append(int4(a[0], 0, 0, 0)); }
+void output_a2(A2 a) { buffer.Append(int4(a[0], a[1], 0, 0)); }
+void output_a4(A4 a) { buffer.Append(int4(a[0], a[1], a[2], a[3])); }
+void output_s1(S1 s) { buffer.Append(int4(s.a, 0, 0, 0)); }
+void output_s2(S2 s) { buffer.Append(int4(s.a, s.b, 0, 0)); }
+void output_s4(S4 s) { buffer.Append(int4(s.a, s.b, s.c, s.d)); }
+
+// This is only to make it easier to match the output to the code.
+void output_separator() { buffer.Append(int4(8888, 8888, 8888, 8888)); }
+
+void main()
+{
+    int i = 1;
+    int1 v1 = int1(1);
+    int2 v2 = int2(1, 2);
+    int4 v4 = int4(1, 2, 3, 4);
+    int1x1 m1x1 = int1x1(11);
+    int1x2 m1x2 = int1x2(11, 12);
+    int2x1 m2x1 = int2x1(11, 21);
+    int2x2 m2x2 = int2x2(11, 12, 21, 22);
+    int1x3 m1x3 = int1x3(11, 12, 13);
+    int2x3 m2x3 = int2x3(11, 12, 13, 21, 22, 23);
+    int3x1 m3x1 = int3x1(11, 21, 31);
+    int3x2 m3x2 = int3x2(11, 12, 21, 22, 31, 32);
+    int3x3 m3x3 = int3x3(11, 12, 13, 21, 22, 23, 31, 32, 33);
+    A1 a1 = { 1 };
+    A2 a2 = { 1, 2 };
+    A4 a4 = { 1, 2, 3, 4 };
+    A5 a5 = { 1, 2, 3, 4, 5 };
+    S1 s1 = { 1 };
+    S2 s2 = { 1, 2 };
+    S4 s4 = { 1, 2, 3, 4 };
+    S5 s5 = { 1, 2, 3, 4, 5 };
+
+    // =========== Scalar/single-element ===========
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_i(v1);
+    // DXC: i32 11, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(11,0,0,0)
+    output_i(m1x1);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_i((int)a1);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_i((int)s1);
+
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_v1(i);
+    // DXC: i32 11, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(11,0,0,0)
+    output_v1(m1x1);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_v1((int1)a1);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_v1((int1)s1);
+
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_m1x1(i);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_m1x1(v1);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,0,0,0)
+    // output_m1x1((int1x1)a1);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,0,0,0)
+    // output_m1x1((int1x1)s1);
+
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_a1((A1)i);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_a1((A1)v1);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,0,0,0)
+    // output_a1((A1)m1x1);
+    // DXC rejects (GitHub #1862)
+    // FXC: l(1,0,0,0)
+    // output_a1(s1); 
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,0,0,0)
+    // output_a1((A1)s1);
+
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_s1((S1)i);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_s1((S1)v1);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,0,0,0)
+    // output_s1((S1)m1x1);
+    // DXC rejects (GitHub #1862)
+    // FXC: l(1,0,0,0)
+    // output_s1(a1);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_s1((S1)a1);
+
+    // DXC: 8888
+    output_separator();
+
+    // =========== Truncation to scalar/single-element ===========
+    // Single element sources already tested
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_i(v2); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(11,0,0,0)
+    output_i(m2x2); // warning: implicit truncation of vector type
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_i((int)a2);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_i((int)s2);
+
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_v1(v2); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(11,0,0,0)
+    output_v1(m2x2); // warning: implicit truncation of vector type
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_v1((int1)a2);
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_v1((int1)s2);
+
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_m1x1(v2); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(11,0,0,0)
+    output_m1x1(m2x2); // warning: implicit truncation of vector type
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,0,0,0)
+    // output_m1x1((int1x1)a2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,0,0,0)
+    // output_m1x1((int1x1)s2);
+
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_a1((A1)a2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,0,0,0)
+    // output_a1((A1)s2);
+
+    // DXC: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // FXC: l(1,0,0,0)
+    output_s1((S1)a2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,0,0,0)
+    // output_s1((S1)s2);
+    
+    // DXC: 8888
+    output_separator();
+
+    // =========== Splatting ===========
+    // Single element dests already tested
+    // DXC: i32 1, i32 1, i32 0, i32 0, i8 15)
+    // FXC: l(1,1,0,0)
+    output_v2(i);
+    // DXC: i32 1, i32 1, i32 0, i32 0, i8 15)
+    // FXC: l(1,1,0,0)
+    output_v2(v1);
+    // DXC: i32 11, i32 11, i32 0, i32 0, i8 15)
+    // FXC: l(11,11,0,0)
+    output_v2(m1x1);
+
+    // DXC: i32 1, i32 1, i32 1, i32 1, i8 15)
+    // FXC: l(1,1,1,1)
+    output_m2x2(i);
+    // DXC: i32 1, i32 1, i32 1, i32 1, i8 15)
+    // FXC: l(1,1,1,1)
+    output_m2x2(v1);
+    // DXC: i32 11, i32 11, i32 11, i32 11, i8 15)
+    // FXC: l(11,11,11,11)
+    output_m2x2(m1x1);
+
+    // DXC: i32 1, i32 1, i32 0, i32 0, i8 15)
+    // FXC: l(1,1,0,0)
+    output_a2((A2)i);
+    // DXC rejects (GitHub #1863)
+    // FXC: l(1,1,0,0)
+    // output_a2((A2)v1);
+    // DXC rejects (GitHub #1863)
+    // FXC: l(11,11,0,0)
+    // output_a2((A2)m1x1);
+
+    // DXC: i32 1, i32 1, i32 0, i32 0, i8 15)
+    // FXC: l(1,1,0,0)
+    output_s2((S2)i);
+    // DXC rejects (GitHub #1863)
+    // FXC: l(1,1,0,0)
+    // output_s2((S2)v1);
+    // DXC rejects (GitHub #1863)
+    // FXC: l(11,11,0,0)
+    // output_s2((S2)m1x1);
+    
+    // DXC: 8888
+    output_separator();
+
+    // =========== Element-preserving ===========
+    // Single element sources/dests already tested
+    // DXC: i32 11, i32 12, i32 0, i32 0, i8 15)
+    // FXC: l(11,12,0,0)
+    output_v2(m1x2);
+    // DXC: i32 11, i32 21, i32 0, i32 0, i8 15)
+    // FXC: l(11,21,0,0)
+    output_v2(m2x1);
+    // DXC: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // FXC: l(11,12,21,22)
+    output_v4(m2x2);
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_v2((int2)a2);
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_v2((int2)s2);
+
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_m1x2(v2);
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_m2x1(v2);
+    // DXC: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // FXC: l(1,2,3,4)
+    output_m2x2(v4);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_m1x2((int1x2)a2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_m2x1((int2x1)a2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,3,4)
+    // output_m2x2((int2x2)a4);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_m1x2((int1x2)s2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_m2x1((int2x1)s2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,3,4)
+    // output_m2x2((int2x2)s4);
+
+    // DXC fails with validation errors (GitHub #1861)
+    // FXC: l(1,2,0,0)
+    // output_a2((A2)v2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,12,0,0)
+    // output_a2((A2)m1x2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,21,0,0)
+    // output_a2((A2)m2x1);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,12,21,22)
+    // output_a4((A4)m2x2);
+    // DXC rejects (GitHub #1862)
+    // FXC: l(1,2,0,0)
+    // output_a2(s2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_a2((A2)s2);
+
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_s2((S2)v2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,12,0,0)
+    // output_s2((S2)m1x2);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,21,0,0)
+    // output_s2((S2)m2x1);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,12,21,22)
+    // output_s4((S4)m2x2);
+    // DXC rejects (GitHub #1862)
+    // FXC: l(1,2,0,0)
+    // output_s2(a2);
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_s2((S2)a2);
+    
+    // DXC: 8888
+    output_separator();
+
+    // =========== Truncating ===========
+    // Single element dests already tested
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_v2(v4); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 12, i32 0, i32 0, i8 15)
+    // FXC: l(11,12,0,0)
+    output_v2(m1x3); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 21, i32 0, i32 0, i8 15)
+    // FXC: l(11,21,0,0)
+    output_v2(m3x1); // warning: implicit truncation of vector type
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_v2((int2)a4);
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_v2((int2)s4);
+
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_m1x2(v4); // warning: implicit truncation of vector type
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC fails with internal error: invalid sequence/cast expression
+    output_m2x1(v4); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 12, i32 0, i32 0, i8 15)
+    // FXC: l(11,12,0,0)
+    output_m1x2(m1x3); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 12, i32 0, i32 0, i8 15)
+    // FXC: l(11,12,0,0)
+    output_m1x2(m2x2); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 21, i32 0, i32 0, i8 15)
+    // FXC: l(11,21,0,0)
+    output_m2x1(m3x1); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 21, i32 0, i32 0, i8 15)
+    // FXC: l(11,21,0,0)
+    output_m2x1(m2x2); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // FXC: l(11,12,21,22)
+    output_m2x2(m2x3); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // FXC: l(11,12,21,22)
+    output_m2x2(m3x2); // warning: implicit truncation of vector type
+    // DXC: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // FXC: l(11,12,21,22)
+    output_m2x2(m3x3); // warning: implicit truncation of vector type
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_m1x2((int1x2)a4);
+    // DXC crashes (GitHub #1799)
+    // FXC fails with internal error: invalid sequence/cast expression
+    // output_m2x1((int2x1)a4);
+    // DXC crashes (GitHub #1799)
+    // FXC rejects with error X3017: cannot convert from 'typedef int[5]' to 'int2x2'
+    // output_m2x2((int2x2)a5);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_m1x2((int1x2)s4);
+    // DXC crashes (GitHub #1799)
+    // FXC fails with internal error: invalid sequence/cast expression
+    // output_m2x1((int2x1)s4);
+    // DXC crashes (GitHub #1799)
+    // FXC rejects with error X3017: cannot convert from 'struct S5' to 'int2x2'
+    // output_m2x2((int2x2)s5);
+
+    // DXC fails validation
+    // FXC: l(1,2,0,0)
+    // output_a2((A2)v4);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,12,0,0)
+    // output_a2((A2)m1x3);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,21,0,0)
+    // output_a2((A2)m3x1);
+    // DXC crashes (GitHub #1799)
+    // FXC rejects with error X3017: cannot convert from 'int2x2' to 'typedef int[2]'
+    // output_a2((A2)m2x2);
+    // DXC crashes (GitHub #1799)
+    // FXC rejects with error X3017: cannot convert from 'int3x3' to 'typedef int[2]'
+    // output_a2((A2)m3x3);
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_a2((A2)a4);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_a2((A2)s4);
+
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_s2((S2)v4);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,12,0,0)
+    // output_s2((S2)m1x3);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(11,21,0,0)
+    // output_s2((S2)m3x1);
+    // DXC crashes (GitHub #1799)
+    // FXC rejects with error X3017: cannot convert from 'int2x2' to 'struct S2'
+    // output_s2((S2)m2x2);
+    // DXC crashes (GitHub #1799)
+    // FXC rejects with error X3017: cannot convert from 'int3x3' to 'struct S2'
+    // output_s2((S2)m3x3);
+    // DXC: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // FXC: l(1,2,0,0)
+    output_s2((S2)a4);
+    // DXC crashes (GitHub #1799)
+    // FXC: l(1,2,0,0)
+    // output_s2((S2)s4);
+}
diff --git a/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_const_init_list.hlsl b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_const_init_list.hlsl
new file mode 100644
index 000000000..ae820e0c9
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_const_init_list.hlsl
@@ -0,0 +1,87 @@
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types %s | FileCheck %s
+
+// Tests conversions between numerical types that happen as part of
+// constant initialization lists, since they use a different code path.
+
+RWStructuredBuffer<bool4> buf_b;
+RWStructuredBuffer<int4> buf_i;
+RWStructuredBuffer<uint4> buf_u;
+RWStructuredBuffer<float4> buf_f;
+
+void main() {
+    // To bool
+    // CHECK: i32 1, i32 1, i32 1, i32 1, i8
+    // CHECK: i32 1, i32 1, i32 1, i32 1, i8
+    // CHECK: i32 1, i32 1, i32 0, i32 0, i8
+    static const bool b[] = {
+        true, // No-op
+        (int)(-1), // icmp ne 0
+        (uint)0xFFFFFFFF, // icmp ne 0
+        (int16_t)(-1), // icmp ne 0
+        (uint16_t)0xFFFF, // icmp ne 0
+        (int64_t)(-1), // icmp ne 0
+        (uint64_t)0xFFFFFFFFFFFFFFFF, // icmp ne 0
+        (half)(-1.5f), // fcmp ne 0
+        -1.5f, // fcmp ne 0
+        (double)(-1.5f) }; // fcmp ne 0
+    buf_b[0] = bool4(b[0], b[1], b[2], b[3]);
+    buf_b[1] = bool4(b[4], b[5], b[6], b[7]);
+    buf_b[2] = bool4(b[8], b[9], false, false);
+
+    // To signed int
+    // CHECK: i32 1, i32 -1, i32 -1, i32 -1, i8
+    // CHECK: i32 65535, i32 -1, i32 -1, i32 -1, i8
+    // CHECK: i32 -1, i32 -1, i32 0, i32 0, i8
+    static const int i[] = {
+        true, // ZExt
+        (int)(-1), // No-op
+        (uint)0xFFFFFFFF, // No-op (reinterpret)
+        (int16_t)(-1), // SExt
+        (uint16_t)0xFFFF, // ZExt
+        (int64_t)(-1), // Trunc
+        (uint64_t)0xFFFFFFFFFFFFFFFF, // Trunc
+        (half)(-1.5f), // FPToSI
+        -1.5f, // FPToSI
+        (double)(-1.5f) }; // FPToSI
+    buf_i[0] = int4(i[0], i[1], i[2], i[3]);
+    buf_i[1] = int4(i[4], i[5], i[6], i[7]);
+    buf_i[2] = int4(i[8], i[9], 0, 0);
+    
+    // To unsigned int
+    // CHECK: i32 1, i32 -1, i32 -1, i32 -1, i8
+    // CHECK: i32 65535, i32 -1, i32 -1, i32 0, i8
+    // CHECK: i32 0, i32 0, i32 0, i32 0, i8
+    static const uint u[] = {
+        true, // ZExt
+        (int)(-1), // No-op (reinterpret)
+        (uint)0xFFFFFFFF, // No-op
+        (int16_t)(-1), // SExt
+        (uint16_t)0xFFFF, // ZExt
+        (int64_t)(-1), // Trunc
+        (uint64_t)0xFFFFFFFFFFFFFFFF, // Trunc
+        (half)(-1.5f), // FPToUI
+        -1.5f, // FPToUI
+        (double)(-1.5f) }; // FPToUI
+    buf_u[0] = uint4(u[0], u[1], u[2], u[3]);
+    buf_u[1] = uint4(u[4], u[5], u[6], u[7]);
+    buf_u[2] = uint4(u[8], u[9], 0, 0);
+    
+    // To float
+    // CHECK: float 1.000000e+00, float -1.000000e+00, float 0x41F0000000000000, float -1.000000e+00, i8
+    // CHECK: float 6.553500e+04, float -1.000000e+00, float 0x41F0000000000000, float -1.500000e+00, i8
+    // CHECK: float -1.500000e+00, float -1.500000e+00, float 0.000000e+00, float 0.000000e+00, i8
+    static const float f[] = {
+        true, // UIToFP
+        (int)(-1), // SIToFP
+        (uint)0xFFFFFFFF, // UIToFP
+        (int16_t)(-1), // SIToFP
+        (uint16_t)0xFFFF, // UIToFP
+        (int64_t)(-1), // SIToFP
+        (uint64_t)0xFFFFFFFFFFFFFFFF, // UIToFP
+        (half)(-1.5f), // FPExt
+        -1.5f, // No-op
+        (double)(-1.5f) }; // FPTrunc
+    buf_f[0] = float4(f[0], f[1], f[2], f[3]);
+    buf_f[1] = float4(f[4], f[5], f[6], f[7]);
+    buf_f[2] = float4(f[8], f[9], 0, 0);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_indirect.hlsl b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_indirect.hlsl
new file mode 100644
index 000000000..11a8ec4e9
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_indirect.hlsl
@@ -0,0 +1,103 @@
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types %s | FileCheck %s
+
+// Tests conversions between numerical types that happen as part
+// of a larger flat conversion between compound types.
+// Assume that struct-to-struct is representative of all
+// compound/numerical to compound/numerical flat casts.
+// Assume that conversions between 16 and 32-bit types
+// are representative of those between 16 and 64 or 32 and 64-bit types. 
+
+struct B { bool value; };
+struct I16 { int16_t value; };
+struct U16 { uint16_t value; };
+struct I32 { int value; };
+struct U32 { uint value; };
+struct F16 { half value; };
+struct F32 { float value; };
+
+RWStructuredBuffer<B> b;
+RWStructuredBuffer<I16> i16;
+RWStructuredBuffer<U16> u16;
+RWStructuredBuffer<I32> i32;
+RWStructuredBuffer<U32> u32;
+RWStructuredBuffer<F16> f16;
+RWStructuredBuffer<F32> f32;
+
+void main() {
+    int i = 0;
+    int j = 0;
+
+    // Integral casts
+    // CHECK-NOT: zext
+    // CHECK-NOT: sext
+    // CHECK-NOT: trunc
+    i32[i++] = (I32)u32[j++];
+    u32[i++] = (U32)i32[j++];
+
+    // CHECK: trunc
+    // CHECK: trunc
+    // CHECK: trunc
+    // CHECK: trunc
+    i16[i++] = (I16)i32[j++];
+    i16[i++] = (I16)u32[j++];
+    u16[i++] = (U16)i32[j++];
+    u16[i++] = (U16)u32[j++];
+    
+    // CHECK: sext
+    // CHECK: zext
+    // CHECK: sext
+    // CHECK: zext
+    i32[i++] = (I32)i16[j++];
+    i32[i++] = (I32)u16[j++];
+    u32[i++] = (U32)i16[j++];
+    u32[i++] = (U32)u16[j++];
+    
+    // Float casts
+    // CHECK: fpext
+    // CHECK: fptrunc
+    f32[i++] = (F32)f16[j++];
+    f16[i++] = (F16)f32[j++];
+    
+    // Integral/float casts
+    // CHECK: fptosi
+    // CHECK: fptoui
+    // CHECK: sitofp
+    // CHECK: uitofp
+    i32[i++] = (I32)f32[j++];
+    u32[i++] = (U32)f32[j++];
+    f32[i++] = (F32)i32[j++];
+    f32[i++] = (F32)u32[j++];
+
+    // CHECK: fptosi
+    // CHECK: fptoui
+    // CHECK: sitofp
+    // CHECK: uitofp
+    i16[i++] = (I16)f32[j++];
+    u16[i++] = (U16)f32[j++];
+    f16[i++] = (F16)i32[j++];
+    f16[i++] = (F16)u32[j++];
+    
+    // CHECK: fptosi
+    // CHECK: fptoui
+    // CHECK: sitofp
+    // CHECK: uitofp
+    i32[i++] = (I32)f16[j++];
+    u32[i++] = (U32)f16[j++];
+    f32[i++] = (F32)i16[j++];
+    f32[i++] = (F32)u16[j++];
+
+    // Casts to/from bool
+    // CHECK: icmp ne
+    // CHECK: icmp ne
+    // CHECK: fcmp fast une
+    b[i++] = (B)i32[j++];
+    b[i++] = (B)u32[j++];
+    b[i++] = (B)f32[j++];
+    
+    // CHECK: zext
+    // CHECK: zext
+    // CHECK: uitofp
+    i32[i++] = (I32)b[j++];
+    u32[i++] = (U32)b[j++];
+    f32[i++] = (F32)b[j++];
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_matrices.hlsl b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_matrices.hlsl
new file mode 100644
index 000000000..53bc2bfc5
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_matrices.hlsl
@@ -0,0 +1,101 @@
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types %s | FileCheck %s
+
+// Tests conversions between numerical types in matrices.
+// This happens during matrix lowering so it needs its own testing.
+// Assume that conversions between 16 and 32-bit types
+// are representative of those between 16 and 64 or 32 and 64-bit types. 
+
+typedef bool1x1 B;
+typedef int16_t1x1 I16;
+typedef uint16_t1x1 U16;
+typedef int1x1 I32;
+typedef uint1x1 U32;
+typedef half1x1 F16;
+typedef float1x1 F32;
+
+RWStructuredBuffer<B> b;
+RWStructuredBuffer<I16> i16;
+RWStructuredBuffer<U16> u16;
+RWStructuredBuffer<I32> i32;
+RWStructuredBuffer<U32> u32;
+RWStructuredBuffer<F16> f16;
+RWStructuredBuffer<F32> f32;
+
+void main() {
+    int i = 0;
+    int j = 0;
+
+    // Integral casts
+    // CHECK-NOT: zext
+    // CHECK-NOT: sext
+    // CHECK-NOT: trunc
+    i32[i++] = (I32)u32[j++];
+    u32[i++] = (U32)i32[j++];
+
+    // CHECK: trunc
+    // CHECK: trunc
+    // CHECK: trunc
+    // CHECK: trunc
+    i16[i++] = (I16)i32[j++];
+    i16[i++] = (I16)u32[j++];
+    u16[i++] = (U16)i32[j++];
+    u16[i++] = (U16)u32[j++];
+    
+    // CHECK: sext
+    // CHECK: zext
+    // CHECK: sext
+    // CHECK: zext
+    i32[i++] = (I32)i16[j++];
+    i32[i++] = (I32)u16[j++];
+    u32[i++] = (U32)i16[j++];
+    u32[i++] = (U32)u16[j++];
+    
+    // Float casts
+    // CHECK: fpext
+    // CHECK: fptrunc
+    f32[i++] = (F32)f16[j++];
+    f16[i++] = (F16)f32[j++];
+    
+    // Integral/float casts
+    // CHECK: fptosi
+    // CHECK: fptoui
+    // CHECK: sitofp
+    // CHECK: uitofp
+    i32[i++] = (I32)f32[j++];
+    u32[i++] = (U32)f32[j++];
+    f32[i++] = (F32)i32[j++];
+    f32[i++] = (F32)u32[j++];
+
+    // CHECK: fptosi
+    // CHECK: fptoui
+    // CHECK: sitofp
+    // CHECK: uitofp
+    i16[i++] = (I16)f32[j++];
+    u16[i++] = (U16)f32[j++];
+    f16[i++] = (F16)i32[j++];
+    f16[i++] = (F16)u32[j++];
+    
+    // CHECK: fptosi
+    // CHECK: fptoui
+    // CHECK: sitofp
+    // CHECK: uitofp
+    i32[i++] = (I32)f16[j++];
+    u32[i++] = (U32)f16[j++];
+    f32[i++] = (F32)i16[j++];
+    f32[i++] = (F32)u16[j++];
+
+    // Casts to/from bool
+    // CHECK: icmp ne
+    // CHECK: icmp ne
+    // CHECK: fcmp fast une
+    b[i++] = (B)i32[j++];
+    b[i++] = (B)u32[j++];
+    b[i++] = (B)f32[j++];
+    
+    // CHECK: zext
+    // CHECK: zext
+    // CHECK: uitofp
+    i32[i++] = (I32)b[j++];
+    u32[i++] = (U32)b[j++];
+    f32[i++] = (F32)b[j++];
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_to_compound_roundtrip.hlsl b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_to_compound_roundtrip.hlsl
new file mode 100644
index 000000000..371fd67aa
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/expressions/conversions_and_casts/numerical_to_compound_roundtrip.hlsl
@@ -0,0 +1,141 @@
+// RUN: %dxc -T vs_6_0 -E main %s | FileCheck %s
+
+// Test round-trip conversions from scalar/vector/matrices to structs/arrays and back
+// If the round-trip conversion succeeds, we assume both single-way conversions did too.
+// Does not test numerical conversions.
+
+// Whenever possible, use 4 members so we can convert between all structs, arrays, int4 and int2x2
+struct s_int { int x; }; // For scalar tests
+struct s_three_ints { int x, y, z; }; // For truncation tests
+struct s_ints { int x, y, z, w; };
+struct s_vecs { int2 xy, zw; };
+struct s_mat { int2x2 mat; };
+struct s_mat_3x3 { int3x3 mat; }; // For truncation tests
+struct s_structs
+{
+    struct { int x, y; } xy;
+    struct { int x, y; } zw;
+};
+struct s_arrays { int xy[2]; int zw[2]; };
+struct s_empty_structs { struct {} _pre; int x, y; struct {} _mid; int z, w; struct {} _post; };
+
+typedef int a_int[1];
+typedef int a_three_ints[3];
+typedef int a_ints[4];
+typedef int2 a_vecs[2];
+typedef int2x2 a_mat[1];
+typedef int3x3 a_mat_3x3[1];
+typedef struct { int x, y; } a_structs[2];
+typedef int a_ints_2d[2][2];
+
+AppendStructuredBuffer<int4> buffer;
+
+void output_i(int value) { buffer.Append(int4(value, 0, 0, 0)); }
+void output_v1(int1 value) { buffer.Append(int4(value.x, 0, 0, 0)); }
+void output_v2(int2 value) { buffer.Append(int4(value.x, value.y, 0, 0)); }
+void output_v4(int4 value) { buffer.Append(value); }
+void output_m1x1(int1x1 value) { buffer.Append(int4(value._11, 0, 0, 0)); }
+void output_m2x2(int2x2 value) { buffer.Append(int4(value._11, value._12, value._21, value._22)); }
+
+void output_separator() { buffer.Append((int4)8888); }
+
+void main() {
+    int4 v4 = int4(1, 2, 3, 4);
+    int2x2 m2x2 = int4(11, 12, 21, 22);
+    int4x4 m4x4 = int4x4(11, 12, 13, 14, 21, 22, 23, 24, 31, 32, 33, 34, 41, 42, 43, 44); // For truncation tests
+
+    // Scalar cases
+    // CHECK: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // CHECK: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // CHECK: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // CHECK: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // CHECK: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // CHECK: i32 1, i32 0, i32 0, i32 0, i8 15)
+    // CHECK: 8888
+    output_i((int)(s_int)1);
+    output_v1((int1)(s_int)int1(1));
+    output_m1x1((int1x1)(s_int)int1x1(1));
+    output_i((int)(a_int)1);
+    output_v1((int1)(a_int)int1(1));
+    output_m1x1((int1x1)(a_int)int1x1(1));
+    output_separator();
+
+    // 1-to-1 vector/matrix cases
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: 8888
+    output_v4((int4)(s_ints)v4);
+    output_m2x2((int2x2)(s_ints)m2x2);
+    output_v4((int4)(a_ints)v4);
+    output_m2x2((int2x2)(a_ints)m2x2);
+    output_separator();
+    
+    // With numerical conversions: none here (this file does not test numerical conversions)
+    
+    // With vectors in compound type
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: 8888
+    output_v4((int4)(s_vecs)v4);
+    output_m2x2((int2x2)(s_vecs)m2x2);
+    output_v4((int4)(a_vecs)v4);
+    output_m2x2((int2x2)(a_vecs)m2x2);
+    output_separator();
+    
+    // With matrices in compound type
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: 8888
+    output_v4((int4)(s_mat)v4);
+    output_m2x2((int2x2)(s_mat)m2x2);
+    output_v4((int4)(a_mat)v4);
+    output_m2x2((int2x2)(a_mat)m2x2);
+    output_separator();
+    
+    // With homogeneous nesting (struct of structs, array of arrays)
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: 8888
+    output_v4((int4)(s_structs)v4);
+    output_m2x2((int2x2)(s_structs)m2x2);
+    output_v4((int4)(a_ints_2d)v4);
+    output_m2x2((int2x2)(a_ints_2d)m2x2);
+    output_separator();
+    
+    // With heterogeneous nesting (struct of arrays, array of structs)
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: 8888
+    output_v4((int4)(s_arrays)v4);
+    output_m2x2((int2x2)(s_arrays)m2x2);
+    output_v4((int4)(a_structs)v4);
+    output_m2x2((int2x2)(a_structs)m2x2);
+    output_separator();
+
+    // With nested empty struct
+    // CHECK: i32 1, i32 2, i32 3, i32 4, i8 15)
+    // CHECK: i32 11, i32 12, i32 21, i32 22, i8 15)
+    // CHECK: 8888
+    output_v4((int4)(s_empty_structs)v4);
+    output_m2x2((int2x2)(s_empty_structs)m2x2);
+    output_separator();
+
+    // Truncation case
+    // Casting a 2D matrix to a smaller struct or struct to smaller 2D matrix is illegal
+    // CHECK: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // CHECK: i32 1, i32 2, i32 0, i32 0, i8 15)
+    // CHECK: 8888
+    output_v2((int2)(s_three_ints)v4);
+    output_v2((int2)(a_three_ints)v4);
+    output_separator();
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/expressions/operators/matrices/arithmetic.hlsl b/tools/clang/test/CodeGenHLSL/expressions/operators/matrices/arithmetic.hlsl
new file mode 100644
index 000000000..96c2c8213
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/expressions/operators/matrices/arithmetic.hlsl
@@ -0,0 +1,131 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Tests the implementation of unary and binary matrix operators
+
+// Workaround for AppendStructuredBuffer<matrix>.Append bug
+#define Append(buf, val) buf[buf.IncrementCounter()] = (val)
+
+RWStructuredBuffer<int1x1> output_i;
+RWStructuredBuffer<uint1x1> output_u;
+RWStructuredBuffer<float1x1> output_f;
+RWStructuredBuffer<bool1x1> output_b;
+
+void main()
+{
+    int1x1 i1 = int1x1(1);
+    int1x1 i2 = int1x1(2);
+    int1x1 i3 = int1x1(3);
+    int1x1 im1 = int1x1(-1);
+    int1x1 im3 = int1x1(-3);
+    uint1x1 u1 = uint1x1(1);
+    uint1x1 u2 = uint1x1(2);
+    uint1x1 u3 = uint1x1(3);
+    uint1x1 um1 = uint1x1((uint)(-1));
+    float1x1 fm0_5 = float1x1(-0.5);
+    float1x1 f0_5 = float1x1(0.5);
+    float1x1 f1 = float1x1(1);
+    float1x1 f1_5 = float1x1(1.5);
+    float1x1 f2 = float1x1(2);
+
+    // Unary operators, except pre/post inc/dec
+    // CHECK: i32 3, i32 undef
+    Append(output_i, +i3); // Plus
+    // CHECK: i32 -3, i32 undef
+    Append(output_i, -i3); // Minus
+    // CHECK: i32 -4, i32 undef
+    Append(output_i, ~i3); // Not
+    // CHECK: i32 0, i32 undef
+    Append(output_b, !i3); // LNot
+    
+    // CHECK: float 5.000000e-01, float undef
+    Append(output_f, +f0_5); // Plus
+    // CHECK: float -5.000000e-01, float undef
+    Append(output_f, -f0_5); // Minus
+    // CHECK: i32 0, i32 undef
+    Append(output_b, !f0_5); // LNot
+
+    // Binary operators
+    // CHECK: i32 6, i32 undef
+    Append(output_i, i3 * i2); // Mul
+    // CHECK: i32 -1, i32 undef
+    Append(output_i, im3 / i2); // Div
+    // CHECK: i32 -1, i32 undef
+    Append(output_i, im3 % i2); // Rem
+    // CHECK: i32 3, i32 undef
+    Append(output_i, i1 + i2); // Add
+    // CHECK: i32 2, i32 undef
+    Append(output_i, i3 - i1); // Sub
+
+    // CHECK: float 1.000000e+00, float undef
+    Append(output_f, f0_5 * f2); // Mul
+    // CHECK: float 2.000000e+00, float undef
+    Append(output_f, f1 / f0_5); // Div
+    // CHECK: float 5.000000e-01, float undef
+    Append(output_f, f2 % f1_5); // Rem
+    // CHECK: float 2.000000e+00, float undef
+    Append(output_f, f0_5 + f1_5); // Add
+    // CHECK: float -1.000000e+00, float undef
+    Append(output_f, f0_5 - f1_5); // Sub
+
+    // CHECK: i32 6, i32 undef
+    Append(output_i, i3 << i1); // Shl
+    // CHECK: i32 -1, i32 undef
+    Append(output_i, im1 >> i1); // Shr
+    // CHECK: i32 2, i32 undef
+    Append(output_i, i3 & i2); // And
+    // CHECK: i32 2, i32 undef
+    Append(output_i, i3 ^ i1); // Xor
+    // CHECK: i32 3, i32 undef
+    Append(output_i, i2 | i1); // Or
+
+    // CHECK: i32 1, i32 undef
+    Append(output_b, i3 && i2); // LAnd
+    // CHECK: i32 1, i32 undef
+    Append(output_b, i3 || i2); // LOr
+    
+    // CHECK: i32 1, i32 undef
+    Append(output_b, f0_5 && f1_5); // LAnd
+    // CHECK: i32 1, i32 undef
+    Append(output_b, f0_5 || f1_5); // LOr
+
+    // CHECK: i32 1, i32 undef
+    Append(output_u, u3 / u2); // UDiv
+    // CHECK: i32 1, i32 undef
+    Append(output_u, u3 % u2); // URem
+    // CHECK: i32 2147483647, i32 undef
+    Append(output_u, um1 >> u1); // UShr
+
+    // CHECK: i32 1, i32 undef
+    Append(output_b, im1 < i1); // LT
+    // CHECK: i32 0, i32 undef
+    Append(output_b, im1 > i1); // GT
+    // CHECK: i32 1, i32 undef
+    Append(output_b, im1 <= i1); // LE
+    // CHECK: i32 0, i32 undef
+    Append(output_b, im1 >= i1); // GE
+    // CHECK: i32 0, i32 undef
+    Append(output_b, im1 == i1); // EQ
+    // CHECK: i32 1, i32 undef
+    Append(output_b, im1 != i1); // NE
+    // CHECK: i32 0, i32 undef
+    Append(output_b, um1 < u1); // ULT
+    // CHECK: i32 1, i32 undef
+    Append(output_b, um1 > u1); // UGT
+    // CHECK: i32 0, i32 undef
+    Append(output_b, um1 <= u1); // ULE
+    // CHECK: i32 1, i32 undef
+    Append(output_b, um1 >= u1); // UGE
+    
+    // CHECK: i32 1, i32 undef
+    Append(output_b, fm0_5 < f1_5); // LT
+    // CHECK: i32 0, i32 undef
+    Append(output_b, fm0_5 > f1_5); // GT
+    // CHECK: i32 1, i32 undef
+    Append(output_b, fm0_5 <= f1_5); // LE
+    // CHECK: i32 0, i32 undef
+    Append(output_b, fm0_5 >= f1_5); // GE
+    // CHECK: i32 0, i32 undef
+    Append(output_b, fm0_5 == f1_5); // EQ
+    // CHECK: i32 1, i32 undef
+    Append(output_b, fm0_5 != f1_5); // NE
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/expressions/operators/matrices/increment_decrement.hlsl b/tools/clang/test/CodeGenHLSL/expressions/operators/matrices/increment_decrement.hlsl
new file mode 100644
index 000000000..fe28d9e8a
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/expressions/operators/matrices/increment_decrement.hlsl
@@ -0,0 +1,36 @@
+// RUN: %dxc /T vs_6_0 /E main > %s | FileCheck %s | XFail GitHub #1780
+
+// Check that pre/post increment/decrement operators on
+// matrices have the intended semantics for both the original
+// variable and the returned value.
+
+AppendStructuredBuffer<int2> results;
+
+void main()
+{
+  int1x1 variable, result;
+  
+  // Post-increment
+  // CHECK: i32 11, i32 10
+  variable = int1x1(10);
+  result = variable++;
+  results.Append(int2(variable._11, result._11));
+  
+  // Post-decrement
+  // CHECK: i32 9, i32 10
+  variable = int1x1(10);
+  result = variable--;
+  results.Append(int2(variable._11, result._11));
+  
+  // Pre-increment
+  // CHECK: i32 11, i32 11
+  variable = int1x1(10);
+  result = ++variable;
+  results.Append(int2(variable._11, result._11));
+  
+  // Pre-decrement
+  // CHECK: i32 9, i32 9
+  variable = int1x1(10);
+  result = --variable;
+  results.Append(int2(variable._11, result._11));
+}
\ No newline at end of file
diff --git a/tools/clang/test/HLSL/hca/01.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/01.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/01.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/01.hlsl
diff --git a/tools/clang/test/HLSL/hca/02.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/02.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/02.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/02.hlsl
diff --git a/tools/clang/test/HLSL/hca/03.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/03.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/03.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/03.hlsl
diff --git a/tools/clang/test/HLSL/hca/04.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/04.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/04.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/04.hlsl
diff --git a/tools/clang/test/HLSL/hca/05.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/05.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/05.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/05.hlsl
diff --git a/tools/clang/test/HLSL/hca/06.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/06.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/06.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/06.hlsl
diff --git a/tools/clang/test/HLSL/hca/07.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/07.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/07.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/07.hlsl
diff --git a/tools/clang/test/HLSL/hca/08.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/08.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/08.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/08.hlsl
diff --git a/tools/clang/test/HLSL/hca/09.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/09.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/09.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/09.hlsl
diff --git a/tools/clang/test/HLSL/hca/10.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/10.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/10.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/10.hlsl
diff --git a/tools/clang/test/HLSL/hca/11.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/11.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/11.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/11.hlsl
diff --git a/tools/clang/test/HLSL/hca/12.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/12.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/12.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/12.hlsl
diff --git a/tools/clang/test/HLSL/hca/13.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/13.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/13.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/13.hlsl
diff --git a/tools/clang/test/HLSL/hca/14.hlsl b/tools/clang/test/CodeGenHLSL/hoist_constant_array/14.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/hca/14.hlsl
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/14.hlsl
diff --git a/tools/clang/test/HLSL/hca/15.ll b/tools/clang/test/CodeGenHLSL/hoist_constant_array/15.ll
similarity index 100%
rename from tools/clang/test/HLSL/hca/15.ll
rename to tools/clang/test/CodeGenHLSL/hoist_constant_array/15.ll
diff --git a/tools/clang/test/CodeGenHLSL/implicit-casts_Mod.hlsl b/tools/clang/test/CodeGenHLSL/implicit-casts_Mod.hlsl
index ca13014b2..ebf5679c9 100644
--- a/tools/clang/test/CodeGenHLSL/implicit-casts_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/implicit-casts_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T ps_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T ps_5_1 implicit-casts.hlsl
+// fxc.exe /T ps_5_1 implicit-casts.hlsl
 
 // without also putting them in a static assertion
 
diff --git a/tools/clang/test/CodeGenHLSL/indexing-operator_Mod.hlsl b/tools/clang/test/CodeGenHLSL/indexing-operator_Mod.hlsl
index 575b02215..7da3a242b 100644
--- a/tools/clang/test/CodeGenHLSL/indexing-operator_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/indexing-operator_Mod.hlsl
@@ -2,7 +2,7 @@
 
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 indexing-operator.hlsl
+// fxc.exe /T vs_5_1 indexing-operator.hlsl
 
 Buffer g_b;
 StructuredBuffer<float4> g_sb;
diff --git a/tools/clang/test/CodeGenHLSL/literals_Mod.hlsl b/tools/clang/test/CodeGenHLSL/literals_Mod.hlsl
index f57dd7b74..db183a755 100644
--- a/tools/clang/test/CodeGenHLSL/literals_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/literals_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E test -T vs_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T ps_5_1 literals.hlsl
+// fxc.exe /T ps_5_1 literals.hlsl
 
 // without also putting them in a static assertion
 
diff --git a/tools/clang/test/CodeGenHLSL/literals_exact_precision_Mod.hlsl b/tools/clang/test/CodeGenHLSL/literals_exact_precision_Mod.hlsl
index d65a0cde1..ed9f6f264 100644
--- a/tools/clang/test/CodeGenHLSL/literals_exact_precision_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/literals_exact_precision_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -enable-16bit-types -E test -T vs_6_2 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T ps_5_1 literals.hlsl
+// fxc.exe /T ps_5_1 literals.hlsl
 
 // without also putting them in a static assertion
 
diff --git a/tools/clang/test/CodeGenHLSL/loop3.hlsl b/tools/clang/test/CodeGenHLSL/loop3.hlsl
index 11c9d2b51..d8d6f648a 100644
--- a/tools/clang/test/CodeGenHLSL/loop3.hlsl
+++ b/tools/clang/test/CodeGenHLSL/loop3.hlsl
@@ -1,6 +1,5 @@
 // RUN: %dxc -E main -O2 -T ps_6_0 %s | FileCheck %s
 
-// CHECK: !"llvm.loop.unroll.full"
 // CHECK: !"llvm.loop.unroll.disable"
 
 float main(float2 a : A, int3 b : B) : SV_Target
@@ -12,8 +11,7 @@ float main(float2 a : A, int3 b : B) : SV_Target
     if (b.z == 9)
       break;
     [allow_uav_condition]
-    [unroll]
-    for(int j = 0; j < b.y; j++)
+    for(int j = 0; j <= 16; j++)
     {
       [branch]
       if (b.z == 16)
diff --git a/tools/clang/test/CodeGenHLSL/matrix-assignments_Mod.hlsl b/tools/clang/test/CodeGenHLSL/matrix-assignments_Mod.hlsl
index db1cb94c1..011e56027 100644
--- a/tools/clang/test/CodeGenHLSL/matrix-assignments_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/matrix-assignments_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T ps_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_2_0 matrix-assignments.hlsl
+// fxc.exe /T vs_2_0 matrix-assignments.hlsl
 
 float pick_one(float2x2 f2) {
   // TODO: implement swizzling members return f2._m00;
diff --git a/tools/clang/test/CodeGenHLSL/matrix-syntax_Mod.hlsl b/tools/clang/test/CodeGenHLSL/matrix-syntax_Mod.hlsl
index 38b815b3d..1e626f384 100644
--- a/tools/clang/test/CodeGenHLSL/matrix-syntax_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/matrix-syntax_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T ps_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 matrix-syntax.hlsl
+// fxc.exe /T vs_5_1 matrix-syntax.hlsl
 
 matrix m;
 
diff --git a/tools/clang/test/CodeGenHLSL/max_min_literal.hlsl b/tools/clang/test/CodeGenHLSL/max_min_literal.hlsl
new file mode 100644
index 000000000..a6a5c2519
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/max_min_literal.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK-NOT: FMax
+// CHECK-NOT: FMin
+// CHECK-NOT: IMax
+// CHECK-NOT: IMin
+// CHECK-NOT: UMax
+// CHECK-NOT: UMin
+
+
+#define FA float4(0.0f, 1.0f, 2.0f, 3.0f)
+#define FB float4(4.0f, 5.0f, 6.0f, 7.0f)
+#define IA int4(0, 1, 2, 3)
+#define IB int4(4, 5, 6, 7)
+#define UA uint4(0, 1, 2, 3)
+#define UB uint4(4, 5, 6, 7)
+
+
+float4 main(float4 a : A) : SV_TARGET
+{
+  return max(FA,FB) + min(FA,FB) + max(IA,IB) + min(IA,IB) + max(UA,UB) + min(UA,UB);
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/more-operators_Mod.hlsl b/tools/clang/test/CodeGenHLSL/more-operators_Mod.hlsl
index f080c5dce..d64e5eb4b 100644
--- a/tools/clang/test/CodeGenHLSL/more-operators_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/more-operators_Mod.hlsl
@@ -12,7 +12,7 @@
 #endif
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 more-operators.hlsl
+// fxc.exe /T vs_5_1 more-operators.hlsl
 // with vs_2_0 (the default) min16float usage produces a complaint that it's not supported
 
 struct f3_s    { float3 f3; };
diff --git a/tools/clang/test/CodeGenHLSL/object-operators_Mod.hlsl b/tools/clang/test/CodeGenHLSL/object-operators_Mod.hlsl
index f7f1e6c59..e1a65ba3d 100644
--- a/tools/clang/test/CodeGenHLSL/object-operators_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/object-operators_Mod.hlsl
@@ -9,7 +9,7 @@
 #endif
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 scalar-operators.hlsl
+// fxc.exe /T vs_5_1 scalar-operators.hlsl
 // with vs_2_0 (the default) min16float usage produces a complaint that it's not supported
 
 struct f3_s    { float3 f3; };
diff --git a/tools/clang/test/HLSL/pix/AccessTracking.hlsl b/tools/clang/test/CodeGenHLSL/pix/AccessTracking.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/AccessTracking.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/AccessTracking.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugBasic.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugBasic.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugBasic.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugBasic.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugCSParameters.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugCSParameters.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugCSParameters.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugCSParameters.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugFlowControl.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugFlowControl.hlsl
similarity index 53%
rename from tools/clang/test/HLSL/pix/DebugFlowControl.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugFlowControl.hlsl
index fdcc2f053..a2d3cf0ac 100644
--- a/tools/clang/test/HLSL/pix/DebugFlowControl.hlsl
+++ b/tools/clang/test/CodeGenHLSL/pix/DebugFlowControl.hlsl
@@ -1,14 +1,14 @@
-// RUN: %dxc -EFlowControlPS -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation | %FileCheck %s
+// RUN: %dxc -EFlowControlPS -Tps_6_0 %s | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation | %FileCheck %s
 
 // Check that flow control constructs don't break the instrumentation.
 
 // check instrumentation for one branch. 
 
-// CHECK:  %UAVIncResult15 = call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0, i32 0, i32 undef, i32 undef, i32 %IncrementForThisInvocation1)
-// CHECK:  %MaskedForUAVLimit16 = and i32 %UAVIncResult15, 983039
-// CHECK:  %MultipliedForInterest17 = mul i32 %MaskedForUAVLimit16, %OffsetMultiplicand
-// CHECK:  %AddedForInterest18 = add i32 %MultipliedForInterest17, %OffsetAddend
-// CHECK:  call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 %AddedForInterest18, i32 undef, i32 64257, i32 undef, i32 undef, i32 undef, i8 1)
+// CHECK:  %UAVIncResult2 = call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0, i32 0, i32 undef, i32 undef, i32 %IncrementForThisInvocation1)
+// CHECK:  %MaskedForUAVLimit3 = and i32 %UAVIncResult2, 983039
+// CHECK:  %MultipliedForInterest4 = mul i32 %MaskedForUAVLimit3, %OffsetMultiplicand
+// CHECK:  %AddedForInterest5 = add i32 %MultipliedForInterest4, %OffsetAddend
+// CHECK:  call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 %AddedForInterest5, i32 undef, i32 64771, i32 undef, i32 undef, i32 undef, i8 1)
 // CHECK:  switch i32
 // CHECK:    i32 0, label 
 // CHECK:    i32 32, label
diff --git a/tools/clang/test/HLSL/pix/DebugGSParameters.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugGSParameters.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugGSParameters.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugGSParameters.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugPSParameters.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugPSParameters.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugPSParameters.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugPSParameters.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugPreexistingSVInstance.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugPreexistingSVInstance.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugPreexistingSVInstance.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugPreexistingSVInstance.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugPreexistingSVPosition.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugPreexistingSVPosition.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugPreexistingSVPosition.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugPreexistingSVPosition.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugPreexistingSVVertex.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugPreexistingSVVertex.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugPreexistingSVVertex.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugPreexistingSVVertex.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugUAVSize.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugUAVSize.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugUAVSize.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugUAVSize.hlsl
diff --git a/tools/clang/test/HLSL/pix/DebugVSParameters.hlsl b/tools/clang/test/CodeGenHLSL/pix/DebugVSParameters.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/DebugVSParameters.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/DebugVSParameters.hlsl
diff --git a/tools/clang/test/HLSL/pix/constantcolor.hlsl b/tools/clang/test/CodeGenHLSL/pix/constantcolor.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/constantcolor.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/constantcolor.hlsl
diff --git a/tools/clang/test/HLSL/pix/constantcolorFromCB.hlsl b/tools/clang/test/CodeGenHLSL/pix/constantcolorFromCB.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/constantcolorFromCB.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/constantcolorFromCB.hlsl
diff --git a/tools/clang/test/HLSL/pix/constantcolorFromCBint.hlsl b/tools/clang/test/CodeGenHLSL/pix/constantcolorFromCBint.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/constantcolorFromCBint.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/constantcolorFromCBint.hlsl
diff --git a/tools/clang/test/HLSL/pix/constantcolorMRT.hlsl b/tools/clang/test/CodeGenHLSL/pix/constantcolorMRT.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/constantcolorMRT.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/constantcolorMRT.hlsl
diff --git a/tools/clang/test/HLSL/pix/constantcolorOtherSIVs.hlsl b/tools/clang/test/CodeGenHLSL/pix/constantcolorOtherSIVs.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/constantcolorOtherSIVs.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/constantcolorOtherSIVs.hlsl
diff --git a/tools/clang/test/HLSL/pix/constantcolorUAVs.hlsl b/tools/clang/test/CodeGenHLSL/pix/constantcolorUAVs.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/constantcolorUAVs.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/constantcolorUAVs.hlsl
diff --git a/tools/clang/test/HLSL/pix/constantcolorint.hlsl b/tools/clang/test/CodeGenHLSL/pix/constantcolorint.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/constantcolorint.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/constantcolorint.hlsl
diff --git a/tools/clang/test/HLSL/pix/forceEarlyZ.hlsl b/tools/clang/test/CodeGenHLSL/pix/forceEarlyZ.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/forceEarlyZ.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/forceEarlyZ.hlsl
diff --git a/tools/clang/test/HLSL/pix/msaaLoad.hlsl b/tools/clang/test/CodeGenHLSL/pix/msaaLoad.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/msaaLoad.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/msaaLoad.hlsl
diff --git a/tools/clang/test/HLSL/pix/pixelCounter.hlsl b/tools/clang/test/CodeGenHLSL/pix/pixelCounter.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/pixelCounter.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/pixelCounter.hlsl
diff --git a/tools/clang/test/HLSL/pix/pixelCounterAddPixelCost.hlsl b/tools/clang/test/CodeGenHLSL/pix/pixelCounterAddPixelCost.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/pixelCounterAddPixelCost.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/pixelCounterAddPixelCost.hlsl
diff --git a/tools/clang/test/HLSL/pix/pixelCounterEarlyZ.hlsl b/tools/clang/test/CodeGenHLSL/pix/pixelCounterEarlyZ.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/pixelCounterEarlyZ.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/pixelCounterEarlyZ.hlsl
diff --git a/tools/clang/test/HLSL/pix/pixelCounterNoSvPosition.hlsl b/tools/clang/test/CodeGenHLSL/pix/pixelCounterNoSvPosition.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/pixelCounterNoSvPosition.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/pixelCounterNoSvPosition.hlsl
diff --git a/tools/clang/test/HLSL/pix/removeDiscards.hlsl b/tools/clang/test/CodeGenHLSL/pix/removeDiscards.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/pix/removeDiscards.hlsl
rename to tools/clang/test/CodeGenHLSL/pix/removeDiscards.hlsl
diff --git a/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_Zpc.hlsl b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_Zpc.hlsl
new file mode 100644
index 000000000..d3e909506
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_Zpc.hlsl
@@ -0,0 +1,38 @@
+// RUN: %dxc /T vs_6_0 /E main /Zpc %s | FileCheck %s
+
+// Test effective matrix orientations with every combination
+// of default and explicit matrix orientations.
+
+struct S1 { row_major int2x2 mat; };
+struct S2 { int2x2 mat; }; // Default to column_major from /Zpc
+
+#pragma pack_matrix(row_major)
+struct S3 { column_major int2x2 mat; };
+struct S4 { int2x2 mat; }; // Default to row_major from #pragma
+
+#pragma pack_matrix(column_major)
+struct S5 { row_major int2x2 mat; };
+struct S6 { int2x2 mat; }; // Default to column_major from #pragma
+
+RWStructuredBuffer<S1> sb1;
+RWStructuredBuffer<S2> sb2;
+RWStructuredBuffer<S3> sb3;
+RWStructuredBuffer<S4> sb4;
+RWStructuredBuffer<S5> sb5;
+RWStructuredBuffer<S6> sb6;
+
+void main()
+{
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    sb1[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    sb2[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    sb3[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    sb4[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    sb5[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    sb6[0].mat = int2x2(11, 12, 21, 22);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_Zpr.hlsl b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_Zpr.hlsl
new file mode 100644
index 000000000..1947f0477
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_Zpr.hlsl
@@ -0,0 +1,38 @@
+// RUN: %dxc /T vs_6_0 /E main /Zpr %s | FileCheck %s
+
+// Test effective matrix orientations with every combination
+// of default and explicit matrix orientations.
+
+struct S1 { column_major int2x2 mat; };
+struct S2 { int2x2 mat; }; // Default to row_major from /Zpr
+
+#pragma pack_matrix(column_major)
+struct S3 { row_major int2x2 mat; };
+struct S4 { int2x2 mat; }; // Default to column_major from #pragma
+
+#pragma pack_matrix(row_major)
+struct S5 { column_major int2x2 mat; };
+struct S6 { int2x2 mat; }; // Default to row_major from #pragma
+
+RWStructuredBuffer<S1> sb1;
+RWStructuredBuffer<S2> sb2;
+RWStructuredBuffer<S3> sb3;
+RWStructuredBuffer<S4> sb4;
+RWStructuredBuffer<S5> sb5;
+RWStructuredBuffer<S6> sb6;
+
+void main()
+{
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    sb1[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    sb2[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    sb3[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    sb4[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 21, i32 12, i32 22
+    sb5[0].mat = int2x2(11, 12, 21, 22);
+    // CHECK: i32 11, i32 12, i32 21, i32 22
+    sb6[0].mat = int2x2(11, 12, 21, 22);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_ast.hlsl b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_ast.hlsl
new file mode 100644
index 000000000..53934a9d3
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/overrides_ast.hlsl
@@ -0,0 +1,29 @@
+// RUN: %dxc /T vs_6_0 /E main /Zpc -ast-dump %s | FileCheck %s
+
+// Test that the declarations in the ast get annotated with row/column major as expected
+
+void main()
+{
+    // CHECK: rm_Zpc 'row_major int2x2
+    row_major int2x2 rm_Zpc;
+    // CHECK: cm_Zpc 'column_major int2x2
+    column_major int2x2 cm_Zpc;
+    // CHECK: def_Zpc 'int2x2
+    int2x2 def_Zpc; // Default to column_major from (implicit) /Zpc
+
+    #pragma pack_matrix(row_major)
+    // CHECK: rm_prm 'row_major int2x2
+    row_major int2x2 rm_prm;
+    // CHECK: cm_prm 'column_major int2x2
+    column_major int2x2 cm_prm;
+    // CHECK: def_prm 'row_major int2x2
+    int2x2 def_prm; // Default to row_major from #pragma
+
+    #pragma pack_matrix(column_major)
+    // CHECK: rm_pcm 'row_major int2x2
+    row_major int2x2 rm_pcm;
+    // CHECK: cm_pcm 'column_major int2x2
+    column_major int2x2 cm_pcm;
+    // CHECK: def_pcm 'column_major int2x2
+    int2x2 def_pcm; // Default to column_major from #pragma
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/pragma_granularity.hlsl b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/pragma_granularity.hlsl
new file mode 100644
index 000000000..402b7b851
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/pragma_granularity.hlsl
@@ -0,0 +1,31 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Tests the exact place at which #pragma pack_matrix takes effect.
+
+#pragma pack_matrix(column_major)
+
+typedef
+#pragma pack_matrix(row_major)
+int2x2
+// With FXC, we could place the #pragma pack_matrix(column_major) here
+// and still get the type to be row_major. This is not easy to replicate
+// in DXC because the parser looks ahead one token to see if the
+// type is followed by '::', which causes the execution of a #pragma
+// following the 'int2x2' one token early, but it is highly unlikely that this
+// would be a backwards compatibility issue.
+i22
+#pragma pack_matrix(column_major)
+;
+
+void main(out i22 mat : OUT)
+{
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 11)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 12)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 0, i32 21)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 1, i32 22)
+    mat = i22(11, 12, 21, 22);
+
+    // FXC output, for reference:
+    // mov o0.xy, l(11,12,0,0)
+    // mov o1.xy, l(21,22,0,0)
+}
diff --git a/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/pragma_granularity_template_syntax.hlsl b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/pragma_granularity_template_syntax.hlsl
new file mode 100644
index 000000000..5db56af2d
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/preprocessor/pragma_matrix_pack/pragma_granularity_template_syntax.hlsl
@@ -0,0 +1,29 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+#pragma pack_matrix(column_major)
+
+typedef matrix<int, 2, 2
+#pragma pack_matrix(row_major)
+>
+// With FXC, we could place the #pragma pack_matrix(column_major) here
+// and still get the type to be row_major. This is not easy to replicate
+// in DXC because the parser looks ahead one token to see if the
+// templated type is followed by '::', which causes the execution of a #pragma
+// following the '>' one token early, but it is highly unlikely that this
+// would be a backwards compatibility issue.
+i22
+#pragma pack_matrix(column_major)
+;
+
+void main(out i22 mat : OUT)
+{
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 11)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 12)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 0, i32 21)
+    // CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 1, i8 1, i32 22)
+    mat = i22(11, 12, 21, 22);
+
+    // FXC output, for reference:
+    // mov o0.xy, l(11,12,0,0)
+    // mov o1.xy, l(21,22,0,0)
+}
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/FileCheck_prefix.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/FileCheck_prefix.hlsl
new file mode 100644
index 000000000..512692fc2
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/FileCheck_prefix.hlsl
@@ -0,0 +1,6 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck -check-prefix=FOO %s
+
+// CHECK: this should be ignored
+// FOO: main
+
+void main() {}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/anon_struct.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/anon_struct.hlsl
deleted file mode 100644
index a478f6222..000000000
--- a/tools/clang/test/CodeGenHLSL/quick-test/anon_struct.hlsl
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: %dxc -T ps_6_0 -E main %s | FileCheck %s
-
-// CHECK: %"$Globals" = type { %struct.anon }
-// CHECK: @dx.op.cbufferLoadLegacy
-
-struct {
-    int X;
-} CB;
-
-float main(int N : A, int C : B) : SV_TARGET {
-    return CB.X;
-}
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_matrix_conversion.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_matrix_conversion.hlsl
new file mode 100644
index 000000000..28167f3b2
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_matrix_conversion.hlsl
@@ -0,0 +1,30 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: fcmp fast une float {{.*}}, 0.000000e+00
+// CHECK: fcmp fast une float {{.*}}, 0.000000e+00
+// CHECK: fcmp fast une float {{.*}}, 0.000000e+00
+// CHECK: fcmp fast une float {{.*}}, 0.000000e+00
+
+struct Input
+{
+    int2x2 i : I;
+    float2x2 f : F;
+};
+
+struct Output
+{
+    bool2x2 i : I;
+    bool2x2 f : F;
+};
+
+Output main(Input input)
+{
+    Output output;
+    output.i = (bool2x2)input.i;
+    output.f = (bool2x2)input.f;
+    return output;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
new file mode 100644
index 000000000..9499f3ddc
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// This is mostly a regression test for a bug where a bitcast
+// from i32* to i1* was emitted.
+
+// CHECK: alloca i32
+// CHECK: alloca [2 x i32]
+// CHECK-NOT: bitcast
+
+float main() : SV_Target
+{
+    bool b = true;
+    bool2 b2 = b.xx;
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
new file mode 100644
index 000000000..aacf9fd85
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
@@ -0,0 +1,54 @@
+// RUN: %dxc -E main -T vs_6_0 -O0 %s
+
+// Regression test for compiler crashes in complex bool cases
+
+struct AllTheBools
+{
+    bool b : B;
+    bool ba2[2] :BA2;
+    bool1 b1 : B1;
+    bool3 b3 : B3;
+    bool3 b3a2[2] : B3A2;
+    bool1x1 b1x1 : B1X1;
+    bool2x3 b2x3 : B2X3;
+    row_major bool2x3 rmb2x3 : RMB2X3;
+    bool2x3 b2x3a2[2] : B2X3A2;
+};
+
+ConstantBuffer<AllTheBools> cb;
+StructuredBuffer<AllTheBools> sb;
+
+void not(in out bool value) { value = !value; }
+
+void not(in out bool2 value)
+{
+  value = !value;
+  not(value.x);
+  not(value.y);
+}
+
+void not(in out bool3 value)
+{
+  not(value.xz);
+  not(value.y);
+}
+
+AllTheBools main(AllTheBools input, float f : F)
+{
+    AllTheBools output;
+    output.b = input.b ? cb.b : sb[0].b;
+    output.ba2[1] = input.b;
+    output.ba2[0] = input.ba2[1];
+    output.b1 = input.b3.y;
+    output.b3 = input.b.xxx;
+    output.b3a2 = sb[0].b3a2;
+    if (sb[0].b) return cb;
+
+    output.b1x1 = cb.b2x3._22;
+    output.b2x3 = bool2x3(sb[0].b3, bool3(f > 2, input.b, false));
+    output.rmb2x3 = input.b2x3;
+    not(output.rmb2x3[0]);
+    output.b2x3a2[1] = cb.b2x3;
+    output.b2x3a2[0] = input.b2x3;
+    return output;
+}
\ No newline at end of file
diff --git a/tools/clang/test/HLSL/ctbuf.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/ctbuf.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/ctbuf.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/ctbuf.hlsl
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/disasm_struct_layout_ctbuffer.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/disasm_struct_layout_ctbuffer.hlsl
deleted file mode 100644
index 426eb7eee..000000000
--- a/tools/clang/test/CodeGenHLSL/quick-test/disasm_struct_layout_ctbuffer.hlsl
+++ /dev/null
@@ -1,61 +0,0 @@
-// RUN: %dxc -T ps_6_0 -E main -Od %s | FileCheck %s
-
-// All cbuffer and tbuffer declarations should have the same layout
-// We don't care in what order they get printed
-
-// CHECK: float2 a; ; Offset: 0
-// CHECK: float b[2]; ; Offset: 16
-// CHECK: float2 c; ; Offset: 36
-// CHECK: float2 d; ; Offset: 48
-// CHECK: float e; ; Offset: 56
-// CHECK: Size: 60
-
-// CHECK: float2 a; ; Offset: 0
-// CHECK: float b[2]; ; Offset: 16
-// CHECK: float2 c; ; Offset: 36
-// CHECK: float2 d; ; Offset: 48
-// CHECK: float e; ; Offset: 56
-// CHECK: Size: 60
-
-// CHECK: float2 a; ; Offset: 0
-// CHECK: float b[2]; ; Offset: 16
-// CHECK: float2 c; ; Offset: 36
-// CHECK: float2 d; ; Offset: 48
-// CHECK: float e; ; Offset: 56
-// CHECK: Size: 60
-
-// CHECK: float2 a; ; Offset: 0
-// CHECK: float b[2]; ; Offset: 16
-// CHECK: float2 c; ; Offset: 36
-// CHECK: float2 d; ; Offset: 48
-// CHECK: float e; ; Offset: 56
-// CHECK: Size: 60
-
-struct Struct
-{
-  float2 a;
-  struct
-  {
-    float b[2]; // Each element is float4-aligned
-    float2 c; // Fits in b[1].yz
-    float2 d; // Doesn't fit in b[1].w-, so gets its own float4
-  } s;
-  float e; // Fits in d.z
-};
-
-cbuffer _cbl
-{
-  Struct cbl;
-};
-ConstantBuffer<Struct> cb;
-
-tbuffer _tbl
-{
-  Struct tbl;
-};
-TextureBuffer<Struct> tb;
-
-float4 main() : SV_Target
-{
-    return cbl.e + cb.e + tbl.e + tb.e;
-}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/disasm_struct_layout_structbuf.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/disasm_struct_layout_structbuf.hlsl
deleted file mode 100644
index df611ff94..000000000
--- a/tools/clang/test/CodeGenHLSL/quick-test/disasm_struct_layout_structbuf.hlsl
+++ /dev/null
@@ -1,37 +0,0 @@
-// RUN: %dxc -T ps_6_0 -E main -Od %s | FileCheck %s
-
-// StructuredBuffer/RWStructuredBuffer should have the same layout
-
-// CHECK: float2 a; ; Offset: 0
-// CHECK: float b[2]; ; Offset: 8
-// CHECK: float2 c; ; Offset: 16
-// CHECK: float2 d; ; Offset: 24
-// CHECK: float e; ; Offset: 32
-// CHECK: Size: 36
-
-// CHECK: float2 a; ; Offset: 0
-// CHECK: float b[2]; ; Offset: 8
-// CHECK: float2 c; ; Offset: 16
-// CHECK: float2 d; ; Offset: 24
-// CHECK: float e; ; Offset: 32
-// CHECK: Size: 36
-
-struct Struct
-{
-  float2 a;
-  struct
-  {
-    float b[2];
-    float2 c;
-    float2 d;
-  } s;
-  float e;
-};
-
-StructuredBuffer<Struct> sb;
-RWStructuredBuffer<Struct> rwsb;
-
-float4 main() : SV_Target
-{
-    return sb[0].e + rwsb[0].e;
-}
\ No newline at end of file
diff --git a/tools/clang/test/HLSL/eliminate_dynamic_output.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/eliminate_dynamic_output.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output.hlsl
diff --git a/tools/clang/test/HLSL/eliminate_dynamic_output2.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output2.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/eliminate_dynamic_output2.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output2.hlsl
diff --git a/tools/clang/test/HLSL/eliminate_dynamic_output3.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output3.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/eliminate_dynamic_output3.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output3.hlsl
diff --git a/tools/clang/test/HLSL/eliminate_dynamic_output4.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output4.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/eliminate_dynamic_output4.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output4.hlsl
diff --git a/tools/clang/test/HLSL/eliminate_dynamic_output6.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output6.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/eliminate_dynamic_output6.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/eliminate_dynamic_output6.hlsl
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/empty_struct.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/empty_struct.hlsl
deleted file mode 100644
index efd198104..000000000
--- a/tools/clang/test/CodeGenHLSL/quick-test/empty_struct.hlsl
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
-
-// Make sure nest empty struct works.
-// CHECK: main
-
-struct EmptyStruct
-{
-};
-
-struct OuterStruct
-{
-  EmptyStruct s;
-};
-
-float4 main(float4 pos : POSITION) : SV_POSITION
-{
-  OuterStruct os;
-  return float4(0, 0, 0, 0);
-}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/empty_struct2.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/empty_struct2.hlsl
deleted file mode 100644
index fdea53305..000000000
--- a/tools/clang/test/CodeGenHLSL/quick-test/empty_struct2.hlsl
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
-
-// Make sure nest empty struct works.
-// CHECK: main
-
-struct KillerStruct {};
-
-struct InnerStruct {
-  KillerStruct s;
-};
-
-struct OuterStruct {
-  InnerStruct s;
-};
-
-cbuffer Params_cbuffer : register(b0) {
-  OuterStruct constants;
-};
-
-float4 main(float4 pos : POSITION) : SV_POSITION { return float4(0, 0, 0, 0); }
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/entrypoint_name_clash_regression.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/entrypoint_name_clash_regression.hlsl
new file mode 100644
index 000000000..618a4038a
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/entrypoint_name_clash_regression.hlsl
@@ -0,0 +1,11 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Regression test for a bug where any function named
+// like the entry point would get the same mangling,
+// regardless of its scope, causing a name clash (GitHub #1848).
+
+// CHECK: define void @main()
+
+namespace foo { void main() {} }
+struct bar { static void main() {} };
+void main() { foo::main(); bar::main(); }
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/entrypoint_not_in_global_namespace_error.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/entrypoint_not_in_global_namespace_error.hlsl
new file mode 100644
index 000000000..954a0ded1
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/entrypoint_not_in_global_namespace_error.hlsl
@@ -0,0 +1,8 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Test that the entry point must be in the global namespace.
+
+// CHECK: error: missing entry point definition
+
+namespace foo { void main() {} }
+struct bar { static void main() {} };
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/global-var-no-init.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/global-var-no-init.hlsl
deleted file mode 100644
index a14338cb5..000000000
--- a/tools/clang/test/CodeGenHLSL/quick-test/global-var-no-init.hlsl
+++ /dev/null
@@ -1,68 +0,0 @@
-// RUN: %dxc -E main -T ps_6_0 > %s | FileCheck %s
-
-// CBuffer-promoted global variables should not have initializers
-// CHECK-NOT: {{.*var.*}} = constant float 0.000000e+00, align 4
-// CHECK-NOT: {{.*var_init.*}} = constant float 0.000000e+00, align 4
-// CHECK-NOT: {{.*const_var.*}} = constant float 0.000000e+00, align 4
-// CHECK-NOT: {{.*const_var_init.*}} = constant float 0.000000e+00, align 4
-// CHECK-NOT: {{.*extern_var.*}} = constant float 0.000000e+00, align 4
-// CHECK-NOT: {{.*extern_var_init.*}} = constant float 0.000000e+00, align 4
-// CHECK-NOT: {{.*extern_const_var.*}} = constant float 0.000000e+00, align 4
-// CHECK-NOT: {{.*extern_const_var_init.*}} = constant float 0.000000e+00, align 4
-
-// ... they should only exist in their CBuffer declaration
-// CHECK: cbuffer $Globals
-// CHECK: float var;
-// CHECK: float var_init;
-// CHECK: float const_var;
-// CHECK: float const_var_init;
-// CHECK: float extern_var;
-// CHECK: float extern_var_init;
-// CHECK: float extern_const_var;
-// CHECK: float extern_const_var_init;
-
-Texture2D tex;
-float var;
-float var_init = 1;
-const float const_var;
-const float const_var_init = 1;
-extern float extern_var;
-extern float extern_var_init = 1;
-extern const float extern_const_var;
-extern const float extern_const_var_init = 1;
-
-// Those get optimized away
-static float static_var;
-static float static_var_init = 1;
-static const float static_const_var;
-static const float static_const_var_init = 1;
-
-struct s
-{
-  // Those get optimized away
-  static float struct_static_var;
-  // static float struct_static_var_init = 1; // error: struct/class members cannot have default values
-  static const float struct_static_const_var;
-  static const float struct_static_const_var_init = 1;
-};
-
-float s::struct_static_var = 1;
-const float s::struct_static_const_var = 1;
-
-float main() : SV_Target {
-  static float func_static_var;
-  static float func_static_var_init = 1;
-  static const float func_static_const_var;
-  static const float func_static_const_var_init = 1;
-  return tex.Load((int3)0).x
-    + var + var_init
-    + const_var + const_var_init
-    + extern_var + extern_var_init
-    + extern_const_var + extern_const_var_init
-    + static_var + static_var_init
-    + static_const_var + static_const_var_init
-    + s::struct_static_var + /*s::struct_static_var_init*/
-    + s::struct_static_const_var + s::struct_static_const_var_init
-    + func_static_var + func_static_var_init
-    + func_static_const_var + func_static_const_var_init;
-}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
index a0440c733..a3392a0b5 100644
--- a/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
+++ b/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
@@ -7,7 +7,7 @@
 // CHECK: {{.*g_v.*}} = external constant <4 x float>, align 4
 // CHECK: {{.*g_m1.*}} = external constant %class.matrix.int.2.2, align 4
 // CHECK: {{.*g_m2.*}} = external constant %class.matrix.int.2.2, align 4
-// CHECK: {{.*g_b.*}} = external constant i32, align 1
+// CHECK: {{.*g_b.*}} = external constant i32, align 4
 // CHECK: {{.*g_a.*}} = external constant [5 x i32], align 4
 // CHECK: {{.*g_a2d.*}} = external constant [3 x [2 x i32]], align 4
 // CHECK-NOT: {{(.*g_s1.*)(.*static.copy.*)}} = internal global float 0.000000e+00
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/intrinsic_uabs_usign.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/intrinsic_uabs_usign.hlsl
new file mode 100644
index 000000000..4d37711bf
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/intrinsic_uabs_usign.hlsl
@@ -0,0 +1,11 @@
+// RUN: %dxc -E main -T vs_6_0 %s | FileCheck %s
+
+// Test the unsigned version of the abs and sign intrinsics
+
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 -1)
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 1, i32 1)
+
+uint2 main() : OUT
+{
+    return uint2(abs((uint)0xFFFFFFFF), sign((uint)0xFFFFFFFF));
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/memcpy_input_to_static.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/memcpy_input_to_static.hlsl
new file mode 100644
index 000000000..f6f8a0885
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/memcpy_input_to_static.hlsl
@@ -0,0 +1,22 @@
+// RUN: %dxc /T vs_6_0 /E main %s | FileCheck %s
+
+// Regression test for github issue #1724, which was crashing
+// in SROA_Parameter_HLSL due to the memcpy instruction between
+// the VsInput structs and its source bitcasts getting erased
+// while an IRBuilder was pointing to one of these instructions
+// as an insertion point.
+
+// CHECK: ret void
+
+struct VsInput
+{
+	uint instanceId : SV_InstanceID;
+	uint vertexId : SV_VertexID;
+};
+
+static VsInput __vsInput;
+
+void main(VsInput inputs)
+{
+	__vsInput = inputs; // This line triggers compiler crash
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/pack_matrix.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/pack_matrix.hlsl
deleted file mode 100644
index 221195d9b..000000000
--- a/tools/clang/test/CodeGenHLSL/quick-test/pack_matrix.hlsl
+++ /dev/null
@@ -1,35 +0,0 @@
-// RUN: %dxc -E main -T ps_6_0 -ast-dump %s  | FileCheck %s
-
-// CHECK:row_major
-#pragma pack_matrix(row_major)
-
-struct Foo
-{
-  float2x2 a;
-};
-
-// CHECK:column_major
-#pragma pack_matrix(column_major)
-
-struct Bar {
-  float2x2 a;
-};
-
-Foo f;
-Bar b;
-
-// CHECK:row_major
-#pragma pack_matrix(row_major)
-
-float2x2 c;
-
-// CHECK:column_major
-#pragma pack_matrix(column_major)
-float2x2 d;
-
-// CHECK: main 'float4 ()'
-float4 main() : SV_Target
-{
-  float2x2 e = f.a + b.a + c + d;
-  return e;
-}
\ No newline at end of file
diff --git a/tools/clang/test/HLSL/preserve_all_outputs_1.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_1.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/preserve_all_outputs_1.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_1.hlsl
diff --git a/tools/clang/test/HLSL/preserve_all_outputs_2.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_2.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/preserve_all_outputs_2.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_2.hlsl
diff --git a/tools/clang/test/HLSL/preserve_all_outputs_3.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_3.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/preserve_all_outputs_3.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_3.hlsl
diff --git a/tools/clang/test/HLSL/preserve_all_outputs_4.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_4.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/preserve_all_outputs_4.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_4.hlsl
diff --git a/tools/clang/test/HLSL/preserve_all_outputs_5.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_5.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/preserve_all_outputs_5.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_5.hlsl
diff --git a/tools/clang/test/HLSL/preserve_all_outputs_6.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_6.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/preserve_all_outputs_6.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_6.hlsl
diff --git a/tools/clang/test/HLSL/preserve_all_outputs_7.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_7.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/preserve_all_outputs_7.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/preserve_all_outputs_7.hlsl
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/rawbufferloadstore_64bit_6_2.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/rawbufferloadstore_64bit_6_2.hlsl
new file mode 100644
index 000000000..8df5968b3
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/rawbufferloadstore_64bit_6_2.hlsl
@@ -0,0 +1,104 @@
+// // RUN: %dxc -E main -T cs_6_2 %s | FileCheck %s
+
+struct TestData { 
+  int64_t3 v3;
+  int64_t4 v4;
+};
+
+ByteAddressBuffer srv0 : register(t0); 
+RWByteAddressBuffer uav0 : register(u0); 
+
+StructuredBuffer<TestData> srv1 : register(t1);
+RWStructuredBuffer<TestData> uav1 : register(u1);
+
+[numthreads(1, 1, 1)]
+void main(uint GI : SV_GroupIndex) {
+
+  int64_t3 vec3 = srv0.Load<int64_t3>(0);
+// CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srv0_texture_rawbuf, i32 0, i32 undef, i8 15, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 2
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 3
+// CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srv0_texture_rawbuf, i32 16, i32 undef, i8 3, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 1
+// CHECK: zext i32 %{{[0-9]+}} to i64
+// CHECK: zext i32 %{{[0-9]+}} to i64
+// CHECK: shl i64 %{{[0-9]+}}, 32
+// CHECK: or i64 %{{[0-9]+}}, %{{[0-9]+}}
+
+   uav0.Store(0, vec3);
+// CHECK: trunc i64 %{{[0-9]+}} to i32
+// CHECK: lshr i64 %{{[0-9]+}}, 32
+// CHECK: trunc i64 %{{[0-9]+}} to i32
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %uav0_UAV_rawbuf, i32 0, i32 undef, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i8 15, i32 8) 
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %uav0_UAV_rawbuf, i32 16, i32 undef, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 undef, i32 undef, i8 3, i32 8) 
+
+  int64_t4 vec4 = srv0.Load<int64_t4>(0);
+// CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srv0_texture_rawbuf, i32 0, i32 undef, i8 15, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 2
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 3
+// CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srv0_texture_rawbuf, i32 16, i32 undef, i8 15, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 2
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 3
+// CHECK: zext i32 %{{[0-9]+}} to i64
+// CHECK: zext i32 %{{[0-9]+}} to i64
+// CHECK: shl i64 %{{[0-9]+}}, 32
+// CHECK: or i64 %{{[0-9]+}}, %{{[0-9]+}}
+
+  uav0.Store(0, vec4);
+// CHECK: trunc i64 %{{[0-9]+}} to i32
+// CHECK: lshr i64 %{{[0-9]+}}, 32
+// CHECK: trunc i64 %{{[0-9]+}} to i32
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %uav0_UAV_rawbuf, i32 0, i32 undef, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i8 15, i32 8) 
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %uav0_UAV_rawbuf, i32 16, i32 undef, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i8 15, i32 8) 
+
+  int64_t3 svec3 = srv1[0].v3;
+// CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srv1_texture_structbuf, i32 0, i32 0, i8 15, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 2
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 3
+// CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srv1_texture_structbuf, i32 0, i32 16, i8 3, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 1
+// CHECK: zext i32 %{{[0-9]+}} to i64
+// CHECK: zext i32 %{{[0-9]+}} to i64
+// CHECK: shl i64 %{{[0-9]+}}, 32
+// CHECK: or i64 %{{[0-9]+}}, %{{[0-9]+}}
+
+  uav1[0].v3 = svec3;
+// CHECK: trunc i64 %{{[0-9]+}} to i32
+// CHECK: lshr i64 %{{[0-9]+}}, 32
+// CHECK: trunc i64 %{{[0-9]+}} to i32
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %uav1_UAV_structbuf, i32 0, i32 0, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i8 15, i32 8) 
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %uav1_UAV_structbuf, i32 0, i32 16, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 undef, i32 undef, i8 3, i32 8) 
+
+  int64_t4 svec4 = srv1[0].v4;
+// CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srv1_texture_structbuf, i32 0, i32 24, i8 15, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 2
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 3
+// CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srv1_texture_structbuf, i32 0, i32 40, i8 15, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 2
+// CHECK: extractvalue %dx.types.ResRet.i32 %{{[a-zA-Z0-9]+}}, 3
+// CHECK: zext i32 %{{[0-9]+}} to i64
+// CHECK: zext i32 %{{[0-9]+}} to i64
+// CHECK: shl i64 %{{[0-9]+}}, 32
+// CHECK: or i64 %{{[0-9]+}}, %{{[0-9]+}}
+
+  uav1[0].v4 = svec4;
+// CHECK: trunc i64 %{{[0-9]+}} to i32
+// CHECK: lshr i64 %{{[0-9]+}}, 32
+// CHECK: trunc i64 %{{[0-9]+}} to i32
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %uav1_UAV_structbuf, i32 0, i32 24, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i8 15, i32 8) 
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %uav1_UAV_structbuf, i32 0, i32 40, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i8 15, i32 8) 
+};
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/rawbufferloadstore_64bit_6_3.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/rawbufferloadstore_64bit_6_3.hlsl
new file mode 100644
index 000000000..2b6131fac
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/rawbufferloadstore_64bit_6_3.hlsl
@@ -0,0 +1,54 @@
+// // RUN: %dxc -E main -T cs_6_3 %s | FileCheck %s
+
+struct TestData { 
+  int64_t3 v3;
+  int64_t4 v4;
+};
+
+ByteAddressBuffer srv0 : register(t0); 
+RWByteAddressBuffer uav0 : register(u0); 
+
+StructuredBuffer<TestData> srv1 : register(t1);
+RWStructuredBuffer<TestData> uav1 : register(u1);
+
+[numthreads(1, 1, 1)]
+void main(uint GI : SV_GroupIndex) {
+
+  int64_t3 vec3 = srv0.Load<int64_t3>(0);
+// CHECK: call %dx.types.ResRet.i64 @dx.op.rawBufferLoad.i64(i32 139, %dx.types.Handle %srv0_texture_rawbuf, i32 0, i32 undef, i8 7, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 2
+
+  uav0.Store(0, vec3);
+// CHECK: call void @dx.op.rawBufferStore.i64(i32 140, %dx.types.Handle %uav0_UAV_rawbuf, i32 0, i32 undef, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i64 undef, i8 7, i32 8)
+
+  int64_t4 vec4 = srv0.Load<int64_t4>(0);
+// CHECK: call %dx.types.ResRet.i64 @dx.op.rawBufferLoad.i64(i32 139, %dx.types.Handle %srv0_texture_rawbuf, i32 0, i32 undef, i8 15, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 2
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 3
+
+  uav0.Store(0, vec4);
+// CHECK: call void @dx.op.rawBufferStore.i64(i32 140, %dx.types.Handle %uav0_UAV_rawbuf, i32 0, i32 undef, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i8 15, i32 8)
+
+  int64_t3 svec3 = srv1[0].v3;
+// CHECK: call %dx.types.ResRet.i64 @dx.op.rawBufferLoad.i64(i32 139, %dx.types.Handle %srv1_texture_structbuf, i32 0, i32 0, i8 7, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 2
+
+  uav1[0].v3 = svec3;
+// CHECK: call void @dx.op.rawBufferStore.i64(i32 140, %dx.types.Handle %uav1_UAV_structbuf, i32 0, i32 0, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i64 undef, i8 7, i32 8)
+
+  int64_t4 svec4 = srv1[0].v4;
+// CHECK: call %dx.types.ResRet.i64 @dx.op.rawBufferLoad.i64(i32 139, %dx.types.Handle %srv1_texture_structbuf, i32 0, i32 24, i8 15, i32 8)
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 0
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 1
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 2
+// CHECK: extractvalue %dx.types.ResRet.i64 %{{[0-9a-zA-Z]+}}, 3
+
+  uav1[0].v4 = svec4;
+// CHECK: call void @dx.op.rawBufferStore.i64(i32 140, %dx.types.Handle %uav1_UAV_structbuf, i32 0, i32 24, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}, i8 15, i32 8)
+};
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/semantics_conflict_warning.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/semantics_conflict_warning.hlsl
new file mode 100644
index 000000000..3b9cde10a
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/semantics_conflict_warning.hlsl
@@ -0,0 +1,14 @@
+// RUN: %dxc -T ps_6_0 -E main -WX %s | FileCheck %s
+
+// CHECK: semantic 'NORMAL0' on field overridden by function or enclosing type
+
+struct Input
+{
+    float a;
+    float b : NORMAL0;
+};
+
+float main(Input input : TEXCOORD0) : SV_Target
+{
+    return 1;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/semantics_conflict_warning_return_struct.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/semantics_conflict_warning_return_struct.hlsl
new file mode 100644
index 000000000..e7b1cf84e
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/semantics_conflict_warning_return_struct.hlsl
@@ -0,0 +1,17 @@
+// RUN: %dxc -T ps_6_0 -E main -WX %s | FileCheck %s
+
+// CHECK: semantic 'COLOR0' on field overridden by function or enclosing type
+
+struct Output
+{
+    float a;
+    float b : COLOR0;
+};
+
+Output main() : SV_Target
+{
+    Output output;
+    output.a = 1;
+    output.b = 2;
+    return output;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/semantics_duplicate_param_error.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/semantics_duplicate_param_error.hlsl
new file mode 100644
index 000000000..0d12ede02
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/semantics_duplicate_param_error.hlsl
@@ -0,0 +1,8 @@
+// RUN: %dxc -T ps_6_0 -E main %s | FileCheck %s
+
+// Tests that we prevent multiple inputs/outputs from having the same semantics.
+
+// CHECK: validation errors
+// CHECK: Semantic 'TEXCOORD' overlap at 0
+
+float main(float u : TEXCOORD0, float v : TEXCOORD0) : SV_Target { return 1; }
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/semantics_duplicate_struct_error.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/semantics_duplicate_struct_error.hlsl
new file mode 100644
index 000000000..3e6d8f8d3
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/semantics_duplicate_struct_error.hlsl
@@ -0,0 +1,18 @@
+// RUN: %dxc -T ps_6_0 -E main %s | FileCheck %s
+
+// Tests that we prevent multiple inputs/outputs from having the same semantics.
+// Also serves as a regression test for an SROA crash in the struct case.
+
+// CHECK: validation errors
+// CHECK: Semantic 'TEXCOORD' overlap at 0
+
+struct Texcoords
+{
+    float u : TEXCOORD0;
+    float v : TEXCOORD0;
+};
+
+float main(Texcoords texcoords) : SV_Target
+{
+    return 1;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/simple_gvn_hoist.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/simple_gvn_hoist.hlsl
new file mode 100644
index 000000000..5a07c6230
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/simple_gvn_hoist.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc -T ps_6_0 -E main %s | FileCheck %s
+
+// CHECK: call %dx.types.ResRet.f32 @dx.op.sample.f32
+// Make sure only one sample call exists.
+// CHECK-NOT:call %dx.types.ResRet.f32 @dx.op.sample.f32
+
+Texture2D<float4> tex;
+SamplerState ss;
+
+float4 main(float2 uv:UV, float2 a:A) : SV_Target {
+  float4 r = 0;
+  if (a.x > 0) {
+    r = tex.Sample(ss, uv)-1;
+  } else {
+    if (a.y > 0)
+      r = tex.Sample(ss, uv);
+    else
+      r = tex.Sample(ss, uv) + 3;
+  }
+
+  return r;
+
+}
\ No newline at end of file
diff --git a/tools/clang/test/HLSL/sm-fail.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/sm-fail.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/sm-fail.hlsl
rename to tools/clang/test/CodeGenHLSL/quick-test/sm-fail.hlsl
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/sroa_memcpy_from_cbuf_nested_field.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/sroa_memcpy_from_cbuf_nested_field.hlsl
new file mode 100644
index 000000000..0323070c9
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/sroa_memcpy_from_cbuf_nested_field.hlsl
@@ -0,0 +1,14 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// Regression test for a crash in SROA when replacing a memcpy
+// whose source is a CBuffer value at a deep nesting level (multiple GEPs).
+
+// CHECK: ret void
+
+struct A { float f1[1]; };
+struct B { A a1[1]; };
+struct C { B b; };
+float one(B b) { return 1; }
+B getB(C c) { return c.b; }
+cbuffer CB { C g_c; }
+float main() : SV_Target { return one(getB(g_c)); }
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/streamout_input_before_output_different_structs.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/streamout_input_before_output_different_structs.hlsl
new file mode 100644
index 000000000..64cc9886c
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/streamout_input_before_output_different_structs.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -E main -T gs_6_0 %s | FileCheck %s
+
+// Regression test for an SROA bug where flattening the output stream argument
+// would not handle the case where its input had already been SROA'd.
+
+// CHECK: define void @main()
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 0)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float {{.*}})
+// CHECK: call void @dx.op.emitStream(i32 97, i8 0)
+// CHECK: ret void
+
+struct GSIn { float value : TEXCOORD0; };
+struct GSOut { float value : TEXCOORD0; };
+
+[maxvertexcount(1)]
+void main(point GSIn input[1], inout PointStream<GSOut> output)
+{
+    output.Append(input[0]);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/streamout_input_before_output_same_struct.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/streamout_input_before_output_same_struct.hlsl
new file mode 100644
index 000000000..94c3ea762
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/streamout_input_before_output_same_struct.hlsl
@@ -0,0 +1,18 @@
+// RUN: %dxc -E main -T gs_6_0 %s | FileCheck %s
+
+// Regression test for an SROA bug where flattening the output stream argument
+// would not handle the case where its input had already been SROA'd.
+
+// CHECK: define void @main()
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 0)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float {{.*}})
+// CHECK: call void @dx.op.emitStream(i32 97, i8 0)
+// CHECK: ret void
+
+struct GSInOut { float value : TEXCOORD0; };
+
+[maxvertexcount(1)]
+void main(point GSInOut input[1], inout PointStream<GSInOut> output)
+{
+    output.Append(input[0]);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/streamout_matrix_all_orientations.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/streamout_matrix_all_orientations.hlsl
new file mode 100644
index 000000000..b7c15f525
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/streamout_matrix_all_orientations.hlsl
@@ -0,0 +1,45 @@
+// RUN: %dxc -E main -T gs_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 0)
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 0)
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 1, i32 0, i8 0, i32 0)
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 1, i32 0, i8 1, i32 0)
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 2, i32 0, i8 0, i32 0)
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 2, i32 1, i8 0, i32 0)
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 3, i32 0, i8 0, i32 0)
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 3, i32 1, i8 0, i32 0)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float {{.*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float {{.*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 0, float {{.*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 1, i8 0, float {{.*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 2, i32 0, i8 0, float {{.*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 2, i32 0, i8 1, float {{.*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 3, i32 0, i8 0, float {{.*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 3, i32 1, i8 0, float {{.*}})
+
+struct GSIn
+{
+    row_major float1x2 a : A;
+    row_major float1x2 b : B;
+    column_major float1x2 c : C;
+    column_major float1x2 d : D;
+};
+
+struct GSOut
+{
+    row_major float1x2 a : A;
+    column_major float1x2 b : B;
+    row_major float1x2 c : C;
+    column_major float1x2 d : D;
+};
+
+[maxvertexcount(1)]
+void main(point GSIn input[1], inout PointStream<GSOut> output)
+{
+    GSOut result;
+    result.a = input[0].a;
+    result.b = input[0].b;
+    result.c = input[0].c;
+    result.d = input[0].d;
+    output.Append(result);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/streamout_multiple_aggregates.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/streamout_multiple_aggregates.hlsl
new file mode 100644
index 000000000..94f8eb625
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/streamout_multiple_aggregates.hlsl
@@ -0,0 +1,20 @@
+// RUN: %dxc -E main -T gs_6_0 %s | FileCheck %s
+
+// Regression test for GitHub #1812
+// "Crash when using multiple nested structs in GS"
+// Due to multiple SROA passes processing the same original Append intrinsic,
+// and redundantly queuing it for deletion.
+
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 0.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 0, float 0.000000e+00)
+
+struct Inner1 { float t : A; };
+struct Inner2 { float t : B; };
+struct Outer { Inner1 i1; Inner2 i2; };
+
+[maxvertexcount(1)]
+void main(point Outer input[1], inout PointStream<Outer> output)
+{
+    Outer o = (Outer)0;
+    output.Append(o);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/streamout_output_before_input_different_structs.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/streamout_output_before_input_different_structs.hlsl
new file mode 100644
index 000000000..3d84c5f75
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/streamout_output_before_input_different_structs.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -E main -T gs_6_0 %s | FileCheck %s
+
+// Regression test for an SROA bug where flattening the output stream argument
+// would not handle the case where its input had already been SROA'd.
+
+// CHECK: define void @main()
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 0)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float {{.*}})
+// CHECK: call void @dx.op.emitStream(i32 97, i8 0)
+// CHECK: ret void
+
+struct GSIn { float value : TEXCOORD0; };
+struct GSOut { float value : TEXCOORD0; };
+
+[maxvertexcount(1)]
+void main(inout PointStream<GSOut> output, point GSIn input[1])
+{
+    output.Append(input[0]);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/streamout_output_before_input_same_struct.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/streamout_output_before_input_same_struct.hlsl
new file mode 100644
index 000000000..94d91ae96
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/streamout_output_before_input_same_struct.hlsl
@@ -0,0 +1,18 @@
+// RUN: %dxc -E main -T gs_6_0 %s | FileCheck %s
+
+// Regression test for an SROA bug where flattening the output stream argument
+// would not handle the case where its input had already been SROA'd.
+
+// CHECK: define void @main()
+// CHECK: call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 0)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float {{.*}})
+// CHECK: call void @dx.op.emitStream(i32 97, i8 0)
+// CHECK: ret void
+
+struct GSInOut { float value : TEXCOORD0; };
+
+[maxvertexcount(1)]
+void main(inout PointStream<GSInOut> output, point GSInOut input[1])
+{
+    output.Append(input[0]);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type.hlsl
new file mode 100644
index 000000000..3393d7e18
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc /Tvs_6_0 /Evs_main %s | FileCheck %s
+
+// Checks that a matrix returned by value under #pragma pack_matrix(row_major)
+// keeps its element order: the four output components must be 1, 2, 3, 4.
+
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 2.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 3.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 4.000000e+00)
+
+#pragma pack_matrix (row_major)
+
+float2x2 GetMatrix()
+{
+ float2x2 mat = {1, 2, 3, 4};
+ return mat;
+}
+
+float4 vs_main() : SV_POSITION
+{
+ float2x2 mat = GetMatrix();
+ return float4(mat[0][0], mat[0][1], mat[1][0], mat[1][1]);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_multiple_calls.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_multiple_calls.hlsl
new file mode 100644
index 000000000..6e9f2b27a
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_multiple_calls.hlsl
@@ -0,0 +1,31 @@
+// RUN: %dxc /Tvs_6_0 /Evs_main %s | FileCheck %s
+
+// Checks that summing matrices returned by functions declared under different
+// pack_matrix pragmas is element-wise: the outputs must be 2, 4, 6, 8.
+
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 2.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 4.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 6.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 8.000000e+00)
+
+#pragma pack_matrix (row_major)
+
+float2x2 GetMatrix1()
+{
+ float2x2 mat = {1, 2, 3, 4};
+ return mat;
+}
+
+#pragma pack_matrix (column_major)
+
+float2x2 GetMatrix()
+{
+ float2x2 mat = {1, 2, 3, 4};
+ return mat + GetMatrix1();
+}
+
+float4 vs_main() : SV_POSITION
+{
+ float2x2 mat = GetMatrix();
+ return float4(mat[0][0], mat[0][1], mat[1][0], mat[1][1]);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_with_branches.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_with_branches.hlsl
new file mode 100644
index 000000000..9917cdf00
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_with_branches.hlsl
@@ -0,0 +1,31 @@
+// RUN: %dxc /Tvs_6_0 /Evs_main %s | FileCheck %s
+
+// Checks a row_major matrix returned from a function with one return per branch.
+// GetMatrix(1) takes the first branch, so the outputs must be 1, 2, 3, 4.
+
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 2.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 3.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 4.000000e+00)
+
+#pragma pack_matrix (row_major)
+
+float2x2 GetMatrix(int i)
+{
+ if(i > 0)
+ {
+   float2x2 mat = {1, 2, 3, 4};
+   return mat;
+ }
+ else
+ {
+   float2x2 mat = {2, 3, 3, 4};
+   return mat;
+ }
+}
+
+float4 vs_main() : SV_POSITION
+{
+ float2x2 mat = GetMatrix(1);
+ return float4(mat[0][0], mat[0][1], mat[1][0], mat[1][1]);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_with_branches_ast.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_with_branches_ast.hlsl
new file mode 100644
index 000000000..f93481f48
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/test_matrix_orientation_matrix_ret_type_with_branches_ast.hlsl
@@ -0,0 +1,30 @@
+// RUN: %dxc -ast-dump /Tvs_6_0 /Evs_main %s | FileCheck %s
+
+// Checks in the AST dump that GetMatrix's return type is row_major (the pragma
+// preceding its declaration) even though pack_matrix pragmas appear in its body.
+
+// CHECK: GetMatrix 'row_major float2x2 (int)'
+
+#pragma pack_matrix (row_major)
+
+float2x2 GetMatrix(int i)
+{
+ if(i > 0)
+ {
+   #pragma pack_matrix (column_major)
+   float2x2 mat = {1, 2, 3, 4};
+   return mat;
+ }
+ else
+ {
+   #pragma pack_matrix (row_major)
+   float2x2 mat = {2, 3, 3, 4};
+   return mat;
+ }
+}
+
+float4 vs_main() : SV_POSITION
+{
+ float2x2 mat = GetMatrix(1);
+ return float4(mat[0][0], mat[0][1], mat[1][0], mat[1][1]);
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/texture_of_array_error.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/texture_of_array_error.hlsl
new file mode 100644
index 000000000..1e85fb9b3
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/texture_of_array_error.hlsl
@@ -0,0 +1,13 @@
+// RUN: %dxc -T ps_6_0 -E main %s | FileCheck %s
+// CHECK: error: texture resource texel type must be scalar or vector
+typedef float a[4];
+Texture2D<a> t;
+RWTexture2D<a> rwt;
+SamplerState s;
+float main(float2 f2 : F2, int2 i2 : I) : SV_TARGET
+{
+    // Ensure semantic analysis doesn't crash
+    rwt[i2] = t.Load(int3(i2, 0));
+    t.Gather(s, f2, i2); // Test template resolution with INTRIN_COMPTYPE_FROM_TYPE_ELT0
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/texture_of_matrix_error.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/texture_of_matrix_error.hlsl
new file mode 100644
index 000000000..c5e37ba3d
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/texture_of_matrix_error.hlsl
@@ -0,0 +1,13 @@
+// RUN: %dxc -T ps_6_0 -E main %s | FileCheck %s
+// Note: FXC accepts this
+// CHECK: error: texture resource texel type must be scalar or vector
+Texture2D<float1x1> t;
+RWTexture2D<float1x1> rwt;
+SamplerState s;
+float main(float2 f2 : F2, int2 i2 : I) : SV_TARGET
+{
+    // Ensure semantic analysis doesn't crash
+    rwt[i2] = t.Load(int3(i2, 0));
+    t.Gather(s, f2, i2); // Test template resolution with INTRIN_COMPTYPE_FROM_TYPE_ELT0
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/texture_of_struct_error.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/texture_of_struct_error.hlsl
new file mode 100644
index 000000000..a10168eec
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/texture_of_struct_error.hlsl
@@ -0,0 +1,13 @@
+// RUN: %dxc -T ps_6_0 -E main %s | FileCheck %s
+// CHECK: error: texture resource texel type must be scalar or vector
+struct Struct { float f; };
+Texture2D<Struct> t;
+RWTexture2D<Struct> rwt;
+SamplerState s;
+float main(float2 f2 : F2, int2 i2 : I) : SV_TARGET
+{
+    // Ensure semantic analysis doesn't crash
+    rwt[i2] = t.Load(int3(i2, 0));
+    t.Gather(s, f2, i2); // Test template resolution with INTRIN_COMPTYPE_FROM_TYPE_ELT0
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/scalar-assignments_Mod.hlsl b/tools/clang/test/CodeGenHLSL/scalar-assignments_Mod.hlsl
index 86a172afb..f8fdc8f38 100644
--- a/tools/clang/test/CodeGenHLSL/scalar-assignments_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/scalar-assignments_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T vs_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 scalar-assignments.hlsl
+// fxc.exe /T vs_5_1 scalar-assignments.hlsl
 // with vs_2_0 (the default) min16float usage produces a complaint that it's not supported
 
 void main() {
diff --git a/tools/clang/test/CodeGenHLSL/scalar-operators-assign_Mod.hlsl b/tools/clang/test/CodeGenHLSL/scalar-operators-assign_Mod.hlsl
index 81b6d0a89..341c31eab 100644
--- a/tools/clang/test/CodeGenHLSL/scalar-operators-assign_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/scalar-operators-assign_Mod.hlsl
@@ -16,7 +16,7 @@
 #endif
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 scalar-operators-assign.hlsl
+// fxc.exe /T vs_5_1 scalar-operators-assign.hlsl
 // with vs_2_0 (the default) min16float usage produces a complaint that it's not supported
 
 float4 plain(float4 param4 : FOO) : FOO {
diff --git a/tools/clang/test/CodeGenHLSL/scalar-operators_Mod.hlsl b/tools/clang/test/CodeGenHLSL/scalar-operators_Mod.hlsl
index e9d8831d7..9bfef53cb 100644
--- a/tools/clang/test/CodeGenHLSL/scalar-operators_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/scalar-operators_Mod.hlsl
@@ -9,7 +9,7 @@
 // :FXC_VERIFY_ARGUMENTS: /E plain /T vs_5_1
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 scalar-operators.hlsl
+// fxc.exe /T vs_5_1 scalar-operators.hlsl
 // with vs_2_0 (the default) min16float usage produces a complaint that it's not supported
 
 float4 plain(float4 param4 : FOO) : FOO {
@@ -41,7 +41,7 @@ float4 plain(float4 param4 : FOO) : FOO {
     
     // For two floating-point types, they will widen to the largest one.
     
-  // Generated by running and modifying %sdxroot%\windows\directx\dxg\HLSL\test\lib\HLSL\operators.js
+  // Generated by running and modifying HLSL\test\lib\HLSL\operators.js
 
   return param4;
 }
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/spec_Mod.hlsl b/tools/clang/test/CodeGenHLSL/spec_Mod.hlsl
index 27c82f626..98a8543e7 100644
--- a/tools/clang/test/CodeGenHLSL/spec_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/spec_Mod.hlsl
@@ -4,7 +4,7 @@
 // specification.
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T ps_5_1 spec.hlsl
+// fxc.exe /T ps_5_1 spec.hlsl
 
 namespace ns_general {
 // * General
diff --git a/tools/clang/test/CodeGenHLSL/struct-assignmentsFull_Mod.hlsl b/tools/clang/test/CodeGenHLSL/struct-assignmentsFull_Mod.hlsl
index 69a7abe3e..73bbca875 100644
--- a/tools/clang/test/CodeGenHLSL/struct-assignmentsFull_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/struct-assignmentsFull_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T ps_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 struct-assignments.hlsl
+// fxc.exe /T vs_5_1 struct-assignments.hlsl
 
 struct s_f {
  float f;
diff --git a/tools/clang/test/CodeGenHLSL/struct-assignments_Mod.hlsl b/tools/clang/test/CodeGenHLSL/struct-assignments_Mod.hlsl
index 5c72dd56e..0e17ee526 100644
--- a/tools/clang/test/CodeGenHLSL/struct-assignments_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/struct-assignments_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T ps_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 struct-assignments.hlsl
+// fxc.exe /T vs_5_1 struct-assignments.hlsl
 
 struct s_f {
  float f;
diff --git a/tools/clang/test/CodeGenHLSL/uint64_1.hlsl b/tools/clang/test/CodeGenHLSL/uint64_1.hlsl
index 069b729d8..02fcc6d27 100644
--- a/tools/clang/test/CodeGenHLSL/uint64_1.hlsl
+++ b/tools/clang/test/CodeGenHLSL/uint64_1.hlsl
@@ -5,8 +5,6 @@
 // CHECK: sdiv i64
 // CHECK: shl i64
 // CHECK: mul i64
-// For iabs.
-// CHECK: IMax
 // CHECK: UMax
 // CHECK: UMin
 // CHECK: uitofp i64
@@ -32,7 +30,7 @@ float4 main(float idx1 : Idx1, float idx2 : Idx2, int2 c : C) : SV_Target
   buf2[idx1*3].b = r;
 
   r *= b << 5;
-  r = abs(r);
+  r = abs(r); // No-op on uints
   r = max(r, c.x);
   r = min(r, c.y);
   return r;
diff --git a/tools/clang/test/CodeGenHLSL/unroll/2d_array.hlsl b/tools/clang/test/CodeGenHLSL/unroll/2d_array.hlsl
new file mode 100644
index 000000000..1fee066b3
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/2d_array.hlsl
@@ -0,0 +1,28 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK-NOT: call i32 @dx.op.bufferUpdateCounter
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[2][2] = { buf0, buf1, buf2, buf3, };
+
+  [unroll]
+  for (uint j = 0; j < 4; j++) {
+    if (g_cond == j) {
+      buffers[j/2][j%2].Append(1);
+      return 10;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/complex.hlsl b/tools/clang/test/CodeGenHLSL/unroll/complex.hlsl
new file mode 100644
index 000000000..5105bc708
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/complex.hlsl
@@ -0,0 +1,33 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+uint g_cond[3];
+uint g_bound;
+
+float main() : SV_Target {
+
+  float foo = 10;
+
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+    
+    if (i == g_cond[0]) {
+      foo += 100;
+      break;
+    }
+    else if (i == g_cond[1]) {
+      foo += 200;
+      break;
+    }
+    else if (i == g_cond[2]) { 
+      return 10;
+    }
+    foo++;
+  }
+
+  if (foo > 300) {
+    foo /= 2;
+  }
+
+  return foo;
+}
diff --git a/tools/clang/test/CodeGenHLSL/unroll/complex2.hlsl b/tools/clang/test/CodeGenHLSL/unroll/complex2.hlsl
new file mode 100644
index 000000000..8a6a5d696
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/complex2.hlsl
@@ -0,0 +1,51 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+
+// CHECK-NOT: call float @dx.op.dot3
+
+uint gc[4];
+uint g_bound;
+
+float main(float3 a : A, float3 b : B) : SV_Target {
+
+  float foo = 10;
+
+  [unroll]
+  for (uint i = 1; i < 3; i++) {
+    
+    if (i == gc[0]) {
+      foo += dot(a*gc[0], b/gc[0]);
+      continue;
+    }
+    else if (i == gc[1]) {
+      foo += dot(a*gc[1], b/gc[1]);
+      continue;
+    }
+    else if (i == gc[2]) { 
+      foo += dot(a*gc[2], b/gc[2]);
+      if (foo > g_bound)
+        return foo;
+      continue;
+    }
+    else if (i == gc[3]) { 
+      foo += dot(a*gc[3], b/gc[3]);
+      continue;
+    }
+    foo++;
+  }
+
+  if (foo > 300) {
+    foo /= 2;
+  }
+
+  return foo;
+}
diff --git a/tools/clang/test/CodeGenHLSL/unroll/count_cbuff.hlsl b/tools/clang/test/CodeGenHLSL/unroll/count_cbuff.hlsl
new file mode 100644
index 000000000..8bb0b0fc3
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/count_cbuff.hlsl
@@ -0,0 +1,18 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK-NOT: call float @dx.op.dot3
+
+uint g_cond;
+float main(float3 a : A, float3 b : B) : SV_Target {
+
+  float result = 0;
+  [unroll(3)]
+  for (int i = 0; i < g_cond; i++) {
+    result += dot(a*i, b);
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/count_cbuff_br.hlsl b/tools/clang/test/CodeGenHLSL/unroll/count_cbuff_br.hlsl
new file mode 100644
index 000000000..1ed914238
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/count_cbuff_br.hlsl
@@ -0,0 +1,29 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// Identical to count_cbuff.hlsl, except that it checks the number of br's
+
+// entry
+// CHECK: br
+// loop iteration
+// CHECK: call float @dx.op.dot3
+// CHECK: br
+// loop iteration
+// CHECK: call float @dx.op.dot3
+// CHECK: br
+// loop iteration, unconditional
+// CHECK: call float @dx.op.dot3
+// CHECK: br
+// return
+// CHECK-NOT: br
+
+uint g_cond;
+float main(float3 a : A, float3 b : B) : SV_Target {
+
+  float result = 0;
+  [unroll(3)]
+  for (int i = 0; i < g_cond; i++) {
+    result += dot(a*i, b);
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/count_greater_than_i.hlsl b/tools/clang/test/CodeGenHLSL/unroll/count_greater_than_i.hlsl
new file mode 100644
index 000000000..d86da03dc
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/count_greater_than_i.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK-NOT: call float @dx.op.dot3
+
+float main(float3 a : A, float3 b : B) : SV_Target {
+  float result = 0;
+  [unroll(3)]
+  for (int i = 0; i < 2; i++) {
+    result += dot(a*i, b);
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/count_less_than_i.hlsl b/tools/clang/test/CodeGenHLSL/unroll/count_less_than_i.hlsl
new file mode 100644
index 000000000..a2fc63157
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/count_less_than_i.hlsl
@@ -0,0 +1,16 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK-NOT: call float @dx.op.dot3
+
+float main(float3 a : A, float3 b : B) : SV_Target {
+  float result = 0;
+  [unroll(3)]
+  for (int i = 0; i < 10; i++) {
+    result += dot(a*i, b);
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/count_negative.hlsl b/tools/clang/test/CodeGenHLSL/unroll/count_negative.hlsl
new file mode 100644
index 000000000..703751c07
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/count_negative.hlsl
@@ -0,0 +1,17 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: attribute 'unroll' must have a uint literal argument
+// CHECK-NOT: @main
+
+uint g_cond;
+
+float main() : SV_Target {
+  float result = 0;
+  [unroll(-1)]
+  for (int i = 0; i < g_cond; i++) {
+    result += i;
+  }
+
+  return 0;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/count_zero.hlsl b/tools/clang/test/CodeGenHLSL/unroll/count_zero.hlsl
new file mode 100644
index 000000000..45a7d1505
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/count_zero.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK-DAG: Could not unroll loop.
+// CHECK-NOT: @main
+uint g_cond;
+float main() : SV_Target {
+
+  float result = 0;
+  [unroll(0)]
+  for (int i = 0; i < g_cond; i++) {
+    result += i;
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/extern.hlsl b/tools/clang/test/CodeGenHLSL/unroll/extern.hlsl
new file mode 100644
index 000000000..b7048d2cd
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/extern.hlsl
@@ -0,0 +1,24 @@
+// RUN: %dxc -T lib_6_3 %s | FileCheck %s
+
+// A global array with external linkage does not need constant indexing.
+// Check that the block is not included in the unroll and only happens
+// once.
+
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK-NOT: call i32 @dx.op.bufferUpdateCounter
+
+extern AppendStructuredBuffer<float> buffs[4];
+
+export float f(int arg : A) {
+  
+  float result = 0;
+
+  [unroll]
+  for (int i = 0; i < 4; i++) {
+    if (i == arg) {
+      buffs[i].Append(arg);
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/tools/clang/test/CodeGenHLSL/unroll/fail.hlsl b/tools/clang/test/CodeGenHLSL/unroll/fail.hlsl
new file mode 100644
index 000000000..c216941cb
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/fail.hlsl
@@ -0,0 +1,18 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK-DAG: Could not unroll loop.
+// CHECK-NOT: @main
+
+// Check that the compilation fails due to being unable to
+// find the loop bound.
+
+uint g_cond;
+
+float main() : SV_Target {
+  float result = 0;
+  [unroll]
+  for (uint j = 0; j < g_cond; j++) {
+    result += 1;
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/gis.hlsl b/tools/clang/test/CodeGenHLSL/unroll/gis.hlsl
new file mode 100644
index 000000000..dc80cdbe4
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/gis.hlsl
@@ -0,0 +1,17 @@
+// RUN: %dxc -Gis -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK-NOT: call float @dx.op.dot3
+
+float4 main(float3 a : A, float3 b : B) : SV_Target {
+  uint result = 1;
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+    result += dot(a*i, b);
+  }
+  return float4(result, 0,0, 1);
+}
+
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/nested.hlsl b/tools/clang/test/CodeGenHLSL/unroll/nested.hlsl
new file mode 100644
index 000000000..5668133e5
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/nested.hlsl
@@ -0,0 +1,30 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  float ret = 0;
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+    [unroll]
+    for (uint j = 0; j < 4; j++) {
+      ret++;
+      if (g_cond == j) {
+        buffers[j].Append(i);
+        return ret;
+      }
+    }
+    ret--;
+  }
+
+  return ret;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/nested2.hlsl b/tools/clang/test/CodeGenHLSL/unroll/nested2.hlsl
new file mode 100644
index 000000000..a91fbcb32
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/nested2.hlsl
@@ -0,0 +1,34 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  float ret = 0;
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+    [unroll]
+    for (uint j = 0; j < 4; j++) {
+      ret++;
+      [unroll]
+      for (uint k = 0; k < 4; k++) {
+        ret++;
+        if (g_cond == j) {
+          buffers[k].Append(i+j);
+          return ret;
+        }
+      }
+    }
+    ret--;
+  }
+
+  return ret;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/nested3.hlsl b/tools/clang/test/CodeGenHLSL/unroll/nested3.hlsl
new file mode 100644
index 000000000..f1180ce31
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/nested3.hlsl
@@ -0,0 +1,62 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+
+// CHECK-NOT: call i32 @dx.op.bufferUpdateCounter
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+uint g_cond2;
+
+float routine(float value) {
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+  float ret = 0;
+  [unroll]
+  for (uint k = 0; k < 4; k++) {
+    ret += 15;
+    if (g_cond == k) {
+      buffers[k].Append(value);
+      return ret;
+    }
+  }
+  return ret+1;
+}
+
+float main(float3 a : A, float3 b : B) : SV_Target {
+
+  float ret = 0;
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+
+    [loop]
+    for (uint j = 0; j < 4; j++) {
+      ret += routine(j);
+      ret++;
+    }
+
+    ret--;
+  }
+
+  return ret;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/no_attribute.hlsl b/tools/clang/test/CodeGenHLSL/unroll/no_attribute.hlsl
new file mode 100644
index 000000000..36ec92345
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/no_attribute.hlsl
@@ -0,0 +1,27 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK-NOT: @main
+
+// Without the [unroll] attribute, the special unroll
+// routine is not run on the loop, and the resources
+// fail to get mapped.
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  for (uint j = 0; j < 4; j++) {
+    if (g_cond == j) {
+      buffers[j].Append(1);
+      return 10;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/no_opt.hlsl b/tools/clang/test/CodeGenHLSL/unroll/no_opt.hlsl
new file mode 100644
index 000000000..139daf5db
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/no_opt.hlsl
@@ -0,0 +1,24 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  [unroll]
+  for (uint j = 0; j < 4; j++) {
+    if (g_cond == j) {
+      buffers[j].Append(1);
+      return 10;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/oob.hlsl b/tools/clang/test/CodeGenHLSL/unroll/oob.hlsl
new file mode 100644
index 000000000..079196e2a
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/oob.hlsl
@@ -0,0 +1,30 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK-DAG: Could not unroll loop due to out of bound array access.
+// CHECK-DAG: Array access out of bound.
+// CHECK-DAG: Could not unroll loop due to out of bound array access.
+// CHECK-NOT: @main
+
+AppendStructuredBuffer<float> buf0;
+AppendStructuredBuffer<float> buf1;
+AppendStructuredBuffer<float> buf2;
+AppendStructuredBuffer<float> buf3;
+
+uint g_cond;
+
+float main() : SV_Target {
+  AppendStructuredBuffer<float> buffs[4] = {
+    buf0, buf1, buf2, buf3,
+  };
+  
+  float result = 0;
+  [unroll]
+  for (int j = -1; j < 4+1; j++) {
+    if (j == g_cond) {
+      buffs[j].Append(g_cond);
+      break;
+    }
+    result += 1;
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/oob_2016.hlsl b/tools/clang/test/CodeGenHLSL/unroll/oob_2016.hlsl
new file mode 100644
index 000000000..6eeb15662
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/oob_2016.hlsl
@@ -0,0 +1,33 @@
+// RUN: %dxc -Od -E main -T ps_6_0 -HV 2016 %s | FileCheck %s
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK-NOT: call i32 @dx.op.bufferUpdateCounter
+
+AppendStructuredBuffer<float> buf0;
+AppendStructuredBuffer<float> buf1;
+AppendStructuredBuffer<float> buf2;
+AppendStructuredBuffer<float> buf3;
+
+uint g_cond;
+
+float main() : SV_Target {
+  AppendStructuredBuffer<float> buffs[4] = {
+    buf0, buf1, buf2, buf3,
+  };
+  
+  float result = 0;
+  [unroll]
+  for (int j = -1; j < 4+1; j++) {
+    if (j == g_cond) {
+      buffs[j].Append(g_cond);
+      break;
+    }
+    result += 1;
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/partial_cond.hlsl b/tools/clang/test/CodeGenHLSL/unroll/partial_cond.hlsl
new file mode 100644
index 000000000..f49e99f88
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/partial_cond.hlsl
@@ -0,0 +1,20 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+
+// CHECK-NOT: call float @dx.op.dot3
+
+uint g_cond;
+
+float main(float3 a : A, float3 b : B) : SV_Target {
+  float result = 0;
+  [unroll]
+  for (uint j = 0; j < g_cond && j < 4; j++) {
+    result += dot(a*j, b);
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/precise_int.hlsl b/tools/clang/test/CodeGenHLSL/unroll/precise_int.hlsl
new file mode 100644
index 000000000..63b7c389c
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/precise_int.hlsl
@@ -0,0 +1,19 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK-NOT: call float @dx.op.dot3
+
+
+float4 main(float3 a : A, float3 b : B) : SV_Target {
+  precise uint result = 1;
+  [unroll]
+  for (precise uint i = 0; i < 4; i++) {
+    result += dot(a*i, b);
+  }
+  return float4(result, 0,0, 1);
+}
+
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/simple.hlsl b/tools/clang/test/CodeGenHLSL/unroll/simple.hlsl
new file mode 100644
index 000000000..9a745a099
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/simple.hlsl
@@ -0,0 +1,24 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  [unroll]
+  for (uint j = 0; j < 4; j++) {
+    if (g_cond == j) {
+      buffers[j].Append(1);
+      return 10;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/static_dec.hlsl b/tools/clang/test/CodeGenHLSL/unroll/static_dec.hlsl
new file mode 100644
index 000000000..36a04f4a6
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/static_dec.hlsl
@@ -0,0 +1,17 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+static const uint COUNT = 16;
+
+float main() : SV_Target {
+  float result = 10;
+  int count = COUNT;
+  [unroll]
+  for(int i = count-1; i>=0; i--)
+  {
+    result += i;
+  }
+
+  return result;
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/struct_member.hlsl b/tools/clang/test/CodeGenHLSL/unroll/struct_member.hlsl
new file mode 100644
index 000000000..359f5106d
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/struct_member.hlsl
@@ -0,0 +1,40 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK-NOT: call i32 @dx.op.bufferUpdateCounter
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+struct Params {
+  int foo;
+};
+
+float f(Params p) {
+
+  AppendStructuredBuffer<float4> buffers[2][2] = { buf0, buf1, buf2, buf3, };
+
+  [unroll]
+  for (uint j = 0; j < p.foo; j++) {
+    if (g_cond == j) {
+      buffers[j/2][j%2].Append(1);
+      return 10;
+    }
+  }
+
+  return 0;
+}
+
+float main() : SV_Target {
+  Params p;
+  p.foo = 4;
+
+  return f(p);
+}
+
diff --git a/tools/clang/test/CodeGenHLSL/unroll/warning.hlsl b/tools/clang/test/CodeGenHLSL/unroll/warning.hlsl
new file mode 100644
index 000000000..f83cdf1f2
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/unroll/warning.hlsl
@@ -0,0 +1,27 @@
+// RUN: %dxc -HV 2016 -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK-DAG: warning: Could not unroll loop.
+// CHECK-NOT: @main
+
+// Check that the compilation fails due to being unable to
+// find the loop bound.
+
+uint g_cond;
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  float result = 0;
+  [unroll]
+  for (uint j = 0; j < g_cond; j++) {
+    buffers[j].Append(result);
+    result += 1;
+  }
+  return result;
+}
+
diff --git a/tools/clang/test/HLSL/val-failures-ps.hlsl b/tools/clang/test/CodeGenHLSL/val-failures-ps.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/val-failures-ps.hlsl
rename to tools/clang/test/CodeGenHLSL/val-failures-ps.hlsl
diff --git a/tools/clang/test/HLSL/val-failures.hlsl b/tools/clang/test/CodeGenHLSL/val-failures.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/val-failures.hlsl
rename to tools/clang/test/CodeGenHLSL/val-failures.hlsl
diff --git a/tools/clang/test/HLSL/val-wave-failures-ps.hlsl b/tools/clang/test/CodeGenHLSL/val-wave-failures-ps.hlsl
similarity index 100%
rename from tools/clang/test/HLSL/val-wave-failures-ps.hlsl
rename to tools/clang/test/CodeGenHLSL/val-wave-failures-ps.hlsl
diff --git a/tools/clang/test/CodeGenHLSL/vector-assignments_Mod.hlsl b/tools/clang/test/CodeGenHLSL/vector-assignments_Mod.hlsl
index 7c330860c..7aae4cb9d 100644
--- a/tools/clang/test/CodeGenHLSL/vector-assignments_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/vector-assignments_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T ps_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_2_0 vector-assignments.hlsl
+// fxc.exe /T vs_2_0 vector-assignments.hlsl
 
 float pick_one(float2 f2) {
   return f2.x;
diff --git a/tools/clang/test/CodeGenHLSL/vector-syntax_Mod.hlsl b/tools/clang/test/CodeGenHLSL/vector-syntax_Mod.hlsl
index 6df961b90..9078cf97a 100644
--- a/tools/clang/test/CodeGenHLSL/vector-syntax_Mod.hlsl
+++ b/tools/clang/test/CodeGenHLSL/vector-syntax_Mod.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T ps_6_0 %s
 
 // To test with the classic compiler, run
-// %sdxroot%\tools\x86\fxc.exe /T vs_5_1 vector-syntax.hlsl
+// fxc.exe /T vs_5_1 vector-syntax.hlsl
 
 vector v;
 vector<float, 1+1> v1p1;
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.allmemorybarrier.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.allmemorybarrier.hlsl
index 0c3174a3f..5bc21e55e 100644
--- a/tools/clang/test/CodeGenSPIRV/intrinsics.allmemorybarrier.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.allmemorybarrier.hlsl
@@ -1,9 +1,9 @@
 // Run: %dxc -T cs_6_0 -E main
 
 // Memory scope : Device = 0x1 = 1
-// Semantics: ImageMemory | AtomicCounterMemory | UniformMemory | WorkgroupMemory | AcquireRelease = 0x800 | 0x400 | 0x40 | 0x100 | 0x8 = 3400
+// Semantics: ImageMemory | UniformMemory | WorkgroupMemory | AcquireRelease = 0x800 | 0x40 | 0x100 | 0x8 = 2376
 
 void main() {
-// CHECK: OpMemoryBarrier %uint_1 %uint_3400
+// CHECK: OpMemoryBarrier %uint_1 %uint_2376
   AllMemoryBarrier();
 }
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.allmemorybarrierwithgroupsync.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.allmemorybarrierwithgroupsync.hlsl
index aa4af5314..84adb7e34 100644
--- a/tools/clang/test/CodeGenSPIRV/intrinsics.allmemorybarrierwithgroupsync.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.allmemorybarrierwithgroupsync.hlsl
@@ -2,9 +2,9 @@
 
 // Execution scope : Workgroup = 0x2 = 2
 // Memory scope : Device = 0x1 = 1
-// Semantics: ImageMemory | AtomicCounterMemory | UniformMemory | WorkgroupMemory | AcquireRelease = 0x800 | 0x400 | 0x40 | 0x100 | 0x8 = 3400
+// Semantics: ImageMemory | UniformMemory | WorkgroupMemory | AcquireRelease = 0x800 | 0x40 | 0x100 | 0x8 = 2376
 
 void main() {
-// CHECK: OpControlBarrier %uint_2 %uint_1 %uint_3400
+// CHECK: OpControlBarrier %uint_2 %uint_1 %uint_2376
   AllMemoryBarrierWithGroupSync();
 }
diff --git a/tools/clang/test/CodeGenSPIRV/intrinsics.mad.hlsl b/tools/clang/test/CodeGenSPIRV/intrinsics.mad.hlsl
index 568ebc817..84c45fa31 100644
--- a/tools/clang/test/CodeGenSPIRV/intrinsics.mad.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/intrinsics.mad.hlsl
@@ -2,21 +2,38 @@
 
 // CHECK: [[glsl:%\d+]] = OpExtInstImport "GLSL.std.450"
 
+// CHECK: OpDecorate [[fma1:%\d+]] NoContraction
+// CHECK: OpDecorate [[fma2:%\d+]] NoContraction
+// CHECK: OpDecorate [[fma3:%\d+]] NoContraction
+// CHECK: OpDecorate [[fma4:%\d+]] NoContraction
+// CHECK: OpDecorate [[mul1:%\d+]] NoContraction
+// CHECK: OpDecorate [[add1:%\d+]] NoContraction
+// CHECK: OpDecorate [[mul2:%\d+]] NoContraction
+// CHECK: OpDecorate [[add2:%\d+]] NoContraction
+// CHECK: OpDecorate [[mul3:%\d+]] NoContraction
+// CHECK: OpDecorate [[add3:%\d+]] NoContraction
+// CHECK: OpDecorate [[mul4:%\d+]] NoContraction
+// CHECK: OpDecorate [[add4:%\d+]] NoContraction
+
 void main() {
   float    a1, a2, a3, fma_a;
   float4   b1, b2, b3, fma_b;
   float2x3 c1, c2, c3, fma_c;
 
+  int    d1, d2, d3, fma_d;
+  int4   e1, e2, e3, fma_e;
+  int2x3 f1, f2, f3, fma_f;
+
 // CHECK:      [[a1:%\d+]] = OpLoad %float %a1
 // CHECK-NEXT: [[a2:%\d+]] = OpLoad %float %a2
 // CHECK-NEXT: [[a3:%\d+]] = OpLoad %float %a3
-// CHECK-NEXT:    {{%\d+}} = OpExtInst %float [[glsl]] Fma [[a1]] [[a2]] [[a3]]
+// CHECK-NEXT:    [[fma1]] = OpExtInst %float [[glsl]] Fma [[a1]] [[a2]] [[a3]]
   fma_a = mad(a1, a2, a3);
 
 // CHECK:      [[b1:%\d+]] = OpLoad %v4float %b1
 // CHECK-NEXT: [[b2:%\d+]] = OpLoad %v4float %b2
 // CHECK-NEXT: [[b3:%\d+]] = OpLoad %v4float %b3
-// CHECK-NEXT:    {{%\d+}} = OpExtInst %v4float [[glsl]] Fma [[b1]] [[b2]] [[b3]]
+// CHECK-NEXT:    [[fma2]] = OpExtInst %v4float [[glsl]] Fma [[b1]] [[b2]] [[b3]]
   fma_b = mad(b1, b2, b3);
 
 // CHECK:            [[c1:%\d+]] = OpLoad %mat2v3float %c1
@@ -25,11 +42,42 @@ void main() {
 // CHECK-NEXT:  [[c1_row0:%\d+]] = OpCompositeExtract %v3float [[c1]] 0
 // CHECK-NEXT:  [[c2_row0:%\d+]] = OpCompositeExtract %v3float [[c2]] 0
 // CHECK-NEXT:  [[c3_row0:%\d+]] = OpCompositeExtract %v3float [[c3]] 0
-// CHECK-NEXT: [[fma_row0:%\d+]] = OpExtInst %v3float [[glsl]] Fma [[c1_row0]] [[c2_row0]] [[c3_row0]]
+// CHECK-NEXT:          [[fma3]] = OpExtInst %v3float [[glsl]] Fma [[c1_row0]] [[c2_row0]] [[c3_row0]]
 // CHECK-NEXT:  [[c1_row1:%\d+]] = OpCompositeExtract %v3float [[c1]] 1
 // CHECK-NEXT:  [[c2_row1:%\d+]] = OpCompositeExtract %v3float [[c2]] 1
 // CHECK-NEXT:  [[c3_row1:%\d+]] = OpCompositeExtract %v3float [[c3]] 1
-// CHECK-NEXT: [[fma_row1:%\d+]] = OpExtInst %v3float [[glsl]] Fma [[c1_row1]] [[c2_row1]] [[c3_row1]]
-// CHECK-NEXT:          {{%\d+}} = OpCompositeConstruct %mat2v3float [[fma_row0]] [[fma_row1]]
+// CHECK-NEXT:          [[fma4]] = OpExtInst %v3float [[glsl]] Fma [[c1_row1]] [[c2_row1]] [[c3_row1]]
+// CHECK-NEXT:          {{%\d+}} = OpCompositeConstruct %mat2v3float [[fma3]] [[fma4]]
   fma_c = mad(c1, c2, c3);
+
+// CHECK:       [[d1:%\d+]] = OpLoad %int %d1
+// CHECK-NEXT:  [[d2:%\d+]] = OpLoad %int %d2
+// CHECK-NEXT:  [[d3:%\d+]] = OpLoad %int %d3
+// CHECK-NEXT:     [[mul1]] = OpIMul %int [[d1]] [[d2]]
+// CHECK-NEXT:     [[add1]] = OpIAdd %int [[mul1]] [[d3]]
+  fma_d = mad(d1, d2, d3);
+
+// CHECK:       [[e1:%\d+]] = OpLoad %v4int %e1
+// CHECK-NEXT:  [[e2:%\d+]] = OpLoad %v4int %e2
+// CHECK-NEXT:  [[e3:%\d+]] = OpLoad %v4int %e3
+// CHECK-NEXT:     [[mul2]] = OpIMul %v4int [[e1]] [[e2]]
+// CHECK-NEXT:     [[add2]] = OpIAdd %v4int [[mul2]] [[e3]]
+  fma_e = mad(e1, e2, e3);
+
+// CHECK:           [[f1:%\d+]] = OpLoad %_arr_v3int_uint_2 %f1
+// CHECK-NEXT:      [[f2:%\d+]] = OpLoad %_arr_v3int_uint_2 %f2
+// CHECK-NEXT:      [[f3:%\d+]] = OpLoad %_arr_v3int_uint_2 %f3
+// CHECK-NEXT:  [[f1row0:%\d+]] = OpCompositeExtract %v3int [[f1]] 0
+// CHECK-NEXT:  [[f2row0:%\d+]] = OpCompositeExtract %v3int [[f2]] 0
+// CHECK-NEXT:  [[f3row0:%\d+]] = OpCompositeExtract %v3int [[f3]] 0
+// CHECK-NEXT:         [[mul3]] = OpIMul %v3int [[f1row0]] [[f2row0]]
+// CHECK-NEXT:         [[add3]] = OpIAdd %v3int [[mul3]] [[f3row0]]
+// CHECK-NEXT:  [[f1row1:%\d+]] = OpCompositeExtract %v3int [[f1]] 1
+// CHECK-NEXT:  [[f2row1:%\d+]] = OpCompositeExtract %v3int [[f2]] 1
+// CHECK-NEXT:  [[f3row1:%\d+]] = OpCompositeExtract %v3int [[f3]] 1
+// CHECK-NEXT:         [[mul4]] = OpIMul %v3int [[f1row1]] [[f2row1]]
+// CHECK-NEXT:         [[add4]] = OpIAdd %v3int [[mul4]] [[f3row1]]
+// CHECK-NEXT:         {{%\d+}} = OpCompositeConstruct %_arr_v3int_uint_2 [[add3]] [[add4]]
+  fma_f = mad(f1, f2, f3);
 }
+
diff --git a/tools/clang/test/CodeGenSPIRV/preprocess.error.hlsl b/tools/clang/test/CodeGenSPIRV/preprocess.error.hlsl
new file mode 100644
index 000000000..5bb7c2294
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/preprocess.error.hlsl
@@ -0,0 +1,8 @@
+// Run: %dxc -T cs_6_0 -E main -Zi
+// Verifies that including a file that does not exist is reported as a fatal preprocessing error.
+#include "DoesntExist.hlsl"
+// The expected diagnostic below pins the #include above to line 3, column 10; do not add lines before it.
+void main() {}
+
+
+// CHECK: 3:10: fatal error: 'DoesntExist.hlsl' file not found
diff --git a/tools/clang/test/CodeGenSPIRV/spirv.legal.counter.nested-struct.hlsl b/tools/clang/test/CodeGenSPIRV/spirv.legal.counter.nested-struct.hlsl
new file mode 100644
index 000000000..1c6c8233b
--- /dev/null
+++ b/tools/clang/test/CodeGenSPIRV/spirv.legal.counter.nested-struct.hlsl
@@ -0,0 +1,32 @@
+// Run: %dxc -T vs_6_0 -E main
+// Verifies counter-variable aliasing for an RWStructuredBuffer member nested two structs deep (T.s.rw).
+// CHECK: %counter_var_rw = OpVariable %_ptr_Uniform_type_ACSBuffer_counter Uniform
+// CHECK: %counter_var_t_1_0 = OpVariable %_ptr_Private__ptr_Uniform_type_ACSBuffer_counter Private
+// CHECK: %counter_var_s_0 = OpVariable %_ptr_Private__ptr_Uniform_type_ACSBuffer_counter Private
+  
+RWStructuredBuffer<uint> rw : register(u0); 
+
+struct S {
+  RWStructuredBuffer<uint> rw; 
+  uint u; 
+}; 
+
+struct T { 
+  float a; 
+  S s; 
+};
+// foo takes S by value; the expectations in main require the counter alias to be stored for parameter s before the call.
+void foo(S s) { s.rw[0] = 0; }
+
+float4 main() : SV_POSITION { 
+  T t;
+// CHECK: OpStore %counter_var_t_1_0 %counter_var_rw
+  t.s.rw = rw; 
+
+// CHECK: [[var:%\d+]] = OpLoad %_ptr_Uniform_type_ACSBuffer_counter %counter_var_t_1_0
+// CHECK: OpStore %counter_var_s_0 [[var]]
+// CHECK: OpFunctionCall
+  foo(t.s);
+
+  return float4(1, 1, 1, 1);
+} 
diff --git a/tools/clang/test/HLSL/ShaderOpArith.xml b/tools/clang/test/HLSL/ShaderOpArith.xml
index cbd637ae2..a5c2694ec 100644
--- a/tools/clang/test/HLSL/ShaderOpArith.xml
+++ b/tools/clang/test/HLSL/ShaderOpArith.xml
@@ -360,6 +360,45 @@
     </Shader>
   </ShaderOp>
 
+  <ShaderOp Name="Dot2AddOp" CS="CS" DispatchX="8" DispatchY="8">
+    <RootSignature>RootFlags(0), UAV(u0)</RootSignature>
+    <Resource Name="SDot2AddOp" Dimension="BUFFER" Width="1024" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SDot2AddOp" />
+    </RootValues>
+    <Shader Name="CS" Target="cs_6_4">
+      <![CDATA[
+      void main(uint GI : SV_GroupIndex) {};
+      ]]>
+    </Shader>
+  </ShaderOp>
+
+  <ShaderOp Name="Dot4AddI8PackedOp" CS="CS" DispatchX="8" DispatchY="8">
+    <RootSignature>RootFlags(0), UAV(u0)</RootSignature>
+    <Resource Name="SDot4AddI8PackedOp" Dimension="BUFFER" Width="1024" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SDot4AddI8PackedOp" />
+    </RootValues>
+    <Shader Name="CS" Target="cs_6_4">
+      <![CDATA[
+      void main(uint GI : SV_GroupIndex) {};
+      ]]>
+    </Shader>
+  </ShaderOp>
+
+  <ShaderOp Name="Dot4AddU8PackedOp" CS="CS" DispatchX="8" DispatchY="8">
+    <RootSignature>RootFlags(0), UAV(u0)</RootSignature>
+    <Resource Name="SDot4AddU8PackedOp" Dimension="BUFFER" Width="1024" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SDot4AddU8PackedOp" />
+    </RootValues>
+    <Shader Name="CS" Target="cs_6_4">
+      <![CDATA[
+      void main(uint GI : SV_GroupIndex) {};
+      ]]>
+    </Shader>
+  </ShaderOp>
+
   <ShaderOp Name="Msad4" CS="CS" DispatchX="8" DispatchY="8">
     <RootSignature>RootFlags(0), UAV(u0)</RootSignature>
     <Resource Name="SMsad4" Dimension="BUFFER" Width="1024" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
@@ -582,31 +621,267 @@
     </Shader>
   </ShaderOp>
 
-  <ShaderOp Name="ComputeRawBufferLdStI32" CS="CS">
-  </ShaderOp>>
-  <ShaderOp Name="ComputeRawBufferLdStFloat" CS="CS">
-  </ShaderOp>>
-  <ShaderOp Name="ComputeRawBufferLdStI64" CS="CS">
-  </ShaderOp>>
-  <ShaderOp Name="ComputeRawBufferLdStDouble" CS="CS">
-  </ShaderOp>>
-  <ShaderOp Name="ComputeRawBufferLdStI16" CS="CS">
-  </ShaderOp>>
-  <ShaderOp Name="ComputeRawBufferLdStHalf" CS="CS">
+  <ShaderOp Name="ComputeRawBufferLdSt32Bit" CS="CS">
+    <RootSignature>RootFlags(0), SRV(t0), SRV(t1), UAV(u0), UAV(u1), DescriptorTable(SRV(t2,numDescriptors=2), UAV(u2,numDescriptors=2))</RootSignature>
+    <Resource Name="SRVBuffer0" Dimension="BUFFER" Width="40"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer1" Dimension="BUFFER" Width="40"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="SRVBuffer2" Dimension="BUFFER" Width="40"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer3" Dimension="BUFFER" Width="40"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="UAVBuffer0" Dimension="BUFFER" Width="120" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer1" Dimension="BUFFER" Width="120" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" />
+    <Resource Name="UAVBuffer2" Dimension="BUFFER" Width="120" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer3" Dimension="BUFFER" Width="120" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SRVBuffer0" />
+      <RootValue Index="1" ResName="SRVBuffer1" />
+      <RootValue Index="2" ResName="UAVBuffer0" />
+      <RootValue Index="3" ResName="UAVBuffer1" />
+      <RootValue Index="4" HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='SRVBuffer2' Kind='SRV' ResName='SRVBuffer2' Flags='RAW' NumElements="10" Format="R32_TYPELESS" />
+      <Descriptor Name='SRVBuffer3' Kind='SRV' ResName='SRVBuffer3' NumElements="1" StructureByteStride="40" />
+      <Descriptor Name='UAVBuffer2' Kind='UAV' ResName='UAVBuffer2' Flags='RAW' NumElements="30" Format="R32_TYPELESS" />
+      <Descriptor Name='UAVBuffer3' Kind='UAV' ResName='UAVBuffer3' NumElements="1" StructureByteStride="120" />
+    </DescriptorHeap>
+    <Shader Name="CS" Target="cs_6_2">
+      <![CDATA[// Shader source code will be set at runtime]]>
+    </Shader>
   </ShaderOp>>
   
-  <ShaderOp Name="GraphicsRawBufferLdStI32" PS="PS" VS="VS">
+  <ShaderOp Name="ComputeRawBufferLdSt64Bit" CS="CS">
+    <RootSignature>RootFlags(0), SRV(t0), SRV(t1), UAV(u0), UAV(u1), DescriptorTable(SRV(t2,numDescriptors=2), UAV(u2,numDescriptors=2))</RootSignature>
+    <Resource Name="SRVBuffer0" Dimension="BUFFER" Width="80"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer1" Dimension="BUFFER" Width="80"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="SRVBuffer2" Dimension="BUFFER" Width="80"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer3" Dimension="BUFFER" Width="80"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="UAVBuffer0" Dimension="BUFFER" Width="240" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer1" Dimension="BUFFER" Width="240" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" />
+    <Resource Name="UAVBuffer2" Dimension="BUFFER" Width="240" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer3" Dimension="BUFFER" Width="240" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SRVBuffer0" />
+      <RootValue Index="1" ResName="SRVBuffer1" />
+      <RootValue Index="2" ResName="UAVBuffer0" />
+      <RootValue Index="3" ResName="UAVBuffer1" />
+      <RootValue Index="4" HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='SRVBuffer2' Kind='SRV' ResName='SRVBuffer2' Flags='RAW' NumElements="20" Format="R32_TYPELESS" />
+      <Descriptor Name='SRVBuffer3' Kind='SRV' ResName='SRVBuffer3' NumElements="1" StructureByteStride="80" />
+      <Descriptor Name='UAVBuffer2' Kind='UAV' ResName='UAVBuffer2' Flags='RAW' NumElements="60" Format="R32_TYPELESS" />
+      <Descriptor Name='UAVBuffer3' Kind='UAV' ResName='UAVBuffer3' NumElements="1" StructureByteStride="240" />
+    </DescriptorHeap>
+    <Shader Name="CS" Target="cs_6_2">
+      <![CDATA[// Shader source code will be set at runtime]]>
+    </Shader>
+  </ShaderOp>
+
+  <ShaderOp Name="ComputeRawBufferLdSt16Bit" CS="CS">
+    <RootSignature>RootFlags(0), SRV(t0), SRV(t1), UAV(u0), UAV(u1), DescriptorTable(SRV(t2,numDescriptors=2), UAV(u2,numDescriptors=2))</RootSignature>
+    <Resource Name="SRVBuffer0" Dimension="BUFFER" Width="20" InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer1" Dimension="BUFFER" Width="20" InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="SRVBuffer2" Dimension="BUFFER" Width="20" InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer3" Dimension="BUFFER" Width="20" InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="UAVBuffer0" Dimension="BUFFER" Width="60" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer1" Dimension="BUFFER" Width="60" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="UAVBuffer2" Dimension="BUFFER" Width="60" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer3" Dimension="BUFFER" Width="60" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SRVBuffer0" />
+      <RootValue Index="1" ResName="SRVBuffer1" />
+      <RootValue Index="2" ResName="UAVBuffer0" />
+      <RootValue Index="3" ResName="UAVBuffer1" />
+      <RootValue Index="4" HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='SRVBuffer2' Kind='SRV' ResName='SRVBuffer2' Flags='RAW' NumElements="5" Format="R32_TYPELESS" />
+      <Descriptor Name='SRVBuffer3' Kind='SRV' ResName='SRVBuffer3' NumElements="1" StructureByteStride="20" />
+      <Descriptor Name='UAVBuffer2' Kind='UAV' ResName='UAVBuffer2' Flags='RAW' NumElements="15" Format="R32_TYPELESS" />
+      <Descriptor Name='UAVBuffer3' Kind='UAV' ResName='UAVBuffer3' NumElements="1" StructureByteStride="60" />
+    </DescriptorHeap>
+    <Shader Name="CS" Target="cs_6_2">
+      <![CDATA[// Shader source code will be set at runtime]]>
+    </Shader>
+  </ShaderOp>
+
+  <ShaderOp Name="GraphicsRawBufferLdSt32Bit" PS="PS" VS="VS">
+    <RootSignature>RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT), SRV(t0), SRV(t1), UAV(u0), UAV(u1), DescriptorTable(SRV(t2,numDescriptors=2), UAV(u2,numDescriptors=2))</RootSignature>
+    <Resource Name="SRVBuffer0" Dimension="BUFFER" Width="40"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer1" Dimension="BUFFER" Width="40"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="SRVBuffer2" Dimension="BUFFER" Width="40"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer3" Dimension="BUFFER" Width="40"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="UAVBuffer0" Dimension="BUFFER" Width="120" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer1" Dimension="BUFFER" Width="120" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="UAVBuffer2" Dimension="BUFFER" Width="120" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer3" Dimension="BUFFER" Width="120" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="VBuffer" Dimension="BUFFER" InitialResourceState="COPY_DEST" Init="FromBytes" Topology="TRIANGLELIST">
+      { { -1.0f, 1.0f,  0.0f } },
+      { {  1.0f, 1.0f,  0.0f } },
+      { { -1.0f, -1.0f, 0.0f } },
+
+      { { -1.0f, -1.0f, 0.0f } },
+      { {  1.0f,  1.0f, 0.0f } },
+      { {  1.0f, -1.0f, 0.0f } }
+    </Resource>
+    <Resource Name="RTarget" Dimension="TEXTURE2D" Width="16" Height="16" Format="R32G32B32A32_UINT" Flags="ALLOW_RENDER_TARGET" InitialResourceState="COPY_DEST" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SRVBuffer0" />
+      <RootValue Index="1" ResName="SRVBuffer1" />
+      <RootValue Index="2" ResName="UAVBuffer0" />
+      <RootValue Index="3" ResName="UAVBuffer1" />
+      <RootValue Index="4" HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='SRVBuffer2' Kind='SRV' ResName='SRVBuffer2' Flags='RAW' NumElements="10" Format="R32_TYPELESS" />
+      <Descriptor Name='SRVBuffer3' Kind='SRV' ResName='SRVBuffer3' NumElements="1" StructureByteStride="40" />
+      <Descriptor Name='UAVBuffer2' Kind='UAV' ResName='UAVBuffer2' Flags='RAW' NumElements="30" Format="R32_TYPELESS" />
+      <Descriptor Name='UAVBuffer3' Kind='UAV' ResName='UAVBuffer3' NumElements="1" StructureByteStride="120" />
+    </DescriptorHeap>
+    <DescriptorHeap Name="RtvHeap" NumDescriptors="1" Type="RTV">
+      <Descriptor Name="RTarget" Kind="RTV"/>
+    </DescriptorHeap>
+    <InputElements>
+      <InputElement SemanticName="POSITION" Format="R32G32B32_FLOAT" AlignedByteOffset="0" />
+    </InputElements>
+    <RenderTargets>
+      <RenderTarget Name="RTarget"/>
+    </RenderTargets>
+    <Shader Name="VS" Target="vs_6_2">
+      <![CDATA[
+        struct PSInput {
+          float4 pos : SV_POSITION;
+        };
+        PSInput main(float3 pos : POSITION) {
+          PSInput r;
+          r.pos = float4(pos, 1); 
+          return r;
+        }
+      ]]>
+    </Shader>
+    <Shader Name="PS" Target="ps_6_2">
+      <![CDATA[// Shader source code will be set at runtime]]>
+    </Shader>
   </ShaderOp>
-  <ShaderOp Name="GraphicsRawBufferLdStFloat" PS="PS" VS="VS">
+  
+  <ShaderOp Name="GraphicsRawBufferLdSt64Bit" PS="PS" VS="VS">
+    <RootSignature>RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT), SRV(t0), SRV(t1), UAV(u0), UAV(u1), DescriptorTable(SRV(t2,numDescriptors=2), UAV(u2,numDescriptors=2))</RootSignature>
+    <Resource Name="SRVBuffer0" Dimension="BUFFER" Width="80"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer1" Dimension="BUFFER" Width="80"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="SRVBuffer2" Dimension="BUFFER" Width="80"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer3" Dimension="BUFFER" Width="80"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="UAVBuffer0" Dimension="BUFFER" Width="240" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer1" Dimension="BUFFER" Width="240" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="UAVBuffer2" Dimension="BUFFER" Width="240" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer3" Dimension="BUFFER" Width="240" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="VBuffer" Dimension="BUFFER" InitialResourceState="COPY_DEST" Init="FromBytes" Topology="TRIANGLELIST">
+      { { -1.0f, 1.0f,  0.0f } },
+      { {  1.0f, 1.0f,  0.0f } },
+      { { -1.0f, -1.0f, 0.0f } },
+
+      { { -1.0f, -1.0f, 0.0f } },
+      { {  1.0f,  1.0f, 0.0f } },
+      { {  1.0f, -1.0f, 0.0f } }
+    </Resource>
+    <Resource Name="RTarget" Dimension="TEXTURE2D" Width="16" Height="16" Format="R32G32B32A32_UINT" Flags="ALLOW_RENDER_TARGET" InitialResourceState="COPY_DEST" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SRVBuffer0" />
+      <RootValue Index="1" ResName="SRVBuffer1" />
+      <RootValue Index="2" ResName="UAVBuffer0" />
+      <RootValue Index="3" ResName="UAVBuffer1" />
+      <RootValue Index="4" HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='SRVBuffer2' Kind='SRV' ResName='SRVBuffer2' Flags='RAW' NumElements="20" Format="R32_TYPELESS" />
+      <Descriptor Name='SRVBuffer3' Kind='SRV' ResName='SRVBuffer3' NumElements="1" StructureByteStride="80" />
+      <Descriptor Name='UAVBuffer2' Kind='UAV' ResName='UAVBuffer2' Flags='RAW' NumElements="60" Format="R32_TYPELESS" />
+      <Descriptor Name='UAVBuffer3' Kind='UAV' ResName='UAVBuffer3' NumElements="1" StructureByteStride="240" />
+    </DescriptorHeap>
+    <DescriptorHeap Name="RtvHeap" NumDescriptors="1" Type="RTV">
+      <Descriptor Name="RTarget" Kind="RTV"/>
+    </DescriptorHeap>
+    <InputElements>
+      <InputElement SemanticName="POSITION" Format="R32G32B32_FLOAT" AlignedByteOffset="0" />
+    </InputElements>
+    <RenderTargets>
+      <RenderTarget Name="RTarget"/>
+    </RenderTargets>
+    <Shader Name="VS" Target="vs_6_2">
+      <![CDATA[
+        struct PSInput {
+          float4 pos : SV_POSITION;
+        };
+        PSInput main(float3 pos : POSITION) {
+          PSInput r;
+          r.pos = float4(pos, 1); 
+          return r;
+        }
+      ]]>
+    </Shader>
+    <Shader Name="PS" Target="ps_6_2">
+      <![CDATA[// Shader source code will be set at runtime]]>
+    </Shader>
   </ShaderOp>
-  <ShaderOp Name="GraphicsRawBufferLdStI64" PS="PS" VS="VS">
-  </ShaderOp>
-  <ShaderOp Name="GraphicsRawBufferLdStDouble" PS="PS" VS="VS">
-  </ShaderOp>
-  <ShaderOp Name="GraphicsRawBufferLdSt16" PS="PS" VS="VS">
-  </ShaderOp>
-  <ShaderOp Name="GraphicsRawBufferLdStHalf" PS="PS" VS="VS">
+
+  <ShaderOp Name="GraphicsRawBufferLdSt16Bit" PS="PS" VS="VS">
+    <RootSignature>RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT), SRV(t0), SRV(t1), UAV(u0), UAV(u1), DescriptorTable(SRV(t2,numDescriptors=2), UAV(u2,numDescriptors=2))</RootSignature>
+    <Resource Name="SRVBuffer0" Dimension="BUFFER" Width="20"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer1" Dimension="BUFFER" Width="20"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="SRVBuffer2" Dimension="BUFFER" Width="20"  InitialResourceState="COPY_DEST" Init="ByName" Format="R32_TYPELESS"/>
+    <Resource Name="SRVBuffer3" Dimension="BUFFER" Width="20"  InitialResourceState="COPY_DEST" Init="ByName" />
+    <Resource Name="UAVBuffer0" Dimension="BUFFER" Width="60" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer1" Dimension="BUFFER" Width="60" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="UAVBuffer2" Dimension="BUFFER" Width="60" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" Format="R32_TYPELESS" />
+    <Resource Name="UAVBuffer3" Dimension="BUFFER" Width="60" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="VBuffer" Dimension="BUFFER" InitialResourceState="COPY_DEST" Init="FromBytes" Topology="TRIANGLELIST">
+      { { -1.0f, 1.0f,  0.0f } },
+      { {  1.0f, 1.0f,  0.0f } },
+      { { -1.0f, -1.0f, 0.0f } },
+
+      { { -1.0f, -1.0f, 0.0f } },
+      { {  1.0f,  1.0f, 0.0f } },
+      { {  1.0f, -1.0f, 0.0f } }
+    </Resource>
+    <Resource Name="RTarget" Dimension="TEXTURE2D" Width="16" Height="16" Format="R32G32B32A32_UINT" Flags="ALLOW_RENDER_TARGET" InitialResourceState="COPY_DEST" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="SRVBuffer0" />
+      <RootValue Index="1" ResName="SRVBuffer1" />
+      <RootValue Index="2" ResName="UAVBuffer0" />
+      <RootValue Index="3" ResName="UAVBuffer1" />
+      <RootValue Index="4" HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='SRVBuffer2' Kind='SRV' ResName='SRVBuffer2' Flags='RAW' NumElements="5" Format="R32_TYPELESS" />
+      <Descriptor Name='SRVBuffer3' Kind='SRV' ResName='SRVBuffer3' NumElements="1" StructureByteStride="20" />
+      <Descriptor Name='UAVBuffer2' Kind='UAV' ResName='UAVBuffer2' Flags='RAW' NumElements="15" Format="R32_TYPELESS" />
+      <Descriptor Name='UAVBuffer3' Kind='UAV' ResName='UAVBuffer3' NumElements="1" StructureByteStride="60" />
+    </DescriptorHeap>
+    <DescriptorHeap Name="RtvHeap" NumDescriptors="1" Type="RTV">
+      <Descriptor Name="RTarget" Kind="RTV"/>
+    </DescriptorHeap>
+    <InputElements>
+      <InputElement SemanticName="POSITION" Format="R32G32B32_FLOAT" AlignedByteOffset="0" />
+    </InputElements>
+    <RenderTargets>
+      <RenderTarget Name="RTarget"/>
+    </RenderTargets>
+    <Shader Name="VS" Target="vs_6_2">
+      <![CDATA[
+        struct PSInput {
+          float4 pos : SV_POSITION;
+        };
+        PSInput main(float3 pos : POSITION) {
+          PSInput r;
+          r.pos = float4(pos, 1); 
+          return r;
+        }
+      ]]>
+    </Shader>
+    <Shader Name="PS" Target="ps_6_2">
+      <![CDATA[// Shader source code will be set at runtime]]>
+    </Shader>
   </ShaderOp>
+
   <!--
   TODO: Dynamically index into tables
   -->
diff --git a/tools/clang/test/HLSL/binop-dims.hlsl b/tools/clang/test/HLSL/binop-dims.hlsl
index c7f663176..34d3cb2f3 100644
--- a/tools/clang/test/HLSL/binop-dims.hlsl
+++ b/tools/clang/test/HLSL/binop-dims.hlsl
@@ -7,12 +7,6 @@
 // we use -Wno-unused-value because we generate some no-op expressions to yield errors
 // without also putting them in a static assertion
 
-// TODO: Fix LValue casting to match fxc.
-// Currently certain LValue casts will crash CodeGen,
-// so an error has been placed here instead for now.
-// Need to look for the following expected error and fix in the future:
-//    cannot truncate lvalue vector/matrix
-
 /*<py>
 import re
 rxComments = re.compile(r'(//.*|/\*.*?\*\/)')
@@ -135,36 +129,36 @@ float4 main(float4 a : A, float3 c :C) : SV_TARGET {
   f1 += (float1)m1x1;
   f1 = f1 + m2x1;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f1 += m2x1;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f1 = f1 + (float1)m2x1;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f1 += (float1)m2x1;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f1 = f1 + (float1)m2x1;
+  f1 += (float1)m2x1;
   f1 = f1 + m4x1;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f1 += m4x1;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f1 = f1 + (float1)m4x1;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f1 += (float1)m4x1;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f1 = f1 + (float1)m4x1;
+  f1 += (float1)m4x1;
   f1 = f1 + m1x2;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f1 += m1x2;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f1 = f1 + (float1)m1x2;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f1 += (float1)m1x2;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f1 = f1 + (float1)m1x2;
+  f1 += (float1)m1x2;
   f1 = f1 + m2x2;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f1 += m2x2;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f1 = f1 + (float1)m2x2;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f1 += (float1)m2x2;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f1 = f1 + (float1)m2x2;
+  f1 += (float1)m2x2;
   f1 = f1 + m4x2;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f1 += m4x2;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f1 = f1 + (float1)m4x2;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f1 += (float1)m4x2;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f1 = f1 + (float1)m4x2;
+  f1 += (float1)m4x2;
   f1 = f1 + m1x4;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f1 += m1x4;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f1 = f1 + (float1)m1x4;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f1 += (float1)m1x4;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f1 = f1 + (float1)m1x4;
+  f1 += (float1)m1x4;
   f1 = f1 + m2x4;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f1 += m2x4;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f1 = f1 + (float1)m2x4;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f1 += (float1)m2x4;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f1 = f1 + (float1)m2x4;
+  f1 += (float1)m2x4;
   f1 = f1 + m4x4;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f1 += m4x4;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f1 = f1 + (float1)m4x4;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f1 += (float1)m4x4;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f1 = f1 + (float1)m4x4;
+  f1 += (float1)m4x4;
   f2 = f2 + f;
   f2 += f;
   f2 = f2 + (float2)f;
@@ -191,8 +185,8 @@ float4 main(float4 a : A, float3 c :C) : SV_TARGET {
   f2 += (float2)m2x1;
   f2 = f2 + m4x1;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f2 += m4x1;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f2 = f2 + (float2)m4x1;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f2 += (float2)m4x1;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f2 = f2 + (float2)m4x1;
+  f2 += (float2)m4x1;
   f2 = f2 + m1x2;
   f2 += m1x2;
   f2 = f2 + (float2)m1x2;
@@ -207,8 +201,8 @@ float4 main(float4 a : A, float3 c :C) : SV_TARGET {
   f2 += (float2)m4x2;                                       /* expected-error {{cannot convert from 'float4x2' to 'float2'}} fxc-error {{X3017: cannot convert from 'float4x2' to 'float2'}} */
   f2 = f2 + m1x4;                                           /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   f2 += m1x4;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  f2 = f2 + (float2)m1x4;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  f2 += (float2)m1x4;                                       /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  f2 = f2 + (float2)m1x4;
+  f2 += (float2)m1x4;
   f2 = f2 + m2x4;                                           /* expected-error {{cannot convert from 'float2x4' to 'float2'}} fxc-error {{X3020: type mismatch}} */
   f2 += m2x4;                                               /* expected-error {{cannot convert from 'float2x4' to 'float2'}} fxc-error {{X3020: type mismatch}} */
   f2 = f2 + (float2)m2x4;                                   /* expected-error {{cannot convert from 'float2x4' to 'float2'}} fxc-error {{X3017: cannot convert from 'float2x4' to 'float2'}} */
@@ -291,36 +285,36 @@ float4 main(float4 a : A, float3 c :C) : SV_TARGET {
   m1x1 += (float1x1)m1x1;
   m1x1 = m1x1 + m2x1;                                       /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x1 += m2x1;                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x1 = m1x1 + (float1x1)m2x1;                             /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x1 += (float1x1)m2x1;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x1 = m1x1 + (float1x1)m2x1;
+  m1x1 += (float1x1)m2x1;
   m1x1 = m1x1 + m4x1;                                       /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x1 += m4x1;                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x1 = m1x1 + (float1x1)m4x1;                             /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x1 += (float1x1)m4x1;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x1 = m1x1 + (float1x1)m4x1;
+  m1x1 += (float1x1)m4x1;
   m1x1 = m1x1 + m1x2;                                       /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x1 += m1x2;                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x1 = m1x1 + (float1x1)m1x2;                             /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x1 += (float1x1)m1x2;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x1 = m1x1 + (float1x1)m1x2;
+  m1x1 += (float1x1)m1x2;
   m1x1 = m1x1 + m2x2;                                       /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x1 += m2x2;                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x1 = m1x1 + (float1x1)m2x2;                             /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x1 += (float1x1)m2x2;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x1 = m1x1 + (float1x1)m2x2;
+  m1x1 += (float1x1)m2x2;
   m1x1 = m1x1 + m4x2;                                       /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x1 += m4x2;                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x1 = m1x1 + (float1x1)m4x2;                             /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x1 += (float1x1)m4x2;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x1 = m1x1 + (float1x1)m4x2;
+  m1x1 += (float1x1)m4x2;
   m1x1 = m1x1 + m1x4;                                       /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x1 += m1x4;                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x1 = m1x1 + (float1x1)m1x4;                             /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x1 += (float1x1)m1x4;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x1 = m1x1 + (float1x1)m1x4;
+  m1x1 += (float1x1)m1x4;
   m1x1 = m1x1 + m2x4;                                       /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x1 += m2x4;                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x1 = m1x1 + (float1x1)m2x4;                             /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x1 += (float1x1)m2x4;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x1 = m1x1 + (float1x1)m2x4;
+  m1x1 += (float1x1)m2x4;
   m1x1 = m1x1 + m4x4;                                       /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x1 += m4x4;                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x1 = m1x1 + (float1x1)m4x4;                             /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x1 += (float1x1)m4x4;                                   /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x1 = m1x1 + (float1x1)m4x4;
+  m1x1 += (float1x1)m4x4;
   m2x1 = m2x1 + f;
   m2x1 += f;
   m2x1 = m2x1 + (float2x1)f;
@@ -335,8 +329,8 @@ float4 main(float4 a : A, float3 c :C) : SV_TARGET {
   m2x1 += (float2x1)f2;
   m2x1 = m2x1 + f4;                                         /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m2x1 += f4;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m2x1 = m2x1 + (float2x1)f4;                               /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m2x1 += (float2x1)f4;                                     /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m2x1 = m2x1 + (float2x1)f4;
+  m2x1 += (float2x1)f4;
   m2x1 = m2x1 + m1x1;
   m2x1 += m1x1;
   m2x1 = m2x1 + (float2x1)m1x1;
@@ -439,8 +433,8 @@ float4 main(float4 a : A, float3 c :C) : SV_TARGET {
   m1x2 += (float1x2)f2;
   m1x2 = m1x2 + f4;                                         /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
   m1x2 += f4;                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: implicit truncation of vector type}} */
-  m1x2 = m1x2 + (float1x2)f4;                               /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
-  m1x2 += (float1x2)f4;                                     /* expected-error {{cannot truncate lvalue vector/matrix}} fxc-pass {{}} */
+  m1x2 = m1x2 + (float1x2)f4;
+  m1x2 += (float1x2)f4;
   m1x2 = m1x2 + m1x1;
   m1x2 += m1x1;
   m1x2 = m1x2 + (float1x2)m1x1;
diff --git a/tools/clang/test/HLSL/conversions-between-type-shapes.hlsl b/tools/clang/test/HLSL/conversions-between-type-shapes.hlsl
new file mode 100644
index 000000000..fbea31cf3
--- /dev/null
+++ b/tools/clang/test/HLSL/conversions-between-type-shapes.hlsl
@@ -0,0 +1,315 @@
+// RUN: %clang_cc1 -Wno-unused-value -fsyntax-only -ffreestanding -verify -verify-ignore-unexpected=note %s
+
+// Tests all implicit conversions and explicit casts between type shapes
+// (scalars, vectors, matrices, arrays and structs).
+
+// Explicit casts are assumed to be "stronger" than implicit conversions, i.e. to succeed wherever the implicit conversion does. Therefore:
+// > If an implicit conversion succeeds, the explicit cast is not tested.
+// > If an explicit cast fails, the implicit conversion is not tested.
+
+typedef int A1[1];  // 1-element int array
+typedef int A2[2];  // 2-element int array
+typedef int A4[4];  // 4-element int array
+typedef int A5[5];  // 5-element int array
+struct S1 { int a; };  // struct with 1 int field
+struct S2 { int a, b; };  // struct with 2 int fields
+struct S4 { int a, b, c, d; };  // struct with 4 int fields
+struct S5 { int a, b, c, d, e; };  // struct with 5 int fields
+
+// Empty "sink" functions, one per destination type: main() passes values to to_X() to exercise the implicit conversion to that parameter type. Clang generates a bunch of "notes" about overload candidates for the failing calls, but we're not testing for these (the RUN line passes -verify-ignore-unexpected=note).
+void to_i(int i) {}
+void to_v1(int1 v) {}
+void to_v2(int2 v) {}
+void to_v4(int4 v) {}
+void to_m1x1(int1x1 m) {}
+void to_m1x2(int1x2 m) {}
+void to_m2x1(int2x1 m) {}
+void to_m2x2(int2x2 m) {}
+void to_m3x3(int3x3 m) {}
+void to_a1(A1 a) {}
+void to_a2(A2 a) {}
+void to_a4(A4 a) {}
+void to_a5(A5 a) {}
+void to_s1(S1 s) {}
+void to_s2(S2 s) {}
+void to_s4(S4 s) {}
+void to_s5(S5 s) {}
+
+void main()
+{
+    int i = 0;
+    int1 v1 = 0;
+    int2 v2 = 0;
+    int4 v4 = 0;
+    int1x1 m1x1 = 0;
+    int1x2 m1x2 = 0;
+    int2x1 m2x1 = 0;
+    int2x2 m2x2 = 0;
+    int1x3 m1x3 = 0;
+    int2x3 m2x3 = 0;
+    int3x1 m3x1 = 0;
+    int3x2 m3x2 = 0;
+    int3x3 m3x3 = 0;
+    A1 a1 = { 0 };
+    A2 a2 = { 0, 0 };
+    A4 a4 = { 0, 0, 0, 0 };
+    A5 a5 = { 0, 0, 0, 0, 0 };
+    S1 s1 = { 0 };
+    S2 s2 = { 0, 0 };
+    S4 s4 = { 0, 0, 0, 0 };
+    S5 s5 = { 0, 0, 0, 0, 0 };
+
+    // =========== Scalar/single-element ===========
+    to_i(v1);
+    to_i(m1x1);
+    to_i(a1);                                               /* expected-error {{no matching function for call to 'to_i'}} fxc-error {{X3017: 'to_i': cannot convert from 'typedef int[1]' to 'int'}} */
+    (int)a1;
+    to_i(s1);                                               /* expected-error {{no matching function for call to 'to_i'}} fxc-error {{X3017: 'to_i': cannot convert from 'struct S1' to 'int'}} */
+    (int)s1;
+
+    to_v1(i);
+    to_v1(m1x1);
+    to_v1(a1);                                              /* expected-error {{no matching function for call to 'to_v1'}} fxc-error {{X3017: 'to_v1': cannot convert from 'typedef int[1]' to 'int1'}} */
+    (int1)a1;
+    to_v1(s1);                                              /* expected-error {{no matching function for call to 'to_v1'}} fxc-error {{X3017: 'to_v1': cannot convert from 'struct S1' to 'int1'}} */
+    (int1)s1;
+
+    to_m1x1(i);
+    to_m1x1(v1);
+    to_m1x1(a1);                                            /* expected-error {{no matching function for call to 'to_m1x1'}} fxc-error {{X3017: 'to_m1x1': cannot convert from 'typedef int[1]' to 'int1'}} */
+    (int1x1)a1;
+    to_m1x1(s1);                                            /* expected-error {{no matching function for call to 'to_m1x1'}} fxc-error {{X3017: 'to_m1x1': cannot convert from 'struct S1' to 'int1'}} */
+    (int1x1)s1;
+
+    to_a1(i);                                               /* expected-error {{no matching function for call to 'to_a1'}} fxc-error {{X3017: 'to_a1': cannot convert from 'int' to 'typedef int[1]'}} */
+    (A1)i;
+    to_a1(v1);                                              /* expected-error {{no matching function for call to 'to_a1'}} fxc-error {{X3017: 'to_a1': cannot convert from 'int1' to 'typedef int[1]'}} */
+    (A1)v1;
+    to_a1(m1x1);                                            /* expected-error {{no matching function for call to 'to_a1'}} fxc-error {{X3017: 'to_a1': cannot convert from 'int1' to 'typedef int[1]'}} */
+    (A1)m1x1;
+    to_a1(s1);                                              /* expected-error {{no matching function for call to 'to_a1'}} fxc-pass {{}} */
+    (A1)s1;
+
+    to_s1(i);                                               /* expected-error {{no matching function for call to 'to_s1'}} fxc-error {{X3017: 'to_s1': cannot convert from 'int' to 'struct S1'}} */
+    (S1)i;
+    to_s1(v1);                                              /* expected-error {{no matching function for call to 'to_s1'}} fxc-error {{X3017: 'to_s1': cannot convert from 'int1' to 'struct S1'}} */
+    (S1)v1;
+    to_s1(m1x1);                                            /* expected-error {{no matching function for call to 'to_s1'}} fxc-error {{X3017: 'to_s1': cannot convert from 'int1' to 'struct S1'}} */
+    (S1)m1x1;
+    to_s1(a1);                                              /* expected-error {{no matching function for call to 'to_s1'}} fxc-pass {{}} */
+    (S1)a1;
+
+    // =========== Truncation to scalar/single-element ===========
+    // Single element sources already tested
+    to_i(v2);                                               /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_i': implicit truncation of vector type}} */
+    to_i(m2x2);                                             /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_i': implicit truncation of vector type}} */
+    to_i(a2);                                               /* expected-error {{no matching function for call to 'to_i'}} fxc-error {{X3017: 'to_i': cannot convert from 'typedef int[2]' to 'int'}} */
+    (int)a2;
+    to_i(s2);                                               /* expected-error {{no matching function for call to 'to_i'}} fxc-error {{X3017: 'to_i': cannot convert from 'struct S2' to 'int'}} */
+    (int)s2;
+
+    to_v1(v2);                                              /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_v1': implicit truncation of vector type}} */
+    to_v1(m2x2);                                            /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_v1': implicit truncation of vector type}} */
+    to_v1(a2);                                              /* expected-error {{no matching function for call to 'to_v1'}} fxc-error {{X3017: 'to_v1': cannot convert from 'typedef int[2]' to 'int1'}} */
+    (int1)a2;
+    to_v1(s2);                                              /* expected-error {{no matching function for call to 'to_v1'}} fxc-error {{X3017: 'to_v1': cannot convert from 'struct S2' to 'int1'}} */
+    (int1)s2;
+
+    to_m1x1(v2);                                            /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m1x1': implicit truncation of vector type}} */
+    to_m1x1(m2x2);                                          /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m1x1': implicit truncation of vector type}} */
+    to_m1x1(a2);                                            /* expected-error {{no matching function for call to 'to_m1x1'}} fxc-error {{X3017: 'to_m1x1': cannot convert from 'typedef int[2]' to 'int1'}} */
+    (int1x1)a2;
+    to_m1x1(s2);                                            /* expected-error {{no matching function for call to 'to_m1x1'}} fxc-error {{X3017: 'to_m1x1': cannot convert from 'struct S2' to 'int1'}} */
+    (int1x1)s2;
+
+    to_a1(v2);                                              /* expected-error {{no matching function for call to 'to_a1'}} fxc-error {{X3017: 'to_a1': cannot convert from 'int2' to 'typedef int[1]'}} */
+    to_a1(m2x2);                                            /* expected-error {{no matching function for call to 'to_a1'}} fxc-error {{X3017: 'to_a1': cannot implicitly convert from 'int2x2' to 'typedef int[1]'}} */
+    to_a1(a2);                                              /* expected-error {{no matching function for call to 'to_a1'}} fxc-error {{X3017: 'to_a1': cannot convert from 'typedef int[2]' to 'typedef int[1]'}} */
+    (A1)a2;
+    to_a1(s2);                                              /* expected-error {{no matching function for call to 'to_a1'}} fxc-error {{X3017: 'to_a1': cannot convert from 'struct S2' to 'typedef int[1]'}} */
+    (A1)s2;
+
+    to_s1(v2);                                              /* expected-error {{no matching function for call to 'to_s1'}} fxc-error {{X3017: 'to_s1': cannot convert from 'int2' to 'struct S1'}} */
+    to_s1(m2x2);                                            /* expected-error {{no matching function for call to 'to_s1'}} fxc-error {{X3017: 'to_s1': cannot implicitly convert from 'int2x2' to 'struct S1'}} */
+    to_s1(a2);                                              /* expected-error {{no matching function for call to 'to_s1'}} fxc-error {{X3017: 'to_s1': cannot convert from 'typedef int[2]' to 'struct S1'}} */
+    (S1)a2;
+    to_s1(s2);                                              /* expected-error {{no matching function for call to 'to_s1'}} fxc-error {{X3017: 'to_s1': cannot convert from 'struct S2' to 'struct S1'}} */
+    (S1)s2;
+
+    // =========== Splatting ===========
+    // Single element dests already tested
+    to_v2(i);
+    to_v2(v1);
+    to_v2(m1x1);
+    (int2)a1;                                               /* expected-error {{cannot convert from 'A1' (aka 'int [1]') to 'int2'}} fxc-error {{X3017: cannot convert from 'typedef int[1]' to 'int2'}} */
+    (int2)s1;                                               /* expected-error {{cannot convert from 'S1' to 'int2'}} fxc-error {{X3017: cannot convert from 'struct S1' to 'int2'}} */
+
+    to_m2x2(i);
+    to_m2x2(v1);
+    to_m2x2(m1x1);
+    (int2x2)a1;                                             /* expected-error {{cannot convert from 'A1' (aka 'int [1]') to 'int2x2'}} fxc-error {{X3017: cannot convert from 'typedef int[1]' to 'int2x2'}} */
+    (int2x2)s1;                                             /* expected-error {{cannot convert from 'S1' to 'int2x2'}} fxc-error {{X3017: cannot convert from 'struct S1' to 'int2x2'}} */
+
+    to_a2(i);                                               /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int' to 'typedef int[2]'}} */
+    (A2)i;
+    to_a2(v1);                                              /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int1' to 'typedef int[2]'}} */
+    (A2)v1;                                                 /* expected-error {{cannot convert from 'int1' to 'A2' (aka 'int [2]')}} fxc-pass {{}} */
+    to_a2(m1x1);                                            /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int1' to 'typedef int[2]'}} */
+    (A2)m1x1;                                               /* expected-error {{cannot convert from 'int1x1' to 'A2' (aka 'int [2]')}} fxc-pass {{}} */
+    (A2)a1;                                                 /* expected-error {{cannot convert from 'A1' (aka 'int [1]') to 'A2' (aka 'int [2]')}} fxc-error {{X3017: cannot convert from 'typedef int[1]' to 'typedef int[2]'}} */
+    (A2)s1;                                                 /* expected-error {{cannot convert from 'S1' to 'A2' (aka 'int [2]')}} fxc-error {{X3017: cannot convert from 'struct S1' to 'typedef int[2]'}} */
+
+    to_s2(i);                                               /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int' to 'struct S2'}} */
+    (S2)i;
+    to_s2(v1);                                              /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int1' to 'struct S2'}} */
+    (S2)v1;                                                 /* expected-error {{cannot convert from 'int1' to 'S2'}} fxc-pass {{}} */
+    to_s2(m1x1);                                            /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int1' to 'struct S2'}} */
+    (S2)m1x1;                                               /* expected-error {{cannot convert from 'int1x1' to 'S2'}} fxc-pass {{}} */
+    (S2)a1;                                                 /* expected-error {{cannot convert from 'A1' (aka 'int [1]') to 'S2'}} fxc-error {{X3017: cannot convert from 'typedef int[1]' to 'struct S2'}} */
+    (S2)s1;                                                 /* expected-error {{cannot convert from 'S1' to 'S2'}} fxc-error {{X3017: cannot convert from 'struct S1' to 'struct S2'}} */
+
+    // =========== Element-preserving ===========
+    // Single element sources/dests already tested
+    to_v2(m1x2);
+    to_v2(m2x1);
+    to_v4(m2x2);
+    to_v2(a2);                                              /* expected-error {{no matching function for call to 'to_v2'}} fxc-error {{X3017: 'to_v2': cannot convert from 'typedef int[2]' to 'int2'}} */
+    (int2)a2;
+    to_v2(s2);                                              /* expected-error {{no matching function for call to 'to_v2'}} fxc-error {{X3017: 'to_v2': cannot convert from 'struct S2' to 'int2'}} */
+    (int2)s2;
+
+    to_m1x2(v2);
+    to_m2x1(v2);
+    to_m2x2(v4);
+    (int1x2)m2x1;                                           /* expected-error {{cannot convert from 'int2x1' to 'int1x2'}} fxc-error {{X3017: cannot convert from 'int2x1' to 'int2'}} */
+    (int2x1)m1x2;                                           /* expected-error {{cannot convert from 'int1x2' to 'int2x1'}} fxc-error {{X3017: cannot convert from 'int2' to 'int2x1'}} */
+    to_m1x2(a2);                                            /* expected-error {{no matching function for call to 'to_m1x2'}} fxc-error {{X3017: 'to_m1x2': cannot convert from 'typedef int[2]' to 'int2'}} */
+    (int1x2)a2;
+    to_m2x1(a2);                                            /* expected-error {{no matching function for call to 'to_m2x1'}} fxc-error {{X3017: 'to_m2x1': cannot convert from 'typedef int[2]' to 'int2x1'}} */
+    (int2x1)a2;
+    to_m2x2(a4);                                            /* expected-error {{no matching function for call to 'to_m2x2'}} fxc-error {{X3017: 'to_m2x2': cannot convert from 'typedef int[4]' to 'int2x2'}} */
+    (int2x2)a4;
+    to_m1x2(s2);                                            /* expected-error {{no matching function for call to 'to_m1x2'}} fxc-error {{X3017: 'to_m1x2': cannot convert from 'struct S2' to 'int2'}} */
+    (int1x2)s2;
+    to_m2x1(s2);                                            /* expected-error {{no matching function for call to 'to_m2x1'}} fxc-error {{X3017: 'to_m2x1': cannot convert from 'struct S2' to 'int2x1'}} */
+    (int2x1)s2;
+    to_m2x2(s4);                                            /* expected-error {{no matching function for call to 'to_m2x2'}} fxc-error {{X3017: 'to_m2x2': cannot convert from 'struct S4' to 'int2x2'}} */
+    (int2x2)s4;
+
+    to_a2(v2);                                              /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int2' to 'typedef int[2]'}} */
+    (A2)v2;
+    to_a2(m1x2);                                            /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int2' to 'typedef int[2]'}} */
+    (A2)m1x2;
+    to_a2(m2x1);                                            /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int2x1' to 'typedef int[2]'}} */
+    (A2)m2x1;
+    to_a4(m2x2);                                            /* expected-error {{no matching function for call to 'to_a4'}} fxc-error {{X3017: 'to_a4': cannot convert from 'int2x2' to 'typedef int[4]'}} */
+    (A4)m2x2;
+    to_a2(s2);                                              /* expected-error {{no matching function for call to 'to_a2'}} fxc-pass {{}} */
+    (A2)s2;
+
+    to_s2(v2);                                              /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int2' to 'struct S2'}} */
+    (S2)v2;
+    to_s2(m1x2);                                            /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int2' to 'struct S2'}} */
+    (S2)m1x2;
+    to_s2(m2x1);                                            /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int2x1' to 'struct S2'}} */
+    (S2)m2x1;
+    to_s4(m2x2);                                            /* expected-error {{no matching function for call to 'to_s4'}} fxc-error {{X3017: 'to_s4': cannot convert from 'int2x2' to 'struct S4'}} */
+    (S4)m2x2;
+    to_s2(a2);                                              /* expected-error {{no matching function for call to 'to_s2'}} fxc-pass {{}} */
+    (S2)a2;
+
+    // =========== Truncating ===========
+    // Single element dests already tested
+    to_v2(v4);                                              /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_v2': implicit truncation of vector type}} */
+    to_v2(m1x3);                                            /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_v2': implicit truncation of vector type}} */
+    to_v2(m3x1);                                            /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_v2': implicit truncation of vector type}} */
+    (int2)m2x2;                                             /* expected-error {{cannot convert from 'int2x2' to 'int2'}} fxc-error {{X3017: cannot convert from 'int2x2' to 'int2'}} */
+    (int2)m3x3;                                             /* expected-error {{cannot convert from 'int3x3' to 'int2'}} fxc-error {{X3017: cannot convert from 'int3x3' to 'int2'}} */
+    to_v2(a4);                                              /* expected-error {{no matching function for call to 'to_v2'}} fxc-error {{X3017: 'to_v2': cannot convert from 'typedef int[4]' to 'int2'}} */
+    (int2)a4;
+    to_v2(s4);                                              /* expected-error {{no matching function for call to 'to_v2'}} fxc-error {{X3017: 'to_v2': cannot convert from 'struct S4' to 'int2'}} */
+    (int2)s4;
+
+    to_m1x2(v4);                                            /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m1x2': implicit truncation of vector type}} */
+    to_m2x1(v4);                                            /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m2x1': implicit truncation of vector type}} */
+    to_m1x2(m1x3);                                          /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m1x2': implicit truncation of vector type}} */
+    (int1x2)m3x1;                                           /* expected-error {{cannot convert from 'int3x1' to 'int1x2'}} fxc-error {{X3017: cannot convert from 'int3x1' to 'int2'}} */
+    to_m1x2(m2x2);                                          /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m1x2': implicit truncation of vector type}} */
+    to_m2x1(m3x1);                                          /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m2x1': implicit truncation of vector type}} */
+    (int2x1)m1x3;                                           /* expected-error {{cannot convert from 'int1x3' to 'int2x1'}} fxc-error {{X3017: cannot convert from 'int3' to 'int2x1'}} */
+    to_m2x1(m2x2);                                          /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m2x1': implicit truncation of vector type}} */
+    to_m2x2(m2x3);                                          /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m2x2': implicit truncation of vector type}} */
+    to_m2x2(m3x2);                                          /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m2x2': implicit truncation of vector type}} */
+    to_m2x2(m3x3);                                          /* expected-warning {{implicit truncation of vector type}} fxc-warning {{X3206: 'to_m2x2': implicit truncation of vector type}} */
+    to_m1x2(a4);                                            /* expected-error {{no matching function for call to 'to_m1x2'}} fxc-error {{X3017: 'to_m1x2': cannot convert from 'typedef int[4]' to 'int2'}} */
+    (int1x2)a4;
+    to_m2x1(a4);                                            /* expected-error {{no matching function for call to 'to_m2x1'}} fxc-error {{X3017: 'to_m2x1': cannot convert from 'typedef int[4]' to 'int2x1'}} */
+    (int2x1)a4;
+    to_m2x2(a5);                                            /* expected-error {{no matching function for call to 'to_m2x2'}} fxc-error {{X3017: 'to_m2x2': cannot implicitly convert from 'typedef int[5]' to 'int2x2'}} */
+    (int2x2)a5;                                             /* fxc-error {{X3017: cannot convert from 'typedef int[5]' to 'int2x2'}} */
+    to_m1x2(s4);                                            /* expected-error {{no matching function for call to 'to_m1x2'}} fxc-error {{X3017: 'to_m1x2': cannot convert from 'struct S4' to 'int2'}} */
+    (int1x2)s4;
+    to_m2x1(s4);                                            /* expected-error {{no matching function for call to 'to_m2x1'}} fxc-error {{X3017: 'to_m2x1': cannot convert from 'struct S4' to 'int2x1'}} */
+    (int2x1)s4;
+    to_m2x2(s5);                                            /* expected-error {{no matching function for call to 'to_m2x2'}} fxc-error {{X3017: 'to_m2x2': cannot implicitly convert from 'struct S5' to 'int2x2'}} */
+    (int2x2)s5;                                             /* fxc-error {{X3017: cannot convert from 'struct S5' to 'int2x2'}} */
+
+    to_a2(v4);                                              /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int4' to 'typedef int[2]'}} */
+    (A2)v4;
+    to_a2(m1x3);                                            /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int3' to 'typedef int[2]'}} */
+    (A2)m1x3;
+    to_a2(m3x1);                                            /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'int3x1' to 'typedef int[2]'}} */
+    (A2)m3x1;
+    to_a2(m2x2);                                            /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot implicitly convert from 'int2x2' to 'typedef int[2]'}} */
+    (A2)m2x2;                                               /* fxc-error {{X3017: cannot convert from 'int2x2' to 'typedef int[2]'}} */
+    to_a2(m3x3);                                            /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot implicitly convert from 'int3x3' to 'typedef int[2]'}} */
+    (A2)m3x3;                                               /* fxc-error {{X3017: cannot convert from 'int3x3' to 'typedef int[2]'}} */
+    to_a2(a4);                                              /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'typedef int[4]' to 'typedef int[2]'}} */
+    (A2)a4;
+    to_a2(s4);                                              /* expected-error {{no matching function for call to 'to_a2'}} fxc-error {{X3017: 'to_a2': cannot convert from 'struct S4' to 'typedef int[2]'}} */
+    (A2)s4;
+
+    to_s2(v4);                                              /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int4' to 'struct S2'}} */
+    (S2)v4;
+    to_s2(m1x3);                                            /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int3' to 'struct S2'}} */
+    (S2)m1x3;
+    to_s2(m3x1);                                            /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'int3x1' to 'struct S2'}} */
+    (S2)m3x1;
+    to_s2(m2x2);                                            /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot implicitly convert from 'int2x2' to 'struct S2'}} */
+    (S2)m2x2;                                               /* fxc-error {{X3017: cannot convert from 'int2x2' to 'struct S2'}} */
+    to_s2(m3x3);                                            /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot implicitly convert from 'int3x3' to 'struct S2'}} */
+    (S2)m3x3;                                               /* fxc-error {{X3017: cannot convert from 'int3x3' to 'struct S2'}} */
+    to_s2(a4);                                              /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'typedef int[4]' to 'struct S2'}} */
+    (S2)a4;
+    to_s2(s4);                                              /* expected-error {{no matching function for call to 'to_s2'}} fxc-error {{X3017: 'to_s2': cannot convert from 'struct S4' to 'struct S2'}} */
+    (S2)s4;
+
+    // =========== Extending ===========
+    // Single element sources already tested (splatting)
+    (int4)v2;                                               /* expected-error {{cannot convert from 'int2' to 'int4'}} fxc-error {{X3017: cannot convert from 'int2' to 'int4'}} */
+    (int4)m1x2;                                             /* expected-error {{cannot convert from 'int1x2' to 'int4'}} fxc-error {{X3017: cannot convert from 'int2' to 'int4'}} */
+    (int4)m2x1;                                             /* expected-error {{cannot convert from 'int2x1' to 'int4'}} fxc-error {{X3017: cannot convert from 'int2x1' to 'int4'}} */
+    (int4)a2;                                               /* expected-error {{cannot convert from 'A2' (aka 'int [2]') to 'int4'}} fxc-error {{X3017: cannot convert from 'typedef int[2]' to 'int4'}} */
+    (int4)s2;                                               /* expected-error {{cannot convert from 'S2' to 'int4'}} fxc-error {{X3017: cannot convert from 'struct S2' to 'int4'}} */
+
+    (int2x2)v2;                                             /* expected-error {{cannot convert from 'int2' to 'int2x2'}} fxc-error {{X3017: cannot convert from 'int2' to 'int2x2'}} */
+    (int2x2)m1x2;                                           /* expected-error {{cannot convert from 'int1x2' to 'int2x2'}} fxc-error {{X3017: cannot convert from 'int2' to 'int2x2'}} */
+    (int2x2)m2x1;                                           /* expected-error {{cannot convert from 'int2x1' to 'int2x2'}} fxc-error {{X3017: cannot convert from 'int2x1' to 'int2x2'}} */
+    (int3x3)m2x2;                                           /* expected-error {{cannot convert from 'int2x2' to 'int3x3'}} fxc-error {{X3017: cannot convert from 'int2x2' to 'int3x3'}} */
+    (int2x2)a2;                                             /* expected-error {{cannot convert from 'A2' (aka 'int [2]') to 'int2x2'}} fxc-error {{X3017: cannot convert from 'typedef int[2]' to 'int2x2'}} */
+    (int2x2)s2;                                             /* expected-error {{cannot convert from 'S2' to 'int2x2'}} fxc-error {{X3017: cannot convert from 'struct S2' to 'int2x2'}} */
+
+    (A4)v2;                                                 /* expected-error {{cannot convert from 'int2' to 'A4' (aka 'int [4]')}} fxc-error {{X3017: cannot convert from 'int2' to 'typedef int[4]'}} */
+    (A4)m1x2;                                               /* expected-error {{cannot convert from 'int1x2' to 'A4' (aka 'int [4]')}} fxc-error {{X3017: cannot convert from 'int2' to 'typedef int[4]'}} */
+    (A4)m2x1;                                               /* expected-error {{cannot convert from 'int2x1' to 'A4' (aka 'int [4]')}} fxc-error {{X3017: cannot convert from 'int2x1' to 'typedef int[4]'}} */
+    (A5)m2x2;                                               /* expected-error {{cannot convert from 'int2x2' to 'A5' (aka 'int [5]')}} fxc-error {{X3017: cannot convert from 'int2x2' to 'typedef int[5]'}} */
+    (A4)a2;                                                 /* expected-error {{cannot convert from 'A2' (aka 'int [2]') to 'A4' (aka 'int [4]')}} fxc-error {{X3017: cannot convert from 'typedef int[2]' to 'typedef int[4]'}} */
+    (A4)s2;                                                 /* expected-error {{cannot convert from 'S2' to 'A4' (aka 'int [4]')}} fxc-error {{X3017: cannot convert from 'struct S2' to 'typedef int[4]'}} */
+
+    (S4)v2;                                                 /* expected-error {{cannot convert from 'int2' to 'S4'}} fxc-error {{X3017: cannot convert from 'int2' to 'struct S4'}} */
+    (S4)m1x2;                                               /* expected-error {{cannot convert from 'int1x2' to 'S4'}} fxc-error {{X3017: cannot convert from 'int2' to 'struct S4'}} */
+    (S4)m2x1;                                               /* expected-error {{cannot convert from 'int2x1' to 'S4'}} fxc-error {{X3017: cannot convert from 'int2x1' to 'struct S4'}} */
+    (S5)m2x2;                                               /* expected-error {{cannot convert from 'int2x2' to 'S5'}} fxc-error {{X3017: cannot convert from 'int2x2' to 'struct S5'}} */
+    (S4)a2;                                                 /* expected-error {{cannot convert from 'A2' (aka 'int [2]') to 'S4'}} fxc-error {{X3017: cannot convert from 'typedef int[2]' to 'struct S4'}} */
+    (S4)s2;                                                 /* expected-error {{cannot convert from 'S2' to 'S4'}} fxc-error {{X3017: cannot convert from 'struct S2' to 'struct S4'}} */
+}
\ No newline at end of file
diff --git a/tools/clang/test/HLSL/intrinsic-examples.hlsl b/tools/clang/test/HLSL/intrinsic-examples.hlsl
index a781b69a1..5e98a482d 100644
--- a/tools/clang/test/HLSL/intrinsic-examples.hlsl
+++ b/tools/clang/test/HLSL/intrinsic-examples.hlsl
@@ -40,18 +40,18 @@ float4 RWByteAddressBufferMain(uint2 a : A, uint2 b : B) : SV_Target
   r += uav1.Load<float32_t1>(20, status);
 
   // errors
-  r += uav1.Load<float, float3>(16);                        /* expected-error {{Explicit template arguments on intrinsic Load requires HLSL version 2018 or above.}} */
-  r += uav1.Load<double3>(16);                              /* expected-error {{Explicit template arguments on intrinsic Load requires HLSL version 2018 or above.}} expected-error {{cannot convert from 'vector<double, 3>' to 'float4'}} */
+  r += uav1.Load<float, float3>(16);                        /* expected-error {{Explicit template arguments on intrinsic Load are limited one to scalar or vector type.}} */
+  r += uav1.Load<double3>(16);                              /* expected-error {{cannot convert from 'double3' to 'float4'}} */
   r += uav1.Load2<float>(16);                               /* expected-error {{Explicit template arguments on intrinsic Load2 are not supported.}} */
   r += uav1.Load3<int>(20);                                 /* expected-error {{Explicit template arguments on intrinsic Load3 are not supported.}} */
   r += uav1.Load4<int16_t>(24);                             /* expected-error {{Explicit template arguments on intrinsic Load4 are not supported.}} */
-  r += uav1.Load<half3x4>(24);                              /* expected-error {{Explicit template arguments on intrinsic Load requires HLSL version 2018 or above.}} expected-error {{cannot convert from 'matrix<half, 3, 4>' to 'float4'}} */
-  r += uav1.Load<float, float3>(16, status);                /* expected-error {{Explicit template arguments on intrinsic Load requires HLSL version 2018 or above.}} */
-  r += uav1.Load<double3>(16, status);                      /* expected-error {{Explicit template arguments on intrinsic Load requires HLSL version 2018 or above.}} expected-error {{cannot convert from 'vector<double, 3>' to 'float4'}} */
+  r += uav1.Load<half3x4>(24);                              /* expected-error {{Explicit template arguments on intrinsic Load are limited one to scalar or vector type.}} expected-error {{cannot convert from 'matrix<half, 3, 4>' to 'float4'}} */
+  r += uav1.Load<float, float3>(16, status);                /* expected-error {{Explicit template arguments on intrinsic Load are limited one to scalar or vector type.}} */
+  r += uav1.Load<double3>(16, status);                      /* expected-error {{cannot convert from 'double3' to 'float4'}} */
   r += uav1.Load2<float>(16, status);                       /* expected-error {{Explicit template arguments on intrinsic Load2 are not supported.}} */
   r += uav1.Load3<int>(20, status);                         /* expected-error {{Explicit template arguments on intrinsic Load3 are not supported.}} */
   r += uav1.Load4<int16_t>(24, status);                     /* expected-error {{Explicit template arguments on intrinsic Load4 are not supported.}} */
-  r += uav1.Load<half3x4>(24, status);                      /* expected-error {{Explicit template arguments on intrinsic Load requires HLSL version 2018 or above.}} expected-error {{cannot convert from 'matrix<half, 3, 4>' to 'float4'}} */
+  r += uav1.Load<half3x4>(24, status);                      /* expected-error {{Explicit template arguments on intrinsic Load are limited one to scalar or vector type.}} expected-error {{cannot convert from 'matrix<half, 3, 4>' to 'float4'}} */
   // valid template argument
   uav1.Store(0, r);
   uav1.Store(0, r.x);
@@ -69,8 +69,8 @@ float4 RWByteAddressBufferMain(uint2 a : A, uint2 b : B) : SV_Target
   uav1.Store4<float>(0, r);                                 /* expected-error {{Explicit template arguments on intrinsic Store4 are not supported.}} */
   uav1.Store(0, float2x4(1,2,3,4,5,6,7,8));                 /* expected-error {{no matching member function for call to 'Store'}} */
   uav1.Store<float3x2>(0, float3x2(1,2,3,4,5,6));           /* expected-error {{no matching member function for call to 'Store'}} */
-  uav1.Store(0, (double3)r.xyz);                            /* expected-error {{no matching member function for call to 'Store'}} expected-error {{no matching member function for call to Store}} expected-note@? {{candidate template ignored: couldn't infer template argument 'TResult'}}*/
-  uav1.Store(0, (uint64_t4)r);                              /* expected-error {{no matching member function for call to 'Store'}} expected-error {{no matching member function for call to Store}} expected-note@? {{candidate template ignored: couldn't infer template argument 'TResult'}}*/
+  uav1.Store(0, (double3)r.xyz);                            
+  uav1.Store(0, (uint64_t4)r);                              
   MyStruct myStruct;
   uav1.Store(0, myStruct);                                  /* expected-error {{no matching member function for call to 'Store'}} */
   return r;
diff --git a/tools/clang/test/HLSL/rewriter/correct_rewrites/matrix-pack-orientation_gold.hlsl b/tools/clang/test/HLSL/rewriter/correct_rewrites/matrix-pack-orientation_gold.hlsl
new file mode 100644
index 000000000..085cdb248
--- /dev/null
+++ b/tools/clang/test/HLSL/rewriter/correct_rewrites/matrix-pack-orientation_gold.hlsl
@@ -0,0 +1,10 @@
+// Rewrite unchanged result:
+void default_noPragma(int1x1 m);
+void rowMajorAttribute_noPragma(row_major int1x1 m);
+void columnMajorAttribute_noPragma(column_major int1x1 m);
+void default_pragmaRowMajor(row_major int1x1 m);
+void rowMajorAttribute_pragmaRowMajor(row_major int1x1 m);
+void columnMajorAttribute_pragmaRowMajor(column_major int1x1 m);
+void default_pragmaColumnMajor(column_major int1x1 m);
+void rowMajorAttribute_pragmaColumnMajor(row_major int1x1 m);
+void columnMajorAttribute_pragmaColumnMajor(column_major int1x1 m);
diff --git a/tools/clang/test/HLSL/rewriter/matrix-pack-orientation.hlsl b/tools/clang/test/HLSL/rewriter/matrix-pack-orientation.hlsl
new file mode 100644
index 000000000..23c70bb37
--- /dev/null
+++ b/tools/clang/test/HLSL/rewriter/matrix-pack-orientation.hlsl
@@ -0,0 +1,15 @@
+// Test that the semantics of matrix pack orientations are preserved through rewriting.
+
+void default_noPragma(int1x1 m); // The lack of pack orientation annotation should be preserved
+void rowMajorAttribute_noPragma(row_major int1x1 m);
+void columnMajorAttribute_noPragma(column_major int1x1 m);
+
+#pragma pack_matrix(row_major)
+void default_pragmaRowMajor(int1x1 m); // This should get a row_major attribute added
+void rowMajorAttribute_pragmaRowMajor(row_major int1x1 m);
+void columnMajorAttribute_pragmaRowMajor(column_major int1x1 m);
+
+#pragma pack_matrix(column_major)
+void default_pragmaColumnMajor(int1x1 m); // This should get a column_major attribute added
+void rowMajorAttribute_pragmaColumnMajor(row_major int1x1 m);
+void columnMajorAttribute_pragmaColumnMajor(column_major int1x1 m);
\ No newline at end of file
diff --git a/tools/clang/tools/dotnetc/EditorForm.cs b/tools/clang/tools/dotnetc/EditorForm.cs
index f28835dfc..f1cf05a5f 100644
--- a/tools/clang/tools/dotnetc/EditorForm.cs
+++ b/tools/clang/tools/dotnetc/EditorForm.cs
@@ -310,6 +310,7 @@ namespace MainNs
                 " float4 position : SV_POSITION;\r\n" +
                 " float4 color : COLOR;\r\n" +
                 "};\r\n" +
+                "[RootSignature(\"RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT)\")]\r\n" +
                 "PSInput VSMain(float4 position: POSITION, float4 color: COLOR) {\r\n" +
                 " float aspect = 320.0 / 200.0;\r\n" +
                 " PSInput result;\r\n" +
@@ -318,6 +319,7 @@ namespace MainNs
                 " result.color = color;\r\n" +
                 " return result;\r\n" +
                 "}\r\n" +
+                "[RootSignature(\"RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT)\")]\r\n" +
                "float4 PSMain(PSInput input) : SV_TARGET {\r\n" +
                " return input.color;\r\n" +
                "}\r\n";
@@ -1556,7 +1558,7 @@ namespace MainNs
             args.Add("/Odump");
             IDxcCompiler compiler = HlslDxcLib.CreateDxcCompiler();
             IDxcOperationResult optDumpResult =
-                compiler.Compile(CreateBlobForText("float4 main() : SV_Target { return 0; }"), "hlsl.hlsl", "main", "ps_6_0", args.ToArray(), args.Count, null, 0, null);
+                compiler.Compile(CreateBlobForText("[RootSignature(\"\")]float4 main() : SV_Target { return 0; }"), "hlsl.hlsl", "main", "ps_6_0", args.ToArray(), args.Count, null, 0, null);
             IDxcBlob optDumpBlob = optDumpResult.GetResult();
             string optDumpText = GetStringFromBlob(optDumpBlob);
             this.AddSelectedPassesFromText(optDumpText, true);
diff --git a/tools/clang/tools/dotnetc/OptEditorForm.cs b/tools/clang/tools/dotnetc/OptEditorForm.cs
index 255b65556..75222a0e8 100644
--- a/tools/clang/tools/dotnetc/OptEditorForm.cs
+++ b/tools/clang/tools/dotnetc/OptEditorForm.cs
@@ -70,22 +70,10 @@ namespace MainNs
 
         private void ApplyChangesButton_Click(object sender, EventArgs e)
         {
-            // Turn the text into a container.
-            IDxcBlobEncoding sourceBlob = EditorForm.CreateBlobForText(this.Library, this.CodeBox.Text);
-            EditorForm.AssembleResult assembleResult = EditorForm.RunAssembly(this.Library, sourceBlob);
-            if (assembleResult.Blob == null)
-            {
-                MessageBox.Show("Failed to assemble: " + assembleResult.ResultText);
-                return;
-            }
-
-            // Extract the bitcode portion.
-            const uint DxilKind = 0x4c495844; // 'LIXD' - DXIL
-            uint index;
-            IDxcContainerReflection reflection = HlslDxcLib.CreateDxcContainerReflection();
-            reflection.Load(assembleResult.Blob);
-            reflection.FindFirstPartKind(DxilKind, out index);
-            IDxcBlob bitcodeBlob = reflection.GetPartContent(index);
+            // Turn the text into the expected encoding.
+            IDxcBlobEncoding sourceBlob = EditorForm.CreateBlobForText(this.Library, this.CodeBox.Text);
+            sourceBlob = this.Library.GetBlobAstUf8(sourceBlob);
+            IDxcBlob bitcodeBlob = sourceBlob;
 
             List<string> passes = new List<string>();
             passes.Add("hlsl-passes-resume");
@@ -231,7 +219,7 @@ namespace MainNs
                 if (!ClosestMatch(text, ref next, separators, out separator))
                     next = -1;
                 string sectionText = (next < 0) ? text.Substring(lineEnd + 1) : text.Substring(lineEnd + 1, next - (lineEnd + 1));
-                sectionText = sectionText.Trim();
+                sectionText = sectionText.Trim() + "\n";
                 bool hasChange = sectionText != prior;
                 yield return new TextSection { HasChange = hasChange, Title = title, Text = hasChange ? sectionText : prior };
                 idx = next;
diff --git a/tools/clang/tools/dxcompiler/CMakeLists.txt b/tools/clang/tools/dxcompiler/CMakeLists.txt
index f03d3b199..5887191c5 100644
--- a/tools/clang/tools/dxcompiler/CMakeLists.txt
+++ b/tools/clang/tools/dxcompiler/CMakeLists.txt
@@ -1,10 +1,6 @@
 ﻿# Copyright (C) Microsoft Corporation. All rights reserved.
 # This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
 
-if (WIN32)
-  find_package(DiaSDK REQUIRED) # Used for constants and declarations.
-endif (WIN32)
-
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
   analysis
@@ -48,7 +44,6 @@ if (WIN32)
 set(SOURCES
   dxcapi.cpp
   dxcassembler.cpp
-  dxcdia.cpp
   dxclibrary.cpp
   dxcompilerobj.cpp
   dxcvalidator.cpp
@@ -101,6 +96,12 @@ set(LIBRARIES
   libclang
   )
 
+if(WIN32)
+  set(LIBRARIES
+    ${LIBRARIES} 
+    LLVMDxilDia)
+endif(WIN32)
+
 set(GENERATED_HEADERS
   ClangAttrClasses
   ClangAttrList
@@ -114,20 +115,14 @@ set(GENERATED_HEADERS
 
 add_clang_library(dxcompiler SHARED ${SOURCES})
 if (WIN32)
-  target_link_libraries(dxcompiler PRIVATE ${LIBRARIES} ${DIASDK_LIBRARIES})
-  if (ENABLE_SPIRV_CODEGEN)
-    target_link_libraries(dxcompiler PRIVATE clangSPIRV)
-  endif (ENABLE_SPIRV_CODEGEN)
+  # No DxcEtw on non-Windows platforms.
   add_dependencies(dxcompiler DxcEtw)
-  include_directories(AFTER ${LLVM_INCLUDE_DIR}/dxc/Tracing ${DIASDK_INCLUDE_DIRS})
-else ()
-  # No DIASDK or DxcEtw on non-Windows platforms.
-  target_link_libraries(dxcompiler PRIVATE ${LIBRARIES})
-  if (ENABLE_SPIRV_CODEGEN)
-    target_link_libraries(dxcompiler PRIVATE clangSPIRV)
-  endif (ENABLE_SPIRV_CODEGEN)
-  include_directories(AFTER ${LLVM_INCLUDE_DIR}/dxc/Tracing)
-endif (WIN32)
+endif()
+target_link_libraries(dxcompiler PRIVATE ${LIBRARIES})
+if (ENABLE_SPIRV_CODEGEN)
+  target_link_libraries(dxcompiler PRIVATE clangSPIRV)
+endif (ENABLE_SPIRV_CODEGEN)
+include_directories(AFTER ${LLVM_INCLUDE_DIR}/dxc/Tracing)
 
 set_target_properties(dxcompiler
   PROPERTIES
diff --git a/tools/clang/tools/dxcompiler/dxcassembler.cpp b/tools/clang/tools/dxcompiler/dxcassembler.cpp
index 5fbc346a9..529da02cf 100644
--- a/tools/clang/tools/dxcompiler/dxcassembler.cpp
+++ b/tools/clang/tools/dxcompiler/dxcassembler.cpp
@@ -130,8 +130,11 @@ HRESULT STDMETHODCALLTYPE DxcAssembler::AssembleToContainer(
     outStream.flush();
 
     CComPtr<IDxcBlob> pResultBlob;
+    static constexpr hlsl::SerializeDxilFlags flags = static_cast<hlsl::SerializeDxilFlags>(
+        static_cast<uint32_t>(SerializeDxilFlags::IncludeDebugNamePart) |
+        static_cast<uint32_t>(SerializeDxilFlags::IncludeDebugInfoPart));
     dxcutil::AssembleToContainer(std::move(M), pResultBlob,
-                                         TM.p, SerializeDxilFlags::IncludeDebugNamePart,
+                                         TM.p, flags,
                                          pOutputStream);
 
     IFT(DxcOperationResult::CreateFromResultErrorStatus(pResultBlob, nullptr, S_OK, ppResult));
diff --git a/tools/clang/tools/dxcompiler/dxcdia.cpp b/tools/clang/tools/dxcompiler/dxcdia.cpp
deleted file mode 100644
index b006e106c..000000000
--- a/tools/clang/tools/dxcompiler/dxcdia.cpp
+++ /dev/null
@@ -1,2309 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//                                                                           //
-// dxcdia.cpp                                                                //
-// Copyright (C) Microsoft Corporation. All rights reserved.                 //
-// This file is distributed under the University of Illinois Open Source     //
-// License. See LICENSE.TXT for details.                                     //
-//                                                                           //
-// Implements the diagnostic APIs for a DirectX Compiler program.            //
-//                                                                           //
-///////////////////////////////////////////////////////////////////////////////
-
-#include "clang/AST/ASTConsumer.h"
-#include "clang/AST/RecursiveASTVisitor.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "clang/Sema/SemaHLSL.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/DiagnosticPrinter.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-
-#include "dxc/Support/WinIncludes.h"
-#include "dxc/DxilContainer/DxilContainer.h"
-#include "dxc/DXIL/DxilShaderModel.h"
-#include "dxc/DXIL/DxilMetadataHelper.h"
-#include "dxc/DXIL/DxilModule.h"
-#include "dxc/DXIL/DxilUtil.h"
-#include "dxc/Support/Global.h"
-#ifdef _WIN32
-#include "dia2.h"
-#endif
-
-#include "dxc/dxcapi.internal.h"
-
-#include "dxc/Support/Global.h"
-#include "dxc/Support/Unicode.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MSFileSystem.h"
-#include "dxc/Support/microcom.h"
-#include "dxc/Support/FileIOHelper.h"
-#include "dxc/Support/dxcapi.impl.h"
-#include <algorithm>
-#include <array>
-#ifdef _WIN32
-#include <comdef.h>
-#endif
-#include "dxcutil.h"
-
-using namespace llvm;
-using namespace clang;
-using namespace hlsl;
-
-///////////////////////////////////////////////////////////////////////////////
-// Forward declarations.
-class DxcDiaDataSource;
-class DxcDiaEnumSegments;
-class DxcDiaEnumTables;
-class DxcDiaSegment;
-class DxcDiaSession;
-class DxcDiaSymbol;
-class DxcDiaTable;
-
-///////////////////////////////////////////////////////////////////////////////
-// Constants and helper structures.
-enum class DiaTableKind {
-  Symbols,
-  SourceFiles,
-  LineNumbers,
-  Sections,
-  SegmentMap,
-  InjectedSource,
-  FrameData,
-  InputAssemblyFile
-};
-static const DiaTableKind FirstTableKind = DiaTableKind::Symbols;
-static const DiaTableKind LastTableKind = DiaTableKind::InputAssemblyFile;
-
-const LPCWSTR TableNames[] = {
-  L"Symbols",
-  L"SourceFiles",
-  L"LineNumbers",
-  L"Sections",
-  L"SegmentMap",
-  L"InjectedSource",
-  L"FrameData",
-  L"InputAssemblyFiles"
-};
-
-// Single program, single compiland allows for some simplifications.
-static const DWORD HlslProgramId = 1;
-static const DWORD HlslCompilandId = 2;
-static const DWORD HlslCompilandDetailsId = 3;
-static const DWORD HlslCompilandEnvFlagsId = 4;
-static const DWORD HlslCompilandEnvTargetId = 5;
-static const DWORD HlslCompilandEnvEntryId = 6;
-static const DWORD HlslCompilandEnvDefinesId = 7;
-static const DWORD HlslCompilandEnvArgumentsId = 8;
-
-///////////////////////////////////////////////////////////////////////////////
-// Memory helpers.
-static
-std::unique_ptr<MemoryBuffer> getMemBufferFromBlob(_In_ IDxcBlob *pBlob,
-                                                   const Twine &BufferName) {
-  StringRef Data((LPSTR)pBlob->GetBufferPointer(), pBlob->GetBufferSize());
-  return MemoryBuffer::getMemBufferCopy(Data, BufferName);
-}
-
-static
-std::unique_ptr<MemoryBuffer> getMemBufferFromStream(_In_ IStream *pStream,
-                                                     const Twine &BufferName) {
-  CComPtr<IDxcBlob> pBlob;
-  if (SUCCEEDED(pStream->QueryInterface(&pBlob))) {
-    return getMemBufferFromBlob(pBlob, BufferName);
-  }
-
-  STATSTG statstg;
-  IFT(pStream->Stat(&statstg, STATFLAG_NONAME));
-  size_t size = statstg.cbSize.LowPart;
-  std::unique_ptr<llvm::MemoryBuffer> result(
-    llvm::MemoryBuffer::getNewUninitMemBuffer(size, BufferName));
-  char *pBuffer = (char *)result.get()->getBufferStart();
-  ULONG read;
-  IFT(pStream->Read(pBuffer, size, &read));
-  return result;
-}
-
-static HRESULT StringRefToBSTR(llvm::StringRef value, BSTR *pRetVal) {
-  try {
-    wchar_t *wide;
-    size_t sideSize;
-    if (!Unicode::UTF8BufferToUTF16Buffer(value.data(), value.size(), &wide,
-                                          &sideSize))
-      return E_FAIL;
-    *pRetVal = SysAllocString(wide);
-    delete[] wide;
-  }
-  CATCH_CPP_RETURN_HRESULT();
-  return S_OK;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// DirectX compiler API.
-
-static HRESULT CreateDxcDiaEnumTables(DxcDiaSession *, IDiaEnumTables **);
-static HRESULT CreateDxcDiaTable(DxcDiaSession *, DiaTableKind kind, IDiaTable **ppTable);
-static HRESULT DxcDiaFindLineNumbersByRVA(DxcDiaSession *, DWORD rva, DWORD length, IDiaEnumLineNumbers **);
-
-class DxcDiaSession : public IDiaSession {
-private:
-  DXC_MICROCOM_TM_REF_FIELDS()
-  std::shared_ptr<llvm::LLVMContext> m_context;
-  std::shared_ptr<llvm::Module> m_module;
-  std::shared_ptr<llvm::DebugInfoFinder> m_finder;
-  std::unique_ptr<DxilModule> m_dxilModule;
-  llvm::NamedMDNode *m_contents;
-  llvm::NamedMDNode *m_defines;
-  llvm::NamedMDNode *m_mainFileName;
-  llvm::NamedMDNode *m_arguments;
-  std::vector<const Instruction *> m_instructions;
-  std::vector<const Instruction *> m_instructionLines; // Instructions with line info.
-  typedef unsigned RVA;
-  std::unordered_map<const Instruction *, RVA> m_rvaMap; // Map instruction to its RVA.
-public:
-  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
-  DXC_MICROCOM_TM_CTOR(DxcDiaSession)
-
-  IMalloc *GetMallocNoRef() { return m_pMalloc.p; }
-
-  void Init(std::shared_ptr<llvm::LLVMContext> context,
-      std::shared_ptr<llvm::Module> module,
-      std::shared_ptr<llvm::DebugInfoFinder> finder) {
-    m_pEnumTables = nullptr;
-    m_module = module;
-    m_context = context;
-    m_finder = finder;
-    m_dxilModule = std::make_unique<DxilModule>(module.get());
-  
-    // Extract HLSL metadata.
-    m_dxilModule->LoadDxilMetadata();
-
-    // Get file contents.
-    m_contents =
-        m_module->getNamedMetadata(DxilMDHelper::kDxilSourceContentsMDName);
-    if (!m_contents)
-      m_contents = m_module->getNamedMetadata("llvm.dbg.contents");
-
-    m_defines =
-        m_module->getNamedMetadata(DxilMDHelper::kDxilSourceDefinesMDName);
-    if (!m_defines)
-      m_defines = m_module->getNamedMetadata("llvm.dbg.defines");
-
-    m_mainFileName =
-        m_module->getNamedMetadata(DxilMDHelper::kDxilSourceMainFileNameMDName);
-    if (!m_mainFileName)
-      m_mainFileName = m_module->getNamedMetadata("llvm.dbg.mainFileName");
-
-    m_arguments =
-        m_module->getNamedMetadata(DxilMDHelper::kDxilSourceArgsMDName);
-    if (!m_arguments)
-      m_arguments = m_module->getNamedMetadata("llvm.dbg.args");
-
-    // Build up a linear list of instructions. The index will be used as the
-    // RVA. Debug instructions are ommitted from this enumeration.
-    for (const Function &fn : m_module->functions()) {
-      for (const_inst_iterator it = inst_begin(fn), end = inst_end(fn); it != end; ++it) {
-        const Instruction &i = *it;
-        if (const CallInst *call = dyn_cast<const CallInst>(&i)) {
-          const Function *pFn = call->getCalledFunction();
-          if (pFn && pFn->getName().startswith("llvm.dbg.")) {
-            continue;
-          }
-        }
-
-        m_rvaMap.insert({ &i, static_cast<RVA>(m_instructions.size()) });
-        m_instructions.push_back(&i);
-        if (i.getDebugLoc()) {
-          m_instructionLines.push_back(&i);
-        }
-      }
-    }
-
-    // Sanity check to make sure rva map is same as instruction index.
-    for (size_t i = 0, e = m_instructions.size(); i < e; ++i) {
-      DXASSERT(m_rvaMap.find(m_instructions[i]) != m_rvaMap.end(), "instruction not mapped to rva");
-      DXASSERT(m_rvaMap[m_instructions[i]] == i, "instruction mapped to wrong rva");
-    }
-  }
-  llvm::NamedMDNode *Contents() { return m_contents; }
-  llvm::NamedMDNode *Defines() { return m_defines; }
-  llvm::NamedMDNode *MainFileName() { return m_mainFileName; }
-  llvm::NamedMDNode *Arguments() { return m_arguments; }
-  hlsl::DxilModule &DxilModuleRef() { return *m_dxilModule.get(); }
-  llvm::Module &ModuleRef() { return *m_module.get(); }
-  llvm::DebugInfoFinder &InfoRef() { return *m_finder.get(); }
-  std::vector<const Instruction *> &InstructionsRef() { return m_instructions; }
-  std::vector<const Instruction *> &InstructionLinesRef() { return m_instructionLines; }
-  std::unordered_map<const Instruction *, RVA> &RvaMapRef() { return m_rvaMap; }
-
-  HRESULT getSourceFileIdByName(StringRef fileName, DWORD *pRetVal) {
-    if (Contents() != nullptr) {
-      for (unsigned i = 0; i < Contents()->getNumOperands(); ++i) {
-        StringRef fn =
-            dyn_cast<MDString>(Contents()->getOperand(i)->getOperand(0))
-                ->getString();
-        if (fn.equals(fileName)) {
-          *pRetVal = i;
-          return S_OK;
-        }
-      }
-    }
-    *pRetVal = 0;
-    return S_FALSE;
-  }
-
-  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
-    return DoBasicQueryInterface<IDiaSession>(this, iid, ppvObject);
-  }
-
-  STDMETHODIMP get_loadAddress(
-    /* [retval][out] */ ULONGLONG *pRetVal) override { 
-    *pRetVal = 0;
-    return S_OK;
-  }
-
-  STDMETHODIMP put_loadAddress(
-    /* [in] */ ULONGLONG NewVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_globalScope(
-    /* [retval][out] */ IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP getEnumTables(
-    _COM_Outptr_ IDiaEnumTables **ppEnumTables) override {
-    if (!m_pEnumTables) {
-      DxcThreadMalloc TM(m_pMalloc);
-      IFR(CreateDxcDiaEnumTables(this, &m_pEnumTables));
-    }
-    m_pEnumTables.p->AddRef();
-    *ppEnumTables = m_pEnumTables;
-    return S_OK;
-  }
-
-  // Symbol and file lookup is not implemented by this session. Every method
-  // in this group ignores its inputs, leaves its outputs untouched and
-  // returns E_NOTIMPL.
-
-  STDMETHODIMP getSymbolsByAddr(
-      IDiaEnumSymbolsByAddr **ppEnumbyAddr) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findChildren(
-      IDiaSymbol *parent, enum SymTagEnum symtag, LPCOLESTR name,
-      DWORD compareFlags, IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findChildrenEx(
-      IDiaSymbol *parent, enum SymTagEnum symtag, LPCOLESTR name,
-      DWORD compareFlags, IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findChildrenExByAddr(
-      IDiaSymbol *parent, enum SymTagEnum symtag, LPCOLESTR name,
-      DWORD compareFlags, DWORD isect, DWORD offset,
-      IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findChildrenExByVA(
-      IDiaSymbol *parent, enum SymTagEnum symtag, LPCOLESTR name,
-      DWORD compareFlags, ULONGLONG va,
-      IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findChildrenExByRVA(
-      IDiaSymbol *parent, enum SymTagEnum symtag, LPCOLESTR name,
-      DWORD compareFlags, DWORD rva,
-      IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findSymbolByAddr(
-      DWORD isect, DWORD offset, enum SymTagEnum symtag,
-      IDiaSymbol **ppSymbol) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findSymbolByRVA(
-      DWORD rva, enum SymTagEnum symtag, IDiaSymbol **ppSymbol) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findSymbolByVA(
-      ULONGLONG va, enum SymTagEnum symtag, IDiaSymbol **ppSymbol) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findSymbolByToken(
-      ULONG token, enum SymTagEnum symtag, IDiaSymbol **ppSymbol) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP symsAreEquiv(
-      IDiaSymbol *symbolA, IDiaSymbol *symbolB) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP symbolById(DWORD id, IDiaSymbol **ppSymbol) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findSymbolByRVAEx(
-      DWORD rva, enum SymTagEnum symtag, IDiaSymbol **ppSymbol,
-      long *displacement) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findSymbolByVAEx(
-      ULONGLONG va, enum SymTagEnum symtag, IDiaSymbol **ppSymbol,
-      long *displacement) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findFile(
-      IDiaSymbol *pCompiland, LPCOLESTR name, DWORD compareFlags,
-      IDiaEnumSourceFiles **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  // Looks up entry `uniqueId` of the SourceFiles table and returns it as an
-  // IDiaSourceFile.
-  // Returns E_INVALIDARG when ppResult is null or when the table enumerator
-  // has not been created yet (m_pEnumTables is only populated by
-  // getEnumTables); otherwise propagates the HRESULT of the table lookup or
-  // of the final QueryInterface. *ppResult is null on every early failure.
-  STDMETHODIMP findFileById(
-    /* [in] */ DWORD uniqueId,
-    /* [out] */ IDiaSourceFile **ppResult) override {
-    if (ppResult == nullptr) {
-      return E_INVALIDARG;
-    }
-    *ppResult = nullptr;
-    if (!m_pEnumTables) {
-      return E_INVALIDARG;
-    }
-    CComPtr<IDiaTable> pTable;
-    // Only vt and uintVal are read by DxcDiaEnumTables::Item, so the VARIANT
-    // is filled in by hand rather than through the variant helpers.
-    VARIANT vtIndex;
-    vtIndex.vt = VT_UI4;
-    vtIndex.uintVal = static_cast<UINT>(DiaTableKind::SourceFiles);
-    IFR(m_pEnumTables->Item(vtIndex, &pTable));
-    CComPtr<IUnknown> pElt;
-    IFR(pTable->Item(uniqueId, &pElt));
-    return pElt->QueryInterface(ppResult);
-  }
-
-  // Not implemented: inputs are ignored, ppResult is not written.
-  STDMETHODIMP findLines(
-      IDiaSymbol *compiland, IDiaSourceFile *file,
-      IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  // Enumerates line numbers for `length` units starting at `offset`.
-  // `seg` is not consulted: `offset` is forwarded as the RVA.
-  STDMETHODIMP findLinesByAddr(
-      DWORD seg, DWORD offset, DWORD length,
-      IDiaEnumLineNumbers **ppResult) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    const HRESULT hr =
-        DxcDiaFindLineNumbersByRVA(this, offset, length, ppResult);
-    return hr;
-  }
-
-  // Enumerates line numbers for `length` units starting at `rva`, with this
-  // session's allocator installed for the duration of the call.
-  STDMETHODIMP findLinesByRVA(
-      DWORD rva, DWORD length,
-      IDiaEnumLineNumbers **ppResult) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    const HRESULT hr =
-        DxcDiaFindLineNumbersByRVA(this, rva, length, ppResult);
-    return hr;
-  }
-
-  // Not implemented: inputs are ignored, ppResult is not written.
-  STDMETHODIMP findLinesByVA(
-      ULONGLONG va, DWORD length,
-      IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  // Not implemented: inputs are ignored, ppResult is not written.
-  STDMETHODIMP findLinesByLinenum(
-      IDiaSymbol *compiland, IDiaSourceFile *file, DWORD linenum,
-      DWORD column, IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  // Declared here only; the body is defined outside the class.
-  STDMETHODIMP findInjectedSource(
-      LPCOLESTR srcFile, IDiaEnumInjectedSources **ppResult) override;
-
-  // None of the methods in this group is implemented. Each one ignores its
-  // inputs, leaves its outputs untouched and returns E_NOTIMPL.
-
-  STDMETHODIMP getEnumDebugStreams(
-      IDiaEnumDebugStreams **ppEnumDebugStreams) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineFramesByAddr(
-      IDiaSymbol *parent, DWORD isect, DWORD offset,
-      IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineFramesByRVA(
-      IDiaSymbol *parent, DWORD rva, IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineFramesByVA(
-      IDiaSymbol *parent, ULONGLONG va,
-      IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineeLines(
-      IDiaSymbol *parent, IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineeLinesByAddr(
-      IDiaSymbol *parent, DWORD isect, DWORD offset, DWORD length,
-      IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineeLinesByRVA(
-      IDiaSymbol *parent, DWORD rva, DWORD length,
-      IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineeLinesByVA(
-      IDiaSymbol *parent, ULONGLONG va, DWORD length,
-      IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineeLinesByLinenum(
-      IDiaSymbol *compiland, IDiaSourceFile *file, DWORD linenum,
-      DWORD column, IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInlineesByName(
-      LPCOLESTR name, DWORD option, IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findAcceleratorInlineeLinesByLinenum(
-      IDiaSymbol *parent, IDiaSourceFile *file, DWORD linenum,
-      DWORD column, IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findSymbolsForAcceleratorPointerTag(
-      IDiaSymbol *parent, DWORD tagValue,
-      IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findSymbolsByRVAForAcceleratorPointerTag(
-      IDiaSymbol *parent, DWORD tagValue, DWORD rva,
-      IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findAcceleratorInlineesByName(
-      LPCOLESTR name, DWORD option, IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP addressForVA(
-      ULONGLONG va, DWORD *pISect, DWORD *pOffset) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP addressForRVA(
-      DWORD rva, DWORD *pISect, DWORD *pOffset) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findILOffsetsByAddr(
-      DWORD isect, DWORD offset, DWORD length,
-      IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findILOffsetsByRVA(
-      DWORD rva, DWORD length, IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findILOffsetsByVA(
-      ULONGLONG va, DWORD length, IDiaEnumLineNumbers **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInputAssemblyFiles(
-      IDiaEnumInputAssemblyFiles **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInputAssembly(
-      DWORD index, IDiaInputAssemblyFile **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInputAssemblyById(
-      DWORD uniqueId, IDiaInputAssemblyFile **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP getFuncMDTokenMapSize(DWORD *pcb) override {
-    return E_NOTIMPL;
-  }
-
-  // Not implemented: inputs are ignored, outputs are not written.
-  // Declared with STDMETHODIMP/override like the other methods of this class
-  // so that a signature mismatch with the interface fails to compile.
-  STDMETHODIMP getFuncMDTokenMap(
-    /* [in] */ DWORD cb,
-    /* [out] */ DWORD *pcb,
-    /* [size_is][out] */ BYTE *pb) override { return E_NOTIMPL; }
-
-  // None of the methods in this group is implemented. Each one ignores its
-  // inputs, leaves its outputs untouched and returns E_NOTIMPL.
-
-  STDMETHODIMP getTypeMDTokenMapSize(DWORD *pcb) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP getTypeMDTokenMap(
-      DWORD cb, DWORD *pcb, BYTE *pb) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP getNumberOfFunctionFragments_VA(
-      ULONGLONG vaFunc, DWORD cbFunc, DWORD *pNumFragments) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP getNumberOfFunctionFragments_RVA(
-      DWORD rvaFunc, DWORD cbFunc, DWORD *pNumFragments) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP getFunctionFragments_VA(
-      ULONGLONG vaFunc, DWORD cbFunc, DWORD cFragments,
-      ULONGLONG *pVaFragment, DWORD *pLenFragment) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP getFunctionFragments_RVA(
-      DWORD rvaFunc, DWORD cbFunc, DWORD cFragments,
-      DWORD *pRvaFragment, DWORD *pLenFragment) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP getExports(IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP getHeapAllocationSites(IDiaEnumSymbols **ppResult) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP findInputAssemblyFile(
-      IDiaSymbol *pSymbol, IDiaInputAssemblyFile **ppResult) override {
-    return E_NOTIMPL;
-  }
-private:
-  CComPtr<IDiaEnumTables> m_pEnumTables;
-};
-
-// Enumerator over the tables exposed by a DxcDiaSession. One table object is
-// created per DiaTableKind on first request and cached in m_tables; every
-// pointer handed to a caller carries its own reference.
-class DxcDiaEnumTables : public IDiaEnumTables {
-private:
-  DXC_MICROCOM_TM_REF_FIELDS()
-protected:
-  CComPtr<DxcDiaSession> m_pSession;
-  unsigned m_next; // Table index at which the next call to Next() resumes.
-public:
-  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
-
-  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
-    return DoBasicQueryInterface<IDiaEnumTables>(this, iid, ppvObject);
-  }
-
-  DxcDiaEnumTables(IMalloc *pMalloc, DxcDiaSession *pSession)
-      : m_pMalloc(pMalloc), m_pSession(pSession), m_dwRef(0), m_next(0) {
-    m_tables.fill(nullptr);
-  }
-
-  // Not implemented.
-  STDMETHODIMP get__NewEnum(
-    /* [retval][out] */ IUnknown **pRetVal) override { return E_NOTIMPL; }
-
-  // Reports the number of table kinds in [FirstTableKind, LastTableKind].
-  STDMETHODIMP get_Count(_Out_ LONG *pRetVal) override {
-    *pRetVal = ((unsigned)LastTableKind - (unsigned)FirstTableKind) + 1;
-    return S_OK;
-  }
-
-  // Returns the table whose kind equals `index` (VT_UI4 or non-negative
-  // VT_I4), creating it on first use.
-  // Returns E_INVALIDARG for a null output pointer, an unsupported variant
-  // type or an out-of-range index, and the creation HRESULT when the table
-  // cannot be created. *table is null on every failure.
-  STDMETHODIMP Item(
-    /* [in] */ VARIANT index,
-    /* [retval][out] */ IDiaTable **table) override {
-    if (table == nullptr) {
-      return E_INVALIDARG;
-    }
-    *table = nullptr;
-    // Avoid pulling in additional variant support (could have used VariantChangeType instead).
-    DWORD indexVal;
-    switch (index.vt) {
-    case VT_UI4:
-      indexVal = index.uintVal;
-      break;
-    case VT_I4:
-      IFR(IntToDWord(index.intVal, &indexVal));
-      break;
-    default:
-      return E_INVALIDARG;
-    }
-    if (indexVal > (unsigned)LastTableKind) {
-      return E_INVALIDARG;
-    }
-    HRESULT hr = EnsureTable(indexVal);
-    if (FAILED(hr)) {
-      // Creation failed, so the cache slot is still empty; do not touch it.
-      return hr;
-    }
-    m_tables[indexVal].p->AddRef();
-    *table = m_tables[indexVal];
-    return hr;
-  }
-
-  // Fetches up to `celt` tables starting at the current position.
-  // Returns S_OK when exactly `celt` tables were fetched and S_FALSE when the
-  // enumeration ran out first. If a table cannot be created, the references
-  // already stored in rgelt are released, the position is restored, the
-  // fetched count is reported as 0 and the creation HRESULT is returned.
-  STDMETHODIMP Next(
-    ULONG celt,
-    IDiaTable **rgelt,
-    ULONG *pceltFetched) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    const unsigned start = m_next;
-    ULONG fetched = 0;
-    while (fetched < celt && m_next <= (unsigned)LastTableKind) {
-      HRESULT hr = EnsureTable(m_next);
-      if (FAILED(hr)) {
-        // Undo the partial fetch so the caller is not left owning references
-        // it cannot know about.
-        while (fetched > 0) {
-          --fetched;
-          rgelt[fetched]->Release();
-          rgelt[fetched] = nullptr;
-        }
-        m_next = start;
-        if (pceltFetched != nullptr)
-          *pceltFetched = 0;
-        return hr;
-      }
-      m_tables[m_next].p->AddRef();
-      rgelt[fetched] = m_tables[m_next];
-      ++m_next, ++fetched;
-    }
-    if (pceltFetched != nullptr)
-      *pceltFetched = fetched;
-    return (fetched == celt) ? S_OK : S_FALSE;
-  }
-
-  // Not implemented.
-  STDMETHODIMP Skip(
-    /* [in] */ ULONG celt) override { return E_NOTIMPL; }
-
-  // Rewinds the enumeration to the first table.
-  STDMETHODIMP Reset(void) override { m_next = 0; return S_OK; }
-
-  // Not implemented.
-  STDMETHODIMP Clone(
-    /* [out] */ IDiaEnumTables **ppenum) override { return E_NOTIMPL; }
-private:
-  // Makes sure m_tables[kind] is populated, creating the table with this
-  // object's allocator installed. `kind` must not exceed LastTableKind.
-  // Returns S_OK when the table was already cached, otherwise the creation
-  // HRESULT (E_FAIL if creation reported success without producing a table).
-  HRESULT EnsureTable(unsigned kind) {
-    if (m_tables[kind]) {
-      return S_OK;
-    }
-    DxcThreadMalloc TM(m_pMalloc);
-    HRESULT hr = CreateDxcDiaTable(m_pSession, (DiaTableKind)kind, &m_tables[kind]);
-    if (SUCCEEDED(hr) && !m_tables[kind]) {
-      hr = E_FAIL;
-    }
-    return hr;
-  }
-
-  std::array<CComPtr<IDiaTable>, (int)LastTableKind+1> m_tables;
-};
-
-// Allocates a DxcDiaEnumTables for pSession on the session's allocator and
-// stores it, with one reference, in *ppEnumTables.
-// Returns E_OUTOFMEMORY (with *ppEnumTables null) when allocation fails.
-static HRESULT CreateDxcDiaEnumTables(DxcDiaSession *pSession, IDiaEnumTables **ppEnumTables) {
-  auto *pTables =
-      CreateOnMalloc<DxcDiaEnumTables>(pSession->GetMallocNoRef(), pSession);
-  *ppEnumTables = pTables;
-  if (pTables == nullptr) {
-    return E_OUTOFMEMORY;
-  }
-  pTables->AddRef();
-  return S_OK;
-}
-
-// Common base for the DIA tables of a session. It implements IDiaTable
-// (including its IEnumUnknown part) and most of the typed enumerator
-// interface T over items of type TItem. Derived classes set m_count and
-// override GetItem to materialize the item at a given index.
-template<typename T, typename TItem>
-class DxcDiaTableBase : public IDiaTable, public T {
-protected:
-  DXC_MICROCOM_TM_REF_FIELDS()
-  CComPtr<DxcDiaSession> m_pSession;
-  unsigned m_next;   // Index of the next item returned by Next().
-  unsigned m_count;  // Number of items in the table.
-  DiaTableKind m_kind;
-
-  // Releases and clears the first `fetched` entries of rgelt. Used to undo a
-  // partially completed Next() so the caller is not left owning references.
-  template <typename TElt>
-  static void ReleaseFetched(TElt **rgelt, ULONG fetched) {
-    while (fetched > 0) {
-      --fetched;
-      if (rgelt[fetched] != nullptr) {
-        rgelt[fetched]->Release();
-        rgelt[fetched] = nullptr;
-      }
-    }
-  }
-public:
-  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
-
-  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
-    return DoBasicQueryInterface<IDiaTable, T, IEnumUnknown>(this, iid, ppvObject);
-  }
-
-  DxcDiaTableBase(IMalloc *pMalloc, DxcDiaSession *pSession, DiaTableKind kind) {
-    m_pMalloc = pMalloc;
-    m_pSession = pSession;
-    m_kind = kind;
-    m_next = 0;
-    m_count = 0;
-  }
-
-  // IEnumUnknown implementation.
-
-  // Fetches up to `celt` items as IUnknown pointers.
-  // Returns S_OK when exactly `celt` items were fetched, S_FALSE when the
-  // table ran out first. If an item cannot be produced, the items already
-  // stored in rgelt are released, the position is restored, the fetched count
-  // is reported as 0 and the failing HRESULT is returned.
-  STDMETHODIMP Next(
-    _In_  ULONG celt,
-    _Out_writes_to_(celt, *pceltFetched)  IUnknown **rgelt,
-    _Out_opt_  ULONG *pceltFetched) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    const unsigned start = m_next;
-    ULONG fetched = 0;
-    while (fetched < celt && m_next < m_count) {
-      HRESULT hr = Item(m_next, &rgelt[fetched]);
-      if (FAILED(hr)) {
-        ReleaseFetched(rgelt, fetched);
-        m_next = start;
-        if (pceltFetched != nullptr)
-          *pceltFetched = 0;
-        return hr;
-      }
-      ++m_next, ++fetched;
-    }
-    if (pceltFetched != nullptr)
-      *pceltFetched = fetched;
-    return (fetched == celt) ? S_OK : S_FALSE;
-  }
-
-  // Advances the position by `celt` items. Returns S_FALSE, without moving,
-  // when fewer than `celt` items remain.
-  STDMETHODIMP Skip(ULONG celt) override {
-    // Compare against the remaining count so that celt + m_next cannot wrap.
-    if (m_next <= m_count && celt <= m_count - m_next) {
-      m_next += celt;
-      return S_OK;
-    }
-    return S_FALSE;
-  }
-
-  // Rewinds the enumeration to the first item.
-  STDMETHODIMP Reset(void) override {
-    m_next = 0;
-    return S_OK;
-  }
-
-  // Not implemented.
-  STDMETHODIMP Clone(IEnumUnknown **ppenum) override {
-    return E_NOTIMPL;
-  }
-
-  // IDiaTable implementation.
-
-  // Not implemented.
-  STDMETHODIMP get__NewEnum(IUnknown **pRetVal) override {
-    return E_NOTIMPL;
-  }
-
-  // Returns a newly allocated BSTR holding TableNames[m_kind].
-  // Returns E_INVALIDARG for a null output pointer and E_OUTOFMEMORY when the
-  // string cannot be allocated.
-  STDMETHODIMP get_name(BSTR *pRetVal) override {
-    if (pRetVal == nullptr)
-      return E_INVALIDARG;
-    *pRetVal = SysAllocString(TableNames[(unsigned)m_kind]);
-    return (*pRetVal) ? S_OK : E_OUTOFMEMORY;
-  }
-
-  // Reports the number of items in the table.
-  STDMETHODIMP get_Count(_Out_ LONG *pRetVal) override {
-    *pRetVal = m_count;
-    return S_OK;
-  }
-
-  // Returns item `index` as an IUnknown. *table is null when the index is out
-  // of range (E_INVALIDARG). The object's allocator is installed for the call
-  // so that this entry point behaves like the typed Item overload.
-  STDMETHODIMP Item(DWORD index, _COM_Outptr_ IUnknown **table) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    *table = nullptr;
-    if (index >= m_count)
-      return E_INVALIDARG;
-    return GetItem(index, (TItem **)table);
-  }
-
-  // T implementation (partial).
-
-  // Not implemented; *ppenum is set to null.
-  STDMETHODIMP Clone(_COM_Outptr_ T **ppenum) override {
-    *ppenum = nullptr;
-    return E_NOTIMPL;
-  }
-
-  // Typed counterpart of the IEnumUnknown Next above, with the same return
-  // values and the same cleanup on failure.
-  STDMETHODIMP Next(
-    /* [in] */ ULONG celt,
-    /* [out] */ TItem **rgelt,
-    /* [out] */ ULONG *pceltFetched) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    const unsigned start = m_next;
-    ULONG fetched = 0;
-    while (fetched < celt && m_next < m_count) {
-      HRESULT hr = GetItem(m_next, &rgelt[fetched]);
-      if (FAILED(hr)) {
-        ReleaseFetched(rgelt, fetched);
-        m_next = start;
-        if (pceltFetched != nullptr)
-          *pceltFetched = 0;
-        return hr;
-      }
-      ++m_next, ++fetched;
-    }
-    if (pceltFetched != nullptr)
-      *pceltFetched = fetched;
-    return (fetched == celt) ? S_OK : S_FALSE;
-  }
-
-  // Returns item `index` as a TItem; E_INVALIDARG when index is out of range.
-  STDMETHODIMP Item(
-    /* [in] */ DWORD index,
-    /* [retval][out] */ TItem **ppItem) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    if (index >= m_count)
-      return E_INVALIDARG;
-    return GetItem(index, ppItem);
-  }
-
-  // Produces the item at `index`. The base version has no items to offer: it
-  // nulls *ppItem and returns E_NOTIMPL.
-  virtual HRESULT GetItem(DWORD index, TItem **ppItem) {
-    UNREFERENCED_PARAMETER(index);
-    *ppItem = nullptr;
-    return E_NOTIMPL;
-  }
-};
-
-class DxcDiaSymbol : public IDiaSymbol {
-  DXC_MICROCOM_TM_REF_FIELDS()
-  CComPtr<DxcDiaSession> m_pSession;
-  // m_index and m_symTag are assigned by Init(); they are zero-initialized
-  // here, like the other scalar fields, so they are never read indeterminate.
-  DWORD m_index = 0;
-  DWORD m_symTag = 0;
-  DWORD m_lexicalParent = 0;
-  DWORD m_dataKind = 0;
-  CComBSTR m_sourceFileName;
-  CComBSTR m_name;
-  CComVariant m_value;
-public:
-  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
-  DXC_MICROCOM_TM_CTOR(DxcDiaSymbol)
-  // Interface lookup is delegated to DoBasicQueryInterface for IDiaSymbol.
-  STDMETHODIMP QueryInterface(REFIID iid, void **ppvObject) {
-    const HRESULT hr = DoBasicQueryInterface<IDiaSymbol>(this, iid, ppvObject);
-    return hr;
-  }
-
-  // Allocates a symbol on pMalloc, gives it one reference and initializes it
-  // with the session, index and tag. Returns E_OUTOFMEMORY (with *pSymbol
-  // null) when allocation fails.
-  static HRESULT Create(IMalloc *pMalloc, DxcDiaSession *pSession, DWORD index, DWORD symTag, DxcDiaSymbol **pSymbol) {
-    DxcDiaSymbol *pNew = Alloc(pMalloc);
-    *pSymbol = pNew;
-    if (pNew == nullptr) {
-      return E_OUTOFMEMORY;
-    }
-    pNew->AddRef();
-    pNew->Init(pSession, index, symTag);
-    return S_OK;
-  }
-
-  // Records the owning session together with the symbol's index and tag.
-  void Init(DxcDiaSession *pSession, DWORD index, DWORD symTag) {
-    m_symTag = symTag;
-    m_index = index;
-    m_pSession = pSession;
-  }
-
-  // Plain setters for the values reported by the getters below.
-  void SetDataKind(DWORD value) {
-    m_dataKind = value;
-  }
-  void SetLexicalParent(DWORD value) {
-    m_lexicalParent = value;
-  }
-  void SetName(LPCWSTR value) {
-    m_name = value;
-  }
-  void SetValue(LPCSTR value) {
-    m_value = value;
-  }
-  // The HRESULT of the variant copy is not reported to the caller.
-  void SetValue(VARIANT *pValue) {
-    m_value.Copy(pValue);
-  }
-  void SetValue(unsigned value) {
-    m_value = value;
-  }
-  void SetSourceFileName(BSTR value) {
-    m_sourceFileName = value;
-  }
-
-#pragma region IDiaSymbol implementation.
-  // Reports the index passed to Init().
-  STDMETHODIMP get_symIndexId(DWORD *pRetVal) override {
-    *pRetVal = m_index;
-    return S_OK;
-  }
-
-  // Reports the tag passed to Init().
-  STDMETHODIMP get_symTag(DWORD *pRetVal) override {
-    *pRetVal = m_symTag;
-    return S_OK;
-  }
-
-  // Hands out a copy of the name set through SetName(); the result is
-  // whatever CComBSTR::CopyTo reports.
-  STDMETHODIMP get_name(BSTR *pRetVal) override {
-    const HRESULT hr = m_name.CopyTo(pRetVal);
-    return hr;
-  }
-
-  // Not implemented: each takes a single [retval][out] pointer, leaves it
-  // untouched and returns E_NOTIMPL.
-  STDMETHODIMP get_lexicalParent(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_classParent(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_type(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-
-  // Reports the data kind set through SetDataKind(). Returns S_FALSE when
-  // the stored kind is 0 and S_OK otherwise; *pRetVal is written either way.
-  STDMETHODIMP get_dataKind(DWORD *pRetVal) override {
-    *pRetVal = m_dataKind;
-    if (m_dataKind == 0) {
-      return S_FALSE;
-    }
-    return S_OK;
-  }
-
-  // Not implemented: each takes a single [retval][out] pointer, leaves it
-  // untouched and returns E_NOTIMPL.
-  STDMETHODIMP get_locationType(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_addressSection(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_addressOffset(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_relativeVirtualAddress(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtualAddress(ULONGLONG *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_registerId(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_offset(LONG *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_length(ULONGLONG *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_slot(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_volatileType(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_constType(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_unalignedType(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_access(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_libraryName(BSTR *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_platform(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_language(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_editAndContinueEnabled(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_frontEndMajor(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_frontEndMinor(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_frontEndBuild(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_backEndMajor(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_backEndMinor(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_backEndBuild(DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Hands out a copy of the source file name set through
-  // SetSourceFileName(). *pRetVal is null when no name has been set.
-  // Returns E_INVALIDARG for a null output pointer and E_OUTOFMEMORY when a
-  // non-empty name cannot be copied (previously reported as S_OK with a null
-  // string).
-  STDMETHODIMP get_sourceFileName(
-    /* [retval][out] */ BSTR *pRetVal) override {
-    if (pRetVal == nullptr) {
-      return E_INVALIDARG;
-    }
-    // CopyTo distinguishes "no string stored" (S_OK, null result) from an
-    // allocation failure (E_OUTOFMEMORY), which Copy() alone cannot.
-    return m_sourceFileName.CopyTo(pRetVal);
-  }
-
-  // Not implemented: each takes a single [retval][out] pointer, leaves it
-  // untouched and returns E_NOTIMPL.
-  STDMETHODIMP get_unused(BSTR *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_thunkOrdinal(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_thisAdjust(LONG *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtualBaseOffset(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtual(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_intro(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_pure(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_callingConvention(DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Copies the stored value into *pRetVal and returns VariantCopy's result.
-  // NOTE(review): VariantCopy clears the destination before copying, so the
-  // caller presumably has to pass an initialized VARIANT - confirm.
-  STDMETHODIMP get_value(VARIANT *pRetVal) override {
-    const HRESULT hr = VariantCopy(pRetVal, &m_value);
-    return hr;
-  }
-
-  // Not implemented: each takes a single [retval][out] pointer, leaves it
-  // untouched and returns E_NOTIMPL.
-  STDMETHODIMP get_baseType(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_token(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_timeStamp(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_guid(GUID *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_symbolsFileName(BSTR *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_reference(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_count(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_bitPosition(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_arrayIndexType(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_packed(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_constructor(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_overloadedOperator(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_nested(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasNestedTypes(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasAssignmentOperator(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasCastOperator(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_scoped(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtualBaseClass(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_indirectVirtualBaseClass(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtualBasePointerOffset(LONG *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtualTableShape(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_lexicalParentId(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_classParentId(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_typeId(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_arrayIndexTypeId(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtualTableShapeId(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_code(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_function(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_managed(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_msil(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtualBaseDispIndex(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_undecoratedName(BSTR *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_age(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_signature(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_compilerGenerated(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_addressTaken(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_rank(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_lowerBound(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_upperBound(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_lowerBoundId(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_upperBoundId(DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Not implemented: the methods below ignore their inputs, leave their
-  // outputs untouched and return E_NOTIMPL. They are declared with
-  // STDMETHODIMP/override like the rest of the class so that a signature
-  // mismatch with IDiaSymbol fails to compile.
-  STDMETHODIMP get_dataBytes(
-    /* [in] */ DWORD cbData,
-    /* [out] */ DWORD *pcbData,
-    /* [size_is][out] */ BYTE *pbData) override { return E_NOTIMPL; }
-
-  STDMETHODIMP findChildren(
-    /* [in] */ enum SymTagEnum symtag,
-    /* [in] */ LPCOLESTR name,
-    /* [in] */ DWORD compareFlags,
-    /* [out] */ IDiaEnumSymbols **ppResult) override { return E_NOTIMPL; }
-
-  STDMETHODIMP findChildrenEx(
-    /* [in] */ enum SymTagEnum symtag,
-    /* [in] */ LPCOLESTR name,
-    /* [in] */ DWORD compareFlags,
-    /* [out] */ IDiaEnumSymbols **ppResult) override { return E_NOTIMPL; }
-
-  STDMETHODIMP findChildrenExByAddr(
-    /* [in] */ enum SymTagEnum symtag,
-    /* [in] */ LPCOLESTR name,
-    /* [in] */ DWORD compareFlags,
-    /* [in] */ DWORD isect,
-    /* [in] */ DWORD offset,
-    /* [out] */ IDiaEnumSymbols **ppResult) override { return E_NOTIMPL; }
-
-  STDMETHODIMP findChildrenExByVA(
-    /* [in] */ enum SymTagEnum symtag,
-    /* [in] */ LPCOLESTR name,
-    /* [in] */ DWORD compareFlags,
-    /* [in] */ ULONGLONG va,
-    /* [out] */ IDiaEnumSymbols **ppResult) override { return E_NOTIMPL; }
-
-  STDMETHODIMP findChildrenExByRVA(
-    /* [in] */ enum SymTagEnum symtag,
-    /* [in] */ LPCOLESTR name,
-    /* [in] */ DWORD compareFlags,
-    /* [in] */ DWORD rva,
-    /* [out] */ IDiaEnumSymbols **ppResult) override { return E_NOTIMPL; }
-
-  // Not implemented: each takes a single [retval][out] pointer, leaves it
-  // untouched and returns E_NOTIMPL.
-  STDMETHODIMP get_targetSection(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_targetOffset(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_targetRelativeVirtualAddress(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_targetVirtualAddress(ULONGLONG *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_machineType(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_oemId(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_oemSymbolId(DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Not implemented: inputs are ignored, outputs are not written. Declared
-  // with STDMETHODIMP/override like the rest of the class so that a signature
-  // mismatch with IDiaSymbol fails to compile.
-  STDMETHODIMP get_types(
-    /* [in] */ DWORD cTypes,
-    /* [out] */ DWORD *pcTypes,
-    /* [size_is][size_is][out] */ IDiaSymbol **pTypes) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_typeIds(
-    /* [in] */ DWORD cTypeIds,
-    /* [out] */ DWORD *pcTypeIds,
-    /* [size_is][out] */ DWORD *pdwTypeIds) override { return E_NOTIMPL; }
-
-  // Not implemented: each takes a single [retval][out] pointer, leaves it
-  // untouched and returns E_NOTIMPL.
-  STDMETHODIMP get_objectPointerType(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_udtKind(DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Not implemented: the options are ignored and `name` is not written.
-  // Declared with STDMETHODIMP/override like the rest of the class so that a
-  // signature mismatch with IDiaSymbol fails to compile.
-  STDMETHODIMP get_undecoratedNameEx(
-    /* [in] */ DWORD undecorateOptions,
-    /* [out] */ BSTR *name) override { return E_NOTIMPL; }
-
-  // Not implemented: each takes a single [retval][out] pointer, leaves it
-  // untouched and returns E_NOTIMPL.
-  STDMETHODIMP get_noReturn(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_customCallingConvention(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_noInline(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_optimizedCodeDebugInfo(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_notReached(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_interruptReturn(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_farReturn(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isStatic(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasDebugInfo(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isLTCG(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isDataAligned(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasSecurityChecks(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_compilerName(BSTR *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasAlloca(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasSetJump(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasLongJump(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasInlAsm(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasEH(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasSEH(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasEHa(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isNaked(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isAggregated(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isSplitted(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_container(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_inlSpec(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_noStackOrdering(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_virtualBaseTableType(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hasManagedCode(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isHotpatchable(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isCVTCIL(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isMSILNetmodule(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isCTypes(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isStripped(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_frontEndQFE(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_backEndQFE(DWORD *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_wasInlined(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_strictGSCheck(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isCxxReturnUdt(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isConstructorVirtualBase(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_RValueReference(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_unmodifiedType(IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_framePointerPresent(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_isSafeBuffers(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_intrinsic(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_sealed(BOOL *pRetVal) override { return E_NOTIMPL; }
-  STDMETHODIMP get_hfaFloat(BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_hfaDouble(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_liveRangeStartAddressSection(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_liveRangeStartAddressOffset(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_liveRangeStartRelativeVirtualAddress(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_countLiveRanges(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_liveRangeLength(
-    /* [retval][out] */ ULONGLONG *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_offsetInUdt(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_paramBasePointerRegisterId(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_localBasePointerRegisterId(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isLocationControlFlowDependent(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_stride(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_numberOfRows(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_numberOfColumns(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isMatrixRowMajor(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE get_numericProperties(
-    /* [in] */ DWORD cnt,
-    /* [out] */ DWORD *pcnt,
-    /* [size_is][out] */ DWORD *pProperties) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE get_modifierValues(
-    /* [in] */ DWORD cnt,
-    /* [out] */ DWORD *pcnt,
-    /* [size_is][out] */ WORD *pModifiers) { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isReturnValue(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isOptimizedAway(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_builtInKind(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_registerType(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_baseDataSlot(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_baseDataOffset(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_textureSlot(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_samplerSlot(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_uavSlot(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_sizeInUdt(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_memorySpaceKind(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_unmodifiedTypeId(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_subTypeId(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_subType(
-    /* [retval][out] */ IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_numberOfModifiers(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_numberOfRegisterIndices(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isHLSLData(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isPointerToDataMember(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isPointerToMemberFunction(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isSingleInheritance(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isMultipleInheritance(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isVirtualInheritance(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_restrictedType(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isPointerBasedOnSymbolValue(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_baseSymbol(
-    /* [retval][out] */ IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_baseSymbolId(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_objectFileName(
-    /* [retval][out] */ BSTR *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isAcceleratorGroupSharedLocal(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isAcceleratorPointerTagLiveRange(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isAcceleratorStubFunction(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_numberOfAcceleratorPointerTags(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isSdl(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isWinRTPointer(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isRefUdt(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isValueUdt(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isInterfaceUdt(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findInlineFramesByAddr(
-    /* [in] */ DWORD isect,
-    /* [in] */ DWORD offset,
-    /* [out] */ IDiaEnumSymbols **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findInlineFramesByRVA(
-    /* [in] */ DWORD rva,
-    /* [out] */ IDiaEnumSymbols **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findInlineFramesByVA(
-    /* [in] */ ULONGLONG va,
-    /* [out] */ IDiaEnumSymbols **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findInlineeLines(
-    /* [out] */ IDiaEnumLineNumbers **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findInlineeLinesByAddr(
-    /* [in] */ DWORD isect,
-    /* [in] */ DWORD offset,
-    /* [in] */ DWORD length,
-    /* [out] */ IDiaEnumLineNumbers **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findInlineeLinesByRVA(
-    /* [in] */ DWORD rva,
-    /* [in] */ DWORD length,
-    /* [out] */ IDiaEnumLineNumbers **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findInlineeLinesByVA(
-    /* [in] */ ULONGLONG va,
-    /* [in] */ DWORD length,
-    /* [out] */ IDiaEnumLineNumbers **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findSymbolsForAcceleratorPointerTag(
-    /* [in] */ DWORD tagValue,
-    /* [out] */ IDiaEnumSymbols **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findSymbolsByRVAForAcceleratorPointerTag(
-    /* [in] */ DWORD tagValue,
-    /* [in] */ DWORD rva,
-    /* [out] */ IDiaEnumSymbols **ppResult) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE get_acceleratorPointerTags(
-    /* [in] */ DWORD cnt,
-    /* [out] */ DWORD *pcnt,
-    /* [size_is][out] */ DWORD *pPointerTags) { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE getSrcLineOnTypeDefn(
-    /* [out] */ IDiaLineNumber **ppResult) { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isPGO(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_hasValidPGOCounts(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_isOptimizedForSpeed(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_PGOEntryCount(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_PGOEdgeCount(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_PGODynamicInstructionCount(
-    /* [retval][out] */ ULONGLONG *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_staticSize(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_finalLiveStaticSize(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_phaseName(
-    /* [retval][out] */ BSTR *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_hasControlFlowCheck(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_constantExport(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_dataExport(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_privateExport(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_noNameExport(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_exportHasExplicitlyAssignedOrdinal(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_exportIsForwarder(
-    /* [retval][out] */ BOOL *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_ordinal(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_frameSize(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_exceptionHandlerAddressSection(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_exceptionHandlerAddressOffset(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_exceptionHandlerRelativeVirtualAddress(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_exceptionHandlerVirtualAddress(
-    /* [retval][out] */ ULONGLONG *pRetVal) override { return E_NOTIMPL; }
-
-  virtual HRESULT STDMETHODCALLTYPE findInputAssemblyFile(
-    /* [out] */ IDiaInputAssemblyFile **ppResult) { return E_NOTIMPL; }
-
-  STDMETHODIMP get_characteristics(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_coffGroup(
-    /* [retval][out] */ IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-
-  virtual STDMETHODIMP get_bindID(
-    /* [retval][out] */ DWORD *pRetVal) { return E_NOTIMPL; }
-
-  virtual STDMETHODIMP get_bindSpace(
-    /* [retval][out] */ DWORD *pRetVal) { return E_NOTIMPL; }
-
-  virtual STDMETHODIMP get_bindSlot(
-    /* [retval][out] */ DWORD *pRetVal) { return E_NOTIMPL; }
-
-#pragma endregion IDiaSymbol implementation.
-};
-
-// Symbol table of a DIA session.
-//
-// The table synthesizes a fixed set of symbols: one for the program and, for
-// the compiland, a Compiland symbol, a CompilandDetails symbol and five
-// CompilandEnv symbols. Globals, functions and types are not enumerated yet.
-class DxcDiaTableSymbols : public DxcDiaTableBase<IDiaEnumSymbols, IDiaSymbol> {
-public:
-  DxcDiaTableSymbols(IMalloc *pMalloc, DxcDiaSession *pSession) : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::Symbols) {
-    // The count is as follows:
-    // One symbol for the program.
-    // One Compiland per compilation unit.
-    // One CompilandDetails per compilation unit.
-    // Five CompilandEnv per Compiland: hlslFlags, hlslTarget, hlslEntry, hlslDefines, hlslArguments.
-    // TODO: one Function/Data for each global (InfoRef().global_variable_count()).
-    // TODO: one symbol for each type (InfoRef().type_count()).
-    const size_t SymbolsPerCU = 1 + 1 + 5;
-    m_count = 1 + pSession->InfoRef().compile_unit_count() * SymbolsPerCU;
-  }
-
-  // Creates the symbol for the zero-based table position `index`.
-  //
-  // On success *ppItem receives a symbol the caller must Release(). Returns
-  // E_FAIL when the resulting id does not name one of the synthesized
-  // symbols, or the failure HRESULT from DxcDiaSymbol::Create.
-  HRESULT GetItem(DWORD index, IDiaSymbol **ppItem) override {
-    DxcThreadMalloc TM(m_pMalloc);
-
-    // Do not leave the out-parameter indeterminate if an IFR below bails out.
-    *ppItem = nullptr;
-
-    // Ids are one-based, so adjust the index.
-    ++index;
-
-    // Program symbol.
-    CComPtr<DxcDiaSymbol> item;
-    if (index == HlslProgramId) {
-      IFR(DxcDiaSymbol::Create(m_pMalloc, m_pSession, index, SymTagExe, &item));
-      item->SetName(L"HLSL");
-    }
-    else if (index == HlslCompilandId) {
-      IFR(DxcDiaSymbol::Create(m_pMalloc, m_pSession, index, SymTagCompiland, &item));
-      item->SetName(L"main");
-      item->SetLexicalParent(HlslProgramId);
-      if (m_pSession->MainFileName()) {
-        // cast<> (not dyn_cast<>) because the result is used unconditionally.
-        StringRef strRef = cast<MDString>(m_pSession->MainFileName()->getOperand(0)->getOperand(0))->getString();
-        std::string str = strRef.str(); // StringRef is not guaranteed to be null terminated
-        item->SetSourceFileName(_bstr_t(Unicode::UTF8ToUTF16StringOrThrow(str.c_str()).c_str()));
-      }
-    }
-    else if (index == HlslCompilandDetailsId) {
-      IFR(DxcDiaSymbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandDetails, &item));
-      item->SetLexicalParent(HlslCompilandId);
-      // TODO: complete the rest of the compiland details
-      // platform: 256, language: 16, frontEndMajor: 6, frontEndMinor: 3, value: 0, hasDebugInfo: 1, compilerName: compiler string goes here
-    }
-    else if (index == HlslCompilandEnvFlagsId) {
-      IFR(DxcDiaSymbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
-      item->SetLexicalParent(HlslCompilandId);
-      item->SetName(L"hlslFlags");
-      item->SetValue(m_pSession->DxilModuleRef().GetGlobalFlags());
-    }
-    else if (index == HlslCompilandEnvTargetId) {
-      IFR(DxcDiaSymbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
-      item->SetLexicalParent(HlslCompilandId);
-      item->SetName(L"hlslTarget");
-      item->SetValue(m_pSession->DxilModuleRef().GetShaderModel()->GetName());
-    }
-    else if (index == HlslCompilandEnvEntryId) {
-      IFR(DxcDiaSymbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
-      item->SetLexicalParent(HlslCompilandId);
-      item->SetName(L"hlslEntry");
-      item->SetValue(m_pSession->DxilModuleRef().GetEntryFunctionName().c_str());
-    }
-    else if (index == HlslCompilandEnvDefinesId) {
-      IFR(DxcDiaSymbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
-      item->SetLexicalParent(HlslCompilandId);
-      item->SetName(L"hlslDefines");
-      // Construct a double null terminated string for defines with L"\0" as a delimiter
-      CComBSTR pBSTR;
-      // Tolerate modules without defines metadata: the value is then just the
-      // terminating null.
-      auto pDefines = m_pSession->Defines();
-      if (pDefines != nullptr && pDefines->getNumOperands() > 0) {
-        llvm::MDNode *definesNode = pDefines->getOperand(0);
-        for (llvm::MDNode::op_iterator it = definesNode->op_begin(), itEnd = definesNode->op_end(); it != itEnd; ++it) {
-          const std::string define = cast<MDString>(*it)->getString().str();
-          // NOTE(review): metadata strings are treated as UTF-8 here, matching
-          // the UTF8ToUTF16 conversion used for the source file name above.
-          CA2W wideDefine(define.c_str(), CP_UTF8);
-          pBSTR.Append(wideDefine);
-          pBSTR.Append(L"\0", 1);
-        }
-      }
-      pBSTR.Append(L"\0", 1);
-      VARIANT Variant = {}; // zero every field before filling in the ones we use
-      Variant.bstrVal = pBSTR;
-      Variant.vt = VARENUM::VT_BSTR;
-      item->SetValue(&Variant);
-    }
-    else if (index == HlslCompilandEnvArgumentsId) {
-      IFR(DxcDiaSymbol::Create(m_pMalloc, m_pSession, index, SymTagCompilandEnv, &item));
-      item->SetLexicalParent(HlslCompilandId);
-      item->SetName(L"hlslArguments");
-      // Join all arguments with single spaces; missing metadata yields "".
-      std::string args;
-      auto pArguments = m_pSession->Arguments();
-      if (pArguments != nullptr && pArguments->getNumOperands() > 0) {
-        auto Arguments = pArguments->getOperand(0);
-        const unsigned NumArguments = Arguments->getNumOperands();
-        for (unsigned i = 0; i < NumArguments; ++i) {
-          StringRef strRef = cast<MDString>(Arguments->getOperand(i))->getString();
-          if (!args.empty())
-            args.push_back(' ');
-          args.append(strRef.begin(), strRef.end());
-        }
-      }
-      item->SetValue(args.c_str());
-    }
-
-    // TODO: add support for global data and functions as well as types.
-
-    *ppItem = item.Detach();
-    return (*ppItem == nullptr) ? E_FAIL : S_OK;
-  }
-};
-
-// IDiaSourceFile view over one entry of the session's Contents() metadata.
-// Operand 0 of the entry holds the file name; m_index selects the entry and
-// doubles as the file's unique id.
-class DxcDiaSourceFile : public IDiaSourceFile {
-  DXC_MICROCOM_TM_REF_FIELDS()
-  CComPtr<DxcDiaSession> m_pSession;
-  DWORD m_index;
-public:
-  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
-  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
-    return DoBasicQueryInterface<IDiaSourceFile>(this, iid, ppvObject);
-  }
-
-  DxcDiaSourceFile(IMalloc *pMalloc, DxcDiaSession *pSession, DWORD index)
-    : m_pMalloc(pMalloc), m_pSession(pSession), m_index(index) {}
-
-  // Metadata tuple describing this file.
-  llvm::MDTuple *NameContent() {
-    return cast<llvm::MDTuple>(m_pSession->Contents()->getOperand(m_index));
-  }
-  // File name stored in operand 0 of the tuple.
-  llvm::StringRef Name() {
-    // cast<> rather than dyn_cast<>: the result is dereferenced
-    // unconditionally, so a wrong operand kind should assert, not yield null.
-    return cast<llvm::MDString>(NameContent()->getOperand(0))->getString();
-  }
-
-  // Returns the index of this file within the session contents.
-  STDMETHODIMP get_uniqueId(
-    /* [retval][out] */ DWORD *pRetVal) override {
-    if (pRetVal == nullptr)
-      return E_POINTER;
-    *pRetVal = m_index;
-    return S_OK;
-  }
-
-  // Returns the file name as a newly allocated BSTR.
-  STDMETHODIMP get_fileName(
-    /* [retval][out] */ BSTR *pRetVal) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    return StringRefToBSTR(Name(), pRetVal);
-  }
-
-  STDMETHODIMP get_checksumType(
-    /* [retval][out] */ DWORD *pRetVal) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP get_compilands(
-    /* [retval][out] */ IDiaEnumSymbols **pRetVal) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP get_checksum(
-    /* [in] */ DWORD cbData,
-    /* [out] */ DWORD *pcbData,
-    /* [size_is][out] */ BYTE *pbData) override {
-    return E_NOTIMPL;
-  }
-};
-
-// Source-file table. Items are created lazily on first request and cached in
-// m_items, which holds one reference to each created item.
-class DxcDiaTableSourceFiles : public DxcDiaTableBase<IDiaEnumSourceFiles, IDiaSourceFile> {
-public:
-  DxcDiaTableSourceFiles(IMalloc *pMalloc, DxcDiaSession *pSession) : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::SourceFiles) { 
-    m_count =
-      (m_pSession->Contents() == nullptr) ? 0 : m_pSession->Contents()->getNumOperands();
-    m_items.assign(m_count, nullptr);
-  }
-
-  // Returns the cached item for `index`, creating it on first use.
-  //
-  // Returns E_INVALIDARG for an out-of-range index and E_OUTOFMEMORY when the
-  // item cannot be allocated. On success the caller owns exactly one
-  // reference on *ppItem.
-  HRESULT GetItem(DWORD index, IDiaSourceFile **ppItem) override {
-    *ppItem = nullptr;
-    if (index >= m_items.size())
-      return E_INVALIDARG;
-
-    CComPtr<IDiaSourceFile> &cached = m_items[index];
-    if (!cached) {
-      // Assigning to the CComPtr takes the cache's own reference.
-      cached = CreateOnMalloc<DxcDiaSourceFile>(m_pMalloc, m_pSession, index);
-      if (cached == nullptr)
-        return E_OUTOFMEMORY;
-    }
-
-    // Hand out one additional reference for the caller. The previous code
-    // called AddRef twice here, which leaked a reference on every call.
-    *ppItem = cached;
-    (*ppItem)->AddRef();
-    return S_OK;
-  }
-private:
-  std::vector<CComPtr<IDiaSourceFile>> m_items;
-};
-
-// IDiaLineNumber backed by the debug location of a single instruction.
-// Start and end line/column are reported as the same value because only one
-// position is read from the debug location.
-class DxcDiaLineNumber : public IDiaLineNumber {
-  DXC_MICROCOM_TM_REF_FIELDS()
-  CComPtr<DxcDiaSession> m_pSession;
-  const Instruction *m_inst;
-public:
-  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
-  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
-    return DoBasicQueryInterface<IDiaLineNumber>(this, iid, ppvObject);
-  }
-
-  DxcDiaLineNumber(IMalloc *pMalloc, DxcDiaSession *pSession, const Instruction * inst)
-    : m_pMalloc(pMalloc), m_pSession(pSession), m_inst(inst) {}
-
-  // Debug location of the wrapped instruction; asserts that it is valid.
-  const llvm::DebugLoc &DL() {
-    DXASSERT(bool(m_inst->getDebugLoc()), "Trying to read line info from invalid debug location");
-    return m_inst->getDebugLoc();
-  }
-
-  STDMETHODIMP get_compiland(
-    /* [retval][out] */ IDiaSymbol **pRetVal) override { return E_NOTIMPL; }
-
-  // Resolves the source file through get_sourceFileId. Any result other than
-  // S_OK from that lookup (including S_FALSE) is returned as-is, with
-  // *pRetVal set to null.
-  STDMETHODIMP get_sourceFile(
-    /* [retval][out] */ IDiaSourceFile **pRetVal) override {
-    if (pRetVal == nullptr)
-      return E_POINTER;
-    // Make sure the early-return paths below never leave garbage behind.
-    *pRetVal = nullptr;
-    DWORD id;
-    HRESULT hr = get_sourceFileId(&id);
-    if (hr != S_OK)
-      return hr;
-    return m_pSession->findFileById(id, pRetVal);
-  }
-
-  STDMETHODIMP get_lineNumber(
-    /* [retval][out] */ DWORD *pRetVal) override {
-    *pRetVal = DL().getLine();
-    return S_OK;
-  }
-
-  STDMETHODIMP get_lineNumberEnd(
-    /* [retval][out] */ DWORD *pRetVal) override {
-    *pRetVal = DL().getLine();
-    return S_OK;
-  }
-
-  STDMETHODIMP get_columnNumber(
-    /* [retval][out] */ DWORD *pRetVal) override {
-    *pRetVal = DL().getCol();
-    return S_OK;
-  }
-
-  STDMETHODIMP get_columnNumberEnd(
-    /* [retval][out] */ DWORD *pRetVal) override {
-    *pRetVal = DL().getCol();
-    return S_OK;
-  }
-
-  STDMETHODIMP get_addressSection(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_addressOffset(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Looks the instruction up in the session's RVA map.
-  STDMETHODIMP get_relativeVirtualAddress(
-    /* [retval][out] */ DWORD *pRetVal) override { 
-    *pRetVal = m_pSession->RvaMapRef()[m_inst];
-    return S_OK;
-  }
-
-  STDMETHODIMP get_virtualAddress(
-    /* [retval][out] */ ULONGLONG *pRetVal) override { return E_NOTIMPL; }
-
-  STDMETHODIMP get_length(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Maps the scope of the debug location to a source file id. Only lexical
-  // block and subprogram scopes are handled; any other scope yields id 0 and
-  // S_FALSE.
-  STDMETHODIMP get_sourceFileId(
-    /* [retval][out] */ DWORD *pRetVal) override {
-    MDNode *pScope = DL().getScope();
-    if (DILexicalBlock *pBlock = dyn_cast_or_null<DILexicalBlock>(pScope)) {
-      return m_pSession->getSourceFileIdByName(pBlock->getFile()->getFilename(), pRetVal);
-    }
-    if (DISubprogram *pSubProgram = dyn_cast_or_null<DISubprogram>(pScope)) {
-      return m_pSession->getSourceFileIdByName(pSubProgram->getFile()->getFilename(), pRetVal);
-    }
-    *pRetVal = 0;
-    return S_FALSE;
-  }
-
-  STDMETHODIMP get_statement(
-    /* [retval][out] */ BOOL *pRetVal) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP get_compilandId(
-    /* [retval][out] */ DWORD *pRetVal) override {
-    // Single compiland for now, so pretty simple.
-    *pRetVal = HlslCompilandId;
-    return S_OK;
-  }
-};
-
-// Line number table for dxc.
-//
-// The table reads from a list of instructions that carry line debug info.
-// Constructed from a session alone, it refers to the session's complete list
-// (InstructionLinesRef()). Constructed with an explicit list, it moves that
-// list into its own storage and refers to the private copy, which lets a
-// caller enumerate just a subset of lines.
-class DxcDiaTableLineNumbers
-    : public DxcDiaTableBase<IDiaEnumLineNumbers, IDiaLineNumber> {
-public:
-  // Table over every instruction with line info known to the session.
-  DxcDiaTableLineNumbers(IMalloc *pMalloc, DxcDiaSession *pSession)
-      : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::LineNumbers),
-        m_instructions(pSession->InstructionLinesRef()) {
-    m_count = m_instructions.size();
-  }
-
-  // Table over an explicit instruction list, which is moved into the table.
-  DxcDiaTableLineNumbers(IMalloc *pMalloc, DxcDiaSession *pSession,
-                         std::vector<const Instruction *> &&instructions)
-      : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::LineNumbers),
-        m_instructions(m_instructionsStorage),
-        m_instructionsStorage(std::move(instructions)) {
-    m_count = m_instructions.size();
-  }
-
-  // Creates a line number object for the instruction at `index`.
-  // E_INVALIDARG for an out-of-range index, E_OUTOFMEMORY if allocation fails.
-  HRESULT GetItem(DWORD index, IDiaLineNumber **ppItem) override {
-    const bool inRange = index < m_instructions.size();
-    if (!inRange) {
-      return E_INVALIDARG;
-    }
-
-    const Instruction *pInst = m_instructions[index];
-    *ppItem = CreateOnMalloc<DxcDiaLineNumber>(m_pMalloc, m_pSession, pInst);
-    if (*ppItem != nullptr) {
-      (*ppItem)->AddRef();
-      return S_OK;
-    }
-    return E_OUTOFMEMORY;
-  }
-
-private:
-  // The list actually enumerated: either the session's list or
-  // m_instructionsStorage below.
-  const std::vector<const Instruction *> &m_instructions;
-
-  // Backing storage used only when the table was given its own list.
-  std::vector<const Instruction *> m_instructionsStorage;
-};
-
-// Builds a line number enumerator for the instructions in the half-open RVA
-// range [rva, rva + length).
-//
-// An RVA is used as an index into pSession->InstructionsRef(). Instructions
-// without a debug location are left out of the result.
-//
-// Returns E_POINTER if ppResult is null, E_INVALIDARG if a non-empty range
-// reaches past the instruction list, and E_OUTOFMEMORY if the table cannot be
-// allocated. On success the caller owns one reference on *ppResult. A zero
-// length always yields an empty table.
-static HRESULT DxcDiaFindLineNumbersByRVA(
-  DxcDiaSession *pSession,
-  DWORD rva,
-  DWORD length,
-  IDiaEnumLineNumbers **ppResult) 
-{
-  if (!ppResult)
-    return E_POINTER;
-  *ppResult = nullptr;
-
-  const std::vector<const Instruction*> &allInstructions = pSession->InstructionsRef();
-
-  // Validate the whole range up front. Writing the test as a subtraction
-  // avoids computing rva + length in DWORD, which could wrap around and make
-  // an out-of-range request look like an empty one.
-  if (length != 0) {
-    const size_t total = allInstructions.size();
-    if (rva >= total || length > total - rva)
-      return E_INVALIDARG;
-  }
-
-  // Gather the list of instructions that map to the given rva range.
-  std::vector<const Instruction*> instructions;
-  const size_t first = rva;
-  const size_t last = first + length; // cannot overflow: range validated above
-  for (size_t i = first; i < last; ++i) {
-    // Only include the instruction if it has debug info for line mappings.
-    const Instruction *inst = allInstructions[i];
-    if (inst->getDebugLoc())
-      instructions.push_back(inst);
-  }
-
-  // Create line number table from explicit instruction list.
-  IMalloc *pMalloc = pSession->GetMallocNoRef();
-  *ppResult = CreateOnMalloc<DxcDiaTableLineNumbers>(pMalloc, pSession, std::move(instructions));
-  if (*ppResult == nullptr)
-    return E_OUTOFMEMORY;
-  (*ppResult)->AddRef();
-  return S_OK;
-}
-
-// Section contribution table. GetItem never produces an item.
-class DxcDiaTableSections
-    : public DxcDiaTableBase<IDiaEnumSectionContribs, IDiaSectionContrib> {
-public:
-  DxcDiaTableSections(IMalloc *pMalloc, DxcDiaSession *pSession)
-      : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::Sections) {}
-
-  // Clears the out-parameter and reports failure for every index.
-  HRESULT GetItem(DWORD index, IDiaSectionContrib **ppItem) override {
-    *ppItem = nullptr;
-    return E_FAIL;
-  }
-};
-
-// Segment map table. GetItem never produces an item.
-class DxcDiaTableSegmentMap
-    : public DxcDiaTableBase<IDiaEnumSegments, IDiaSegment> {
-public:
-  DxcDiaTableSegmentMap(IMalloc *pMalloc, DxcDiaSession *pSession)
-      : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::SegmentMap) {}
-
-  // Clears the out-parameter and reports failure for every index.
-  HRESULT GetItem(DWORD index, IDiaSegment **ppItem) override {
-    *ppItem = nullptr;
-    return E_FAIL;
-  }
-};
-
-// IDiaInjectedSource view over one entry of the session's Contents()
-// metadata. Operand 0 of the entry is the file name and operand 1 the source
-// text; m_index selects the entry.
-class DxcDiaInjectedSource : public IDiaInjectedSource {
-  DXC_MICROCOM_TM_REF_FIELDS()
-  CComPtr<DxcDiaSession> m_pSession;
-  DWORD m_index;
-public:
-  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
-  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
-    return DoBasicQueryInterface<IDiaInjectedSource>(this, iid, ppvObject);
-  }
-
-  DxcDiaInjectedSource(IMalloc *pMalloc, DxcDiaSession *pSession, DWORD index)
-    : m_pMalloc(pMalloc), m_pSession(pSession), m_index(index) {}
-
-  // Metadata tuple (name, content) for this source.
-  llvm::MDTuple *NameContent() {
-    return cast<llvm::MDTuple>(m_pSession->Contents()->getOperand(m_index));
-  }
-  // cast<> rather than dyn_cast<> in the two accessors below: the result is
-  // dereferenced unconditionally, so a wrong operand kind should assert.
-  llvm::StringRef Name() {
-    return cast<llvm::MDString>(NameContent()->getOperand(0))->getString();
-  }
-  llvm::StringRef Content() {
-    return cast<llvm::MDString>(NameContent()->getOperand(1))->getString();
-  }
-
-  STDMETHODIMP get_crc(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Size of the source text in bytes.
-  STDMETHODIMP get_length(_Out_ ULONGLONG *pRetVal) override {
-    *pRetVal = Content().size();
-    return S_OK;
-  }
-
-  STDMETHODIMP get_filename(BSTR *pRetVal) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    return StringRefToBSTR(Name(), pRetVal);
-  }
-
-  STDMETHODIMP get_objectFilename(BSTR *pRetVal) override {
-    *pRetVal = nullptr;
-    return S_OK;
-  }
-
-  STDMETHODIMP get_virtualFilename(BSTR *pRetVal) override {
-    return get_filename(pRetVal);
-  }
-
-  STDMETHODIMP get_sourceCompression(
-    /* [retval][out] */ DWORD *pRetVal) override { return E_NOTIMPL; }
-
-  // Copies up to cbData bytes of the source text into pbData.
-  //
-  // With a null pbData this is a size query: *pcbData (if given) receives the
-  // full size. Otherwise *pcbData (if given) receives the number of bytes
-  // actually copied. No terminator is written.
-  STDMETHODIMP get_source(
-    /* [in] */ DWORD cbData,
-    /* [out] */ DWORD *pcbData,
-    /* [size_is][out] */ BYTE *pbData) override {
-    // Resolve the metadata once instead of on every use.
-    const llvm::StringRef content = Content();
-
-    if (pbData == nullptr) {
-      if (pcbData != nullptr) {
-        *pcbData = content.size();
-      }
-      return S_OK;
-    }
-
-    cbData = std::min((DWORD)content.size(), cbData);
-    memcpy(pbData, content.begin(), cbData);
-    if (pcbData) {
-      *pcbData = cbData;
-    }
-    return S_OK;
-  }
-};
-
-// Injected source table.
-//
-// By default the table covers every entry of the session's Contents()
-// metadata. After Init(filename) it covers only the entries whose name equals
-// `filename`; m_indexList then maps table positions to metadata indices.
-class DxcDiaTableInjectedSource : public DxcDiaTableBase<IDiaEnumInjectedSources, IDiaInjectedSource> {
-public:
-  DxcDiaTableInjectedSource(IMalloc *pMalloc, DxcDiaSession *pSession) : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::InjectedSource) {
-    // Count the number of source files available.
-    m_count =
-      (m_pSession->Contents() == nullptr) ? 0 : m_pSession->Contents()->getNumOperands();
-  }
-
-  // Creates the injected source at table position `index`.
-  // E_INVALIDARG for an out-of-range index, E_OUTOFMEMORY if allocation fails.
-  HRESULT GetItem(DWORD index, IDiaInjectedSource **ppItem) override {
-    *ppItem = nullptr;
-    if (index >= m_count)
-      return E_INVALIDARG;
-    unsigned itemIndex = index;
-    // m_count only equals the (non-empty) index list size once Init has
-    // filtered the table; in that case translate to the metadata index.
-    if (m_count == m_indexList.size())
-      itemIndex = m_indexList[index];
-    *ppItem = CreateOnMalloc<DxcDiaInjectedSource>(m_pMalloc, m_pSession, itemIndex);
-    if (*ppItem == nullptr)
-      return E_OUTOFMEMORY;
-    (*ppItem)->AddRef();
-    return S_OK;
-  }
-
-  // Restricts the table to the entries named exactly `filename`.
-  // Calling Init again replaces the previous filter.
-  void Init(StringRef filename) {
-    m_indexList.clear();
-    auto pContents = m_pSession->Contents();
-    // A session without contents metadata simply has no matching entries.
-    const unsigned numContents = (pContents == nullptr) ? 0 : pContents->getNumOperands();
-    for (unsigned i = 0; i < numContents; ++i) {
-      // cast<> rather than dyn_cast<>: the result is used unconditionally.
-      StringRef fn =
-          cast<MDString>(pContents->getOperand(i)->getOperand(0))
-              ->getString();
-      if (fn.equals(filename)) {
-        m_indexList.emplace_back(i);
-      }
-    }
-    m_count = m_indexList.size();
-  }
-private:
-  std::vector<unsigned> m_indexList;
-};
-
-// Frame data table.
-// HLSL inlines functions for a program, so no data to return.
-class DxcDiaTableFrameData
-    : public DxcDiaTableBase<IDiaEnumFrameData, IDiaFrameData> {
-public:
-  DxcDiaTableFrameData(IMalloc *pMalloc, DxcDiaSession *pSession)
-      : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::FrameData) {}
-
-  // Frame lookup by relative virtual address; not implemented.
-  STDMETHODIMP frameByRVA(
-      /* [in] */ DWORD relativeVirtualAddress,
-      /* [retval][out] */ IDiaFrameData **frame) override {
-    return E_NOTIMPL;
-  }
-
-  // Frame lookup by virtual address; not implemented.
-  STDMETHODIMP frameByVA(
-      /* [in] */ ULONGLONG virtualAddress,
-      /* [retval][out] */ IDiaFrameData **frame) override {
-    return E_NOTIMPL;
-  }
-};
-
-// Input assembly file table.
-// HLSL is not based on IL, so no data to return.
-class DxcDiaTableInputAssemblyFile
-    : public DxcDiaTableBase<IDiaEnumInputAssemblyFiles, IDiaInputAssemblyFile> {
-public:
-  DxcDiaTableInputAssemblyFile(IMalloc *pMalloc, DxcDiaSession *pSession)
-      : DxcDiaTableBase(pMalloc, pSession, DiaTableKind::InputAssemblyFile) {}
-};
-
-// Returns an enumerator over the injected sources whose name equals srcFile.
-//
-// Returns E_POINTER if ppResult is null, E_INVALIDARG if srcFile is null and
-// S_FALSE (with *ppResult null) when the session has no contents metadata.
-// On S_OK the caller owns one reference on *ppResult; the enumerator may be
-// empty when no entry matches.
-STDMETHODIMP DxcDiaSession::findInjectedSource(
-    /* [in] */ LPCOLESTR srcFile,
-    /* [out] */ IDiaEnumInjectedSources **ppResult) {
-  if (ppResult == nullptr)
-    return E_POINTER;
-  *ppResult = nullptr;
-  if (srcFile == nullptr)
-    return E_INVALIDARG;
-  if (Contents() == nullptr)
-    return S_FALSE;
-
-  DxcThreadMalloc TM(m_pMalloc);
-  // Convert with CP_UTF8 explicitly; the default conversion uses the ANSI
-  // code page. NOTE(review): assumes names in the contents metadata are
-  // UTF-8, as the variable name suggests - confirm.
-  CW2A pUtf8FileName(srcFile, CP_UTF8);
-
-  IDiaTable *pTable = nullptr;
-  // IFR, not IFT: a COM method must report failure through its HRESULT
-  // rather than throw.
-  IFR(CreateDxcDiaTable(this, DiaTableKind::InjectedSource, &pTable));
-
-  // The factory created a DxcDiaTableInjectedSource for this kind, so the
-  // downcast is valid; static_cast applies any base-pointer adjustment.
-  DxcDiaTableInjectedSource *pInjectedSource =
-      static_cast<DxcDiaTableInjectedSource *>(pTable);
-  pInjectedSource->Init(pUtf8FileName.m_psz);
-  // The reference taken by CreateDxcDiaTable is handed to the caller.
-  *ppResult = pInjectedSource;
-  return S_OK;
-}
-
-// Factory for the DIA table implementation matching `kind`.
-//
-// *ppTable is null on failure. Returns E_FAIL for an unknown kind and
-// E_OUTOFMEMORY when allocation fails; on S_OK the caller owns one reference.
-static
-HRESULT CreateDxcDiaTable(DxcDiaSession *pSession, DiaTableKind kind, IDiaTable **ppTable) {
-  *ppTable = nullptr;
-  IMalloc *const pAllocator = pSession->GetMallocNoRef();
-
-  IDiaTable *pCreated = nullptr;
-  switch (kind) {
-  case DiaTableKind::Symbols:
-    pCreated = CreateOnMalloc<DxcDiaTableSymbols>(pAllocator, pSession);
-    break;
-  case DiaTableKind::SourceFiles:
-    pCreated = CreateOnMalloc<DxcDiaTableSourceFiles>(pAllocator, pSession);
-    break;
-  case DiaTableKind::LineNumbers:
-    pCreated = CreateOnMalloc<DxcDiaTableLineNumbers>(pAllocator, pSession);
-    break;
-  case DiaTableKind::Sections:
-    pCreated = CreateOnMalloc<DxcDiaTableSections>(pAllocator, pSession);
-    break;
-  case DiaTableKind::SegmentMap:
-    pCreated = CreateOnMalloc<DxcDiaTableSegmentMap>(pAllocator, pSession);
-    break;
-  case DiaTableKind::InjectedSource:
-    pCreated = CreateOnMalloc<DxcDiaTableInjectedSource>(pAllocator, pSession);
-    break;
-  case DiaTableKind::FrameData:
-    pCreated = CreateOnMalloc<DxcDiaTableFrameData>(pAllocator, pSession);
-    break;
-  case DiaTableKind::InputAssemblyFile:
-    pCreated = CreateOnMalloc<DxcDiaTableInputAssemblyFile>(pAllocator, pSession);
-    break;
-  default:
-    return E_FAIL;
-  }
-
-  if (pCreated == nullptr)
-    return E_OUTOFMEMORY;
-
-  // Publish the table and take the reference that belongs to the caller.
-  *ppTable = pCreated;
-  (*ppTable)->AddRef();
-  return S_OK;
-}
-
-class DxcDiaDataSource : public IDiaDataSource {
-private:
-  DXC_MICROCOM_TM_REF_FIELDS()
-  std::shared_ptr<llvm::Module> m_module;
-  std::shared_ptr<llvm::LLVMContext> m_context;
-  std::shared_ptr<llvm::DebugInfoFinder> m_finder;
-public:
-  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
-
-  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
-    return DoBasicQueryInterface<IDiaDataSource>(this, iid, ppvObject);
-  }
-
-  DxcDiaDataSource(IMalloc *pMalloc) : m_pMalloc(pMalloc) {}
-  ~DxcDiaDataSource() {
-    // These are cross-referenced, so let's be explicit.
-    m_finder.reset();
-    m_module.reset();
-    m_context.reset();
-  }
-
-  HRESULT STDMETHODCALLTYPE get_lastError(BSTR *pRetVal) override {
-    *pRetVal = nullptr;
-    return S_OK;
-  }
-
-  HRESULT STDMETHODCALLTYPE loadDataFromPdb(_In_ LPCOLESTR pdbPath) override {
-    return E_NOTIMPL;
-  }
-
-  HRESULT STDMETHODCALLTYPE loadAndValidateDataFromPdb(
-    _In_ LPCOLESTR pdbPath,
-    _In_ GUID *pcsig70,
-    _In_ DWORD sig,
-    _In_ DWORD age) override {
-    return E_NOTIMPL;
-  }
-
-  HRESULT STDMETHODCALLTYPE loadDataForExe(
-    _In_ LPCOLESTR executable,
-    _In_ LPCOLESTR searchPath,
-    _In_ IUnknown *pCallback) override {
-    return E_NOTIMPL;
-  }
-
-  STDMETHODIMP loadDataFromIStream(_In_ IStream *pIStream) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    if (m_module.get() != nullptr) {
-      return E_FAIL;
-    }
-    m_context.reset();
-    m_finder.reset();
-    try {
-      m_context = std::make_shared<LLVMContext>();
-      MemoryBuffer *pBitcodeBuffer;
-      std::unique_ptr<MemoryBuffer> pEmbeddedBuffer;
-      std::unique_ptr<MemoryBuffer> pBuffer =
-          getMemBufferFromStream(pIStream, "data");
-      size_t bufferSize = pBuffer->getBufferSize();
-
-      // The buffer can hold LLVM bitcode for a module, or the ILDB
-      // part from a container.
-      if (bufferSize < sizeof(UINT32)) {
-        return DXC_E_MALFORMED_CONTAINER;
-      }
-      const UINT32 BC_C0DE = ((INT32)(INT8)'B' | (INT32)(INT8)'C' << 8 | (INT32)0xDEC0 << 16); // BC0xc0de in big endian
-      if (BC_C0DE == *(const UINT32*)pBuffer->getBufferStart()) {
-        pBitcodeBuffer = pBuffer.get();
-      }
-      else {
-        if (bufferSize <= sizeof(hlsl::DxilProgramHeader)) {
-          return DXC_E_MALFORMED_CONTAINER;
-        }
-
-        hlsl::DxilProgramHeader *pDxilProgramHeader = (hlsl::DxilProgramHeader *)pBuffer->getBufferStart();
-        if (pDxilProgramHeader->BitcodeHeader.DxilMagic != DxilMagicValue) {
-          return DXC_E_MALFORMED_CONTAINER;
-        }
-
-        UINT32 BlobSize;
-        const char *pBitcode = nullptr;
-        hlsl::GetDxilProgramBitcode(pDxilProgramHeader, &pBitcode, &BlobSize);
-        UINT32 offset = (UINT32)(pBitcode - (const char *)pDxilProgramHeader);
-        std::unique_ptr<MemoryBuffer> p = MemoryBuffer::getMemBuffer(
-            StringRef(pBitcode, bufferSize - offset), "data");
-        pEmbeddedBuffer.swap(p);
-        pBitcodeBuffer = pEmbeddedBuffer.get();
-      }
-
-      std::string DiagStr;
-      std::unique_ptr<llvm::Module> pModule = dxilutil::LoadModuleFromBitcode(
-          pBitcodeBuffer, *m_context.get(), DiagStr);
-      if (!pModule.get())
-        return E_FAIL;
-      m_finder = std::make_shared<DebugInfoFinder>();
-      m_finder->processModule(*pModule.get());
-      m_module.reset(pModule.release());
-    }
-    CATCH_CPP_RETURN_HRESULT();
-    return S_OK;
-  }
-
-  STDMETHODIMP openSession(_COM_Outptr_ IDiaSession **ppSession) override {
-    DxcThreadMalloc TM(m_pMalloc);
-    *ppSession = nullptr;
-    if (m_module.get() == nullptr)
-      return E_FAIL;
-    CComPtr<DxcDiaSession> pSession = DxcDiaSession::Alloc(DxcGetThreadMallocNoRef());
-    IFROOM(pSession.p);
-    pSession->Init(m_context, m_module, m_finder);
-    *ppSession = pSession.Detach();
-    return S_OK;
-  }
-
-  HRESULT STDMETHODCALLTYPE loadDataFromCodeViewInfo(
-    _In_ LPCOLESTR executable,
-    _In_ LPCOLESTR searchPath,
-    _In_ DWORD cbCvInfo,
-    _In_ BYTE *pbCvInfo,
-    _In_ IUnknown *pCallback) override {
-    return E_NOTIMPL;
-  }
-
-  HRESULT STDMETHODCALLTYPE loadDataFromMiscInfo(
-    _In_ LPCOLESTR executable,
-    _In_ LPCOLESTR searchPath,
-    _In_ DWORD timeStampExe,
-    _In_ DWORD timeStampDbg,
-    _In_ DWORD sizeOfExe,
-    _In_ DWORD cbMiscInfo,
-    _In_ BYTE *pbMiscInfo,
-    _In_ IUnknown *pCallback) override {
-    return E_NOTIMPL;
-  }
-};
-
-HRESULT CreateDxcDiaDataSource(_In_ REFIID riid, _Out_ LPVOID* ppv) {
-  CComPtr<DxcDiaDataSource> result = CreateOnMalloc<DxcDiaDataSource>(DxcGetThreadMallocNoRef());
-  if (result == nullptr) {
-    *ppv = nullptr;
-    return E_OUTOFMEMORY;
-  }
-
-  return result.p->QueryInterface(riid, ppv);
-}
diff --git a/tools/clang/tools/dxcompiler/dxcdisassembler.cpp b/tools/clang/tools/dxcompiler/dxcdisassembler.cpp
index 933117c80..3b3094f92 100644
--- a/tools/clang/tools/dxcompiler/dxcdisassembler.cpp
+++ b/tools/clang/tools/dxcompiler/dxcdisassembler.cpp
@@ -784,9 +784,9 @@ void PrintFieldLayout(llvm::Type *Ty, DxilFieldAnnotation &annotation,
     llvm::Type *EltTy = Ty;
     unsigned arraySize = 0;
     unsigned arrayLevel = 0;
-    if (!HLMatrixLower::IsMatrixType(EltTy) && EltTy->isArrayTy()) {
+    if (!dxilutil::IsHLSLMatrixType(EltTy) && EltTy->isArrayTy()) {
       arraySize = 1;
-      while (!HLMatrixLower::IsMatrixType(EltTy) && EltTy->isArrayTy()) {
+      while (!dxilutil::IsHLSLMatrixType(EltTy) && EltTy->isArrayTy()) {
         arraySize *= EltTy->getArrayNumElements();
         EltTy = EltTy->getArrayElementType();
         arrayLevel++;
@@ -817,7 +817,7 @@ void PrintFieldLayout(llvm::Type *Ty, DxilFieldAnnotation &annotation,
     }
 
     std::string StreamStr;
-    if (!HLMatrixLower::IsMatrixType(EltTy) && EltTy->isStructTy()) {
+    if (!dxilutil::IsHLSLMatrixType(EltTy) && EltTy->isStructTy()) {
       std::string NameTypeStr = annotation.GetFieldName();
       raw_string_ostream Stream(NameTypeStr);
       if (arraySize)
@@ -901,7 +901,7 @@ void PrintStructBufferDefinition(DxilResource *buf,
   OS << comment << "\n";
   llvm::Type *RetTy = buf->GetRetType();
   // Skip none struct type.
-  if (!RetTy->isStructTy() || HLMatrixLower::IsMatrixType(RetTy)) {
+  if (!RetTy->isStructTy() || dxilutil::IsHLSLMatrixType(RetTy)) {
     llvm::Type *Ty = buf->GetGlobalSymbol()->getType()->getPointerElementType();
     // For resource array, use element type.
     if (Ty->isArrayTy())
diff --git a/tools/clang/tools/dxcompiler/dxcompilerobj.cpp b/tools/clang/tools/dxcompiler/dxcompilerobj.cpp
index e607f61f3..5d59433ac 100644
--- a/tools/clang/tools/dxcompiler/dxcompilerobj.cpp
+++ b/tools/clang/tools/dxcompiler/dxcompilerobj.cpp
@@ -368,8 +368,8 @@ public:
         IFT(ppSrcCodeResult->GetStatus(&status));
         if (SUCCEEDED(status)) {
           IFT(ppSrcCodeResult->GetResult(&ppSrcCode));
+          pSource = ppSrcCode;
         }
-        pSource = ppSrcCode;
       }
 #endif // ENABLE_SPIRV_CODEGEN
 
@@ -923,6 +923,7 @@ public:
       compiler.getCodeGenOpts().UnrollLoops = true;
 
     compiler.getCodeGenOpts().HLSLHighLevel = Opts.CodeGenHighLevel;
+    compiler.getCodeGenOpts().HLSLResMayAlias = Opts.ResMayAlias;
     compiler.getCodeGenOpts().HLSLAllResourcesBound = Opts.AllResourcesBound;
     compiler.getCodeGenOpts().HLSLDefaultRowMajor = Opts.DefaultRowMajor;
     compiler.getCodeGenOpts().HLSLPreferControlFlow = Opts.PreferFlowControl;
diff --git a/tools/clang/unittests/HLSL/CompilerTest.cpp b/tools/clang/unittests/HLSL/CompilerTest.cpp
index 576fe2479..3b508e8a7 100644
--- a/tools/clang/unittests/HLSL/CompilerTest.cpp
+++ b/tools/clang/unittests/HLSL/CompilerTest.cpp
@@ -231,9 +231,7 @@ public:
   TEST_METHOD(CompileWhenODumpThenOptimizerMatch)
   TEST_METHOD(CompileWhenVdThenProducesDxilContainer)
 
-#ifndef DXC_ON_APPVEYOR_CI
   TEST_METHOD(CompileWhenNoMemThenOOM)
-#endif // DXC_ON_APPVEYOR_CI
   TEST_METHOD(CompileWhenShaderModelMismatchAttributeThenFail)
   TEST_METHOD(CompileBadHlslThenFail)
   TEST_METHOD(CompileLegacyShaderModelThenFail)
@@ -246,37 +244,12 @@ public:
   TEST_METHOD(CompileHlsl2017ThenOK)
   TEST_METHOD(CompileHlsl2018ThenOK)
   TEST_METHOD(CompileHlsl2019ThenFail)
-  TEST_METHOD(CompileCBufferTBufferASTDump)
 
   TEST_METHOD(DiaLoadBadBitcodeThenFail)
   TEST_METHOD(DiaLoadDebugThenOK)
   TEST_METHOD(DiaTableIndexThenOK)
 
-  TEST_METHOD(PixMSAAToSample0)
-  TEST_METHOD(PixRemoveDiscards)
-  TEST_METHOD(PixPixelCounter)
-  TEST_METHOD(PixPixelCounterEarlyZ)
-  TEST_METHOD(PixPixelCounterNoSvPosition)
-  TEST_METHOD(PixPixelCounterAddPixelCost)
-  TEST_METHOD(PixConstantColor)
-  TEST_METHOD(PixConstantColorInt)
-  TEST_METHOD(PixConstantColorMRT)
-  TEST_METHOD(PixConstantColorUAVs)
-  TEST_METHOD(PixConstantColorOtherSIVs)
-  TEST_METHOD(PixConstantColorFromCB)
-  TEST_METHOD(PixConstantColorFromCBint)
-  TEST_METHOD(PixForceEarlyZ)
-  TEST_METHOD(PixDebugBasic)
-  TEST_METHOD(PixDebugUAVSize)
-  TEST_METHOD(PixDebugGSParameters)
-  TEST_METHOD(PixDebugPSParameters)
-  TEST_METHOD(PixDebugVSParameters)
-  TEST_METHOD(PixDebugCSParameters)
-  TEST_METHOD(PixDebugFlowControl)
-  TEST_METHOD(PixDebugPreexistingSVPosition)
-  TEST_METHOD(PixDebugPreexistingSVVertex)
-  TEST_METHOD(PixDebugPreexistingSVInstance)
-  TEST_METHOD(PixAccessTracking)
+  TEST_METHOD(CodeGenPix)
 
   TEST_METHOD(CodeGenAbs1)
   TEST_METHOD(CodeGenAbs2)
@@ -304,6 +277,7 @@ public:
   TEST_METHOD(CodeGenCalcLod2DArray)
   TEST_METHOD(CodeGenCall1)
   TEST_METHOD(CodeGenCall3)
+  TEST_METHOD(CodeGenCastBetweenTypeShapes)
   TEST_METHOD(CodeGenCast1)
   TEST_METHOD(CodeGenCast2)
   TEST_METHOD(CodeGenCast3)
@@ -353,11 +327,6 @@ public:
   TEST_METHOD(CodeGenDot1)
   TEST_METHOD(CodeGenDynamic_Resources)
   TEST_METHOD(CodeGenEffectSkip)
-  TEST_METHOD(CodeGenEliminateDynamicIndexing)
-  TEST_METHOD(CodeGenEliminateDynamicIndexing2)
-  TEST_METHOD(CodeGenEliminateDynamicIndexing3)
-  TEST_METHOD(CodeGenEliminateDynamicIndexing4)
-  TEST_METHOD(CodeGenEliminateDynamicIndexing6)
   TEST_METHOD(CodeGenEmpty)
   TEST_METHOD(CodeGenEmptyStruct)
   TEST_METHOD(CodeGenEnum1)
@@ -486,6 +455,7 @@ public:
   TEST_METHOD(CodeGenMatSubscript6)
   TEST_METHOD(CodeGenMatSubscript7)
   TEST_METHOD(CodeGenMaxMin)
+  TEST_METHOD(CodeGenMaxMinLiteral)
   TEST_METHOD(CodeGenMinprec1)
   TEST_METHOD(CodeGenMinprec2)
   TEST_METHOD(CodeGenMinprec3)
@@ -530,7 +500,6 @@ public:
   TEST_METHOD(CodeGenPrecise4)
   TEST_METHOD(CodeGenPreciseOnCall)
   TEST_METHOD(CodeGenPreciseOnCallNot)
-  TEST_METHOD(CodeGenPreserveAllOutputs)
   TEST_METHOD(CodeGenRaceCond2)
   TEST_METHOD(CodeGenRaw_Buf1)
   TEST_METHOD(CodeGenRaw_Buf2)
@@ -601,7 +570,6 @@ public:
   TEST_METHOD(CodeGenSimpleHS9)
   TEST_METHOD(CodeGenSimpleHS10)
   TEST_METHOD(CodeGenSimpleHS11)
-  TEST_METHOD(CodeGenSMFail)
   TEST_METHOD(CodeGenSrv_Ms_Load1)
   TEST_METHOD(CodeGenSrv_Ms_Load2)
   TEST_METHOD(CodeGenSrv_Typed_Load1)
@@ -925,6 +893,7 @@ public:
   TEST_METHOD(CodeGenDx12MiniEngineParticlesortindirectargscs)
   TEST_METHOD(CodeGenDx12MiniEngineParticlespawncs)
   TEST_METHOD(CodeGenDx12MiniEngineParticletilecullingcs)
+  TEST_METHOD(CodeGenDx12MiniEngineParticletilecullingcs_fail_unroll)
   TEST_METHOD(CodeGenDx12MiniEngineParticletilerendercs)
   TEST_METHOD(CodeGenDx12MiniEngineParticletilerenderfastcs)
   TEST_METHOD(CodeGenDx12MiniEngineParticletilerenderfastdynamiccs)
@@ -953,12 +922,18 @@ public:
   TEST_METHOD(ViewID)
   TEST_METHOD(SubobjectCodeGenErrors)
   TEST_METHOD(ShaderCompatSuite)
+  TEST_METHOD(Unroll)
   TEST_METHOD(QuickTest)
   TEST_METHOD(QuickLlTest)
-  BEGIN_TEST_METHOD(SingleFileCheckTest)
+  BEGIN_TEST_METHOD(ManualFileCheckTest)
     TEST_METHOD_PROPERTY(L"Ignore", L"true")
   END_TEST_METHOD()
 
+  // Batch directories
+  TEST_METHOD(CodeGenDeclarations)
+  TEST_METHOD(CodeGenExpressions)
+  TEST_METHOD(CodeGenPreprocessor)
+
   dxc::DxcDllSupport m_dllSupport;
   VersionSupportInfo m_ver;
 
@@ -2580,7 +2555,6 @@ public:
   }
 };
 
-#ifndef DXC_ON_APPVEYOR_CI
 TEST_F(CompilerTest, CompileWhenNoMemThenOOM) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
 
@@ -2678,7 +2652,6 @@ TEST_F(CompilerTest, CompileWhenNoMemThenOOM) {
     VERIFY_ARE_EQUAL(initialRefCount, InstrMalloc.GetRefCount());
   }
 }
-#endif // DXC_ON_APPVEYOR_CI
 
 TEST_F(CompilerTest, CompileWhenShaderModelMismatchAttributeThenFail) {
   CComPtr<IDxcCompiler> pCompiler;
@@ -2864,10 +2837,6 @@ TEST_F(CompilerTest, CompileHlsl2019ThenFail) {
   CheckOperationResultMsgs(pResult, &pErrorMsg, 1, false, false);
 }
 
-TEST_F(CompilerTest, CompileCBufferTBufferASTDump) {
-  CodeGenTestCheck(L"ctbuf.hlsl");
-}
-
 #ifdef _WIN32 // - exclude dia stuff
 TEST_F(CompilerTest, DiaLoadBadBitcodeThenFail) {
   CComPtr<IDxcBlob> pBadBitcode;
@@ -2939,104 +2908,8 @@ TEST_F(CompilerTest, DiaTableIndexThenOK) {
 }
 #endif // _WIN32 - exclude dia stuff
 
-TEST_F(CompilerTest, PixMSAAToSample0) {
-  CodeGenTestCheck(L"pix\\msaaLoad.hlsl");
-}
-
-TEST_F(CompilerTest, PixRemoveDiscards) {
-  CodeGenTestCheck(L"pix\\removeDiscards.hlsl");
-}
-
-TEST_F(CompilerTest, PixPixelCounter) {
-  CodeGenTestCheck(L"pix\\pixelCounter.hlsl");
-}
-
-TEST_F(CompilerTest, PixPixelCounterEarlyZ) {
-  CodeGenTestCheck(L"pix\\pixelCounterEarlyZ.hlsl");
-}
-
-TEST_F(CompilerTest, PixPixelCounterNoSvPosition) {
-  CodeGenTestCheck(L"pix\\pixelCounterNoSvPosition.hlsl");
-}
-
-TEST_F(CompilerTest, PixPixelCounterAddPixelCost) {
-  CodeGenTestCheck(L"pix\\pixelCounterAddPixelCost.hlsl");
-}
-
-TEST_F(CompilerTest, PixConstantColor) {
-  CodeGenTestCheck(L"pix\\constantcolor.hlsl");
-}
-
-TEST_F(CompilerTest, PixConstantColorInt) {
-  CodeGenTestCheck(L"pix\\constantcolorint.hlsl");
-}
-
-TEST_F(CompilerTest, PixConstantColorMRT) {
-  CodeGenTestCheck(L"pix\\constantcolorMRT.hlsl");
-}
-
-TEST_F(CompilerTest, PixConstantColorUAVs) {
-  CodeGenTestCheck(L"pix\\constantcolorUAVs.hlsl");
-}
-
-TEST_F(CompilerTest, PixConstantColorOtherSIVs) {
-  CodeGenTestCheck(L"pix\\constantcolorOtherSIVs.hlsl");
-}
-
-TEST_F(CompilerTest, PixConstantColorFromCB) {
-  CodeGenTestCheck(L"pix\\constantcolorFromCB.hlsl");
-}
-
-TEST_F(CompilerTest, PixConstantColorFromCBint) {
-  CodeGenTestCheck(L"pix\\constantcolorFromCBint.hlsl");
-}
-
-TEST_F(CompilerTest, PixForceEarlyZ) {
-  CodeGenTestCheck(L"pix\\forceEarlyZ.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugBasic) {
-  CodeGenTestCheck(L"pix\\DebugBasic.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugUAVSize) {
-  CodeGenTestCheck(L"pix\\DebugUAVSize.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugGSParameters) {
-  CodeGenTestCheck(L"pix\\DebugGSParameters.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugPSParameters) {
-  CodeGenTestCheck(L"pix\\DebugPSParameters.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugVSParameters) {
-  CodeGenTestCheck(L"pix\\DebugVSParameters.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugCSParameters) {
-  CodeGenTestCheck(L"pix\\DebugCSParameters.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugFlowControl) {
-  CodeGenTestCheck(L"pix\\DebugFlowControl.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugPreexistingSVPosition) {
-  CodeGenTestCheck(L"pix\\DebugPreexistingSVPosition.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugPreexistingSVVertex) {
-  CodeGenTestCheck(L"pix\\DebugPreexistingSVVertex.hlsl");
-}
-
-TEST_F(CompilerTest, PixDebugPreexistingSVInstance) {
-  CodeGenTestCheck(L"pix\\DebugPreexistingSVInstance.hlsl");
-}
-
-TEST_F(CompilerTest, PixAccessTracking) {
-  CodeGenTestCheck(L"pix\\AccessTracking.hlsl");
+TEST_F(CompilerTest, CodeGenPix) {
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\pix");
 }
 
 TEST_F(CompilerTest, CodeGenAbs1) {
@@ -3149,6 +3022,10 @@ TEST_F(CompilerTest, CodeGenCall3) {
   CodeGenTest(L"..\\CodeGenHLSL\\call3.hlsl");
 }
 
+TEST_F(CompilerTest, CodeGenCastBetweenTypeShapes) {
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\cast_between_type_shapes");
+}
+
 TEST_F(CompilerTest, CodeGenCast1) {
   CodeGenTest(L"..\\CodeGenHLSL\\cast1.hlsl");
 }
@@ -3351,26 +3228,6 @@ TEST_F(CompilerTest, CodeGenEffectSkip) {
   CodeGenTestCheck(L"..\\CodeGenHLSL\\effect_skip.hlsl");
 }
 
-TEST_F(CompilerTest, CodeGenEliminateDynamicIndexing) {
-  CodeGenTestCheck(L"eliminate_dynamic_output.hlsl");
-}
-
-TEST_F(CompilerTest, CodeGenEliminateDynamicIndexing2) {
-  CodeGenTestCheck(L"eliminate_dynamic_output2.hlsl");
-}
-
-TEST_F(CompilerTest, CodeGenEliminateDynamicIndexing3) {
-  CodeGenTestCheck(L"eliminate_dynamic_output3.hlsl");
-}
-
-TEST_F(CompilerTest, CodeGenEliminateDynamicIndexing4) {
-  CodeGenTestCheck(L"eliminate_dynamic_output4.hlsl");
-}
-
-TEST_F(CompilerTest, CodeGenEliminateDynamicIndexing6) {
-  CodeGenTestCheck(L"eliminate_dynamic_output6.hlsl");
-}
-
 TEST_F(CompilerTest, CodeGenEmpty) {
   CodeGenTest(L"..\\CodeGenHLSL\\empty.hlsl");
 }
@@ -3433,21 +3290,7 @@ TEST_F(CompilerTest, CodeGenExternRes) {
 }
 
 TEST_F(CompilerTest, CodeGenExpandTrig) {
-  CodeGenTestCheck(L"expand_trig\\acos.hlsl");
-  CodeGenTestCheck(L"expand_trig\\acos_h.hlsl");
-  CodeGenTestCheck(L"expand_trig\\asin.hlsl");
-  CodeGenTestCheck(L"expand_trig\\asin_h.hlsl");
-  CodeGenTestCheck(L"expand_trig\\atan.hlsl");
-  CodeGenTestCheck(L"expand_trig\\atan_h.hlsl");
-  CodeGenTestCheck(L"expand_trig\\hcos.hlsl");
-  CodeGenTestCheck(L"expand_trig\\hcos_h.hlsl");
-  CodeGenTestCheck(L"expand_trig\\hsin.hlsl");
-  CodeGenTestCheck(L"expand_trig\\hsin_h.hlsl");
-  CodeGenTestCheck(L"expand_trig\\htan.hlsl");
-  CodeGenTestCheck(L"expand_trig\\htan_h.hlsl");
-  CodeGenTestCheck(L"expand_trig\\tan.hlsl");
-  CodeGenTestCheck(L"expand_trig\\keep_precise.0.hlsl");
-  CodeGenTestCheck(L"expand_trig\\keep_precise.1.hlsl");
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\expand_trig");
 }
 
 TEST_F(CompilerTest, CodeGenFloatCast) {
@@ -3907,6 +3750,10 @@ TEST_F(CompilerTest, CodeGenMaxMin) {
   CodeGenTestCheck(L"..\\CodeGenHLSL\\max_min.hlsl");
 }
 
+TEST_F(CompilerTest, CodeGenMaxMinLiteral) {
+  CodeGenTestCheck(L"..\\CodeGenHLSL\\max_min_literal.hlsl");
+}
+
 TEST_F(CompilerTest, CodeGenMinprec1) {
   CodeGenTestCheck(L"..\\CodeGenHLSL\\minprec1.hlsl");
 }
@@ -4088,16 +3935,6 @@ TEST_F(CompilerTest, CodeGenPreciseOnCallNot) {
   CodeGenTestCheck(L"..\\CodeGenHLSL\\precise_call_not.hlsl");
 }
 
-TEST_F(CompilerTest, CodeGenPreserveAllOutputs) {
-  CodeGenTestCheck(L"preserve_all_outputs_1.hlsl");
-  CodeGenTestCheck(L"preserve_all_outputs_2.hlsl");
-  CodeGenTestCheck(L"preserve_all_outputs_3.hlsl");
-  CodeGenTestCheck(L"preserve_all_outputs_4.hlsl");
-  CodeGenTestCheck(L"preserve_all_outputs_5.hlsl");
-  CodeGenTestCheck(L"preserve_all_outputs_6.hlsl");
-  CodeGenTestCheck(L"preserve_all_outputs_7.hlsl");
-}
-
 TEST_F(CompilerTest, CodeGenRaceCond2) {
   CodeGenTest(L"..\\CodeGenHLSL\\RaceCond2.hlsl");
 }
@@ -4382,10 +4219,6 @@ TEST_F(CompilerTest, CodeGenSimpleHS11) {
   CodeGenTestCheck(L"..\\CodeGenHLSL\\SimpleHs11.hlsl");
 }
 
-TEST_F(CompilerTest, CodeGenSMFail) {
-  CodeGenTestCheck(L"sm-fail.hlsl");
-}
-
 TEST_F(CompilerTest, CodeGenSrv_Ms_Load1) {
   CodeGenTestCheck(L"..\\CodeGenHLSL\\srv_ms_load1.hlsl");
 }
@@ -5656,6 +5489,10 @@ TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilecullingcs){
   CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileCullingCS.hlsl");
 }
 
+TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilecullingcs_fail_unroll){
+  CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileCullingCS_fail_unroll.hlsl");
+}
+
 TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilerendercs){
   CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileRenderCS.hlsl");
 }
@@ -5745,71 +5582,11 @@ TEST_F(CompilerTest, DxilGen_StoreOutput) {
 }
 
 TEST_F(CompilerTest, ConstantFolding) {
-  CodeGenTestCheck(L"constprop\\FAbs.hlsl");
-  CodeGenTestCheck(L"constprop\\Saturate_half.hlsl");
-  CodeGenTestCheck(L"constprop\\Saturate_float.hlsl");
-  CodeGenTestCheck(L"constprop\\Saturate_double.hlsl");
-  CodeGenTestCheck(L"constprop\\Cos.hlsl");
-  CodeGenTestCheck(L"constprop\\Sin.hlsl");
-  CodeGenTestCheck(L"constprop\\Tan.hlsl");
-  CodeGenTestCheck(L"constprop\\Acos.hlsl");
-  CodeGenTestCheck(L"constprop\\Asin.hlsl");
-  CodeGenTestCheck(L"constprop\\Atan.hlsl");
-  CodeGenTestCheck(L"constprop\\Hcos.hlsl");
-  CodeGenTestCheck(L"constprop\\Hsin.hlsl");
-  CodeGenTestCheck(L"constprop\\Htan.hlsl");
-  CodeGenTestCheck(L"constprop\\Exp.hlsl");
-  CodeGenTestCheck(L"constprop\\Frc.hlsl");
-  CodeGenTestCheck(L"constprop\\Log.hlsl");
-  CodeGenTestCheck(L"constprop\\Sqrt.hlsl");
-  CodeGenTestCheck(L"constprop\\Rsqrt.hlsl");
-  CodeGenTestCheck(L"constprop\\Round_ne.hlsl");
-  CodeGenTestCheck(L"constprop\\Round_ni.hlsl");
-  CodeGenTestCheck(L"constprop\\Round_pi.hlsl");
-  CodeGenTestCheck(L"constprop\\Round_z.hlsl");
-  
-  CodeGenTestCheck(L"constprop\\Bfrev.hlsl");
-  CodeGenTestCheck(L"constprop\\Countbits.hlsl");
-  CodeGenTestCheck(L"constprop\\Firstbitlo.hlsl");
-  CodeGenTestCheck(L"constprop\\Firstbithi.hlsl");
-
-  CodeGenTestCheck(L"constprop\\FMin.hlsl");
-  CodeGenTestCheck(L"constprop\\FMax.hlsl");
-  CodeGenTestCheck(L"constprop\\IMin.hlsl");
-  CodeGenTestCheck(L"constprop\\IMax.hlsl");
-  CodeGenTestCheck(L"constprop\\UMin.hlsl");
-  CodeGenTestCheck(L"constprop\\UMax.hlsl");
-  
-  CodeGenTestCheck(L"constprop\\FMad.hlsl");
-  CodeGenTestCheck(L"constprop\\Fma.hlsl");
-  CodeGenTestCheck(L"constprop\\IMad.hlsl");
-  CodeGenTestCheck(L"constprop\\UMad.hlsl");
-  
-  CodeGenTestCheck(L"constprop\\Dot2.hlsl");
-  CodeGenTestCheck(L"constprop\\Dot3.hlsl");
-  CodeGenTestCheck(L"constprop\\Dot4.hlsl");
-
-  CodeGenTestCheck(L"constprop\\ibfe.ll");
-  CodeGenTestCheck(L"constprop\\ubfe.ll");
-  CodeGenTestCheck(L"constprop\\bfi.ll");
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\constprop");
 }
 
 TEST_F(CompilerTest, HoistConstantArray) {
-  CodeGenTestCheck(L"hca\\01.hlsl");
-  CodeGenTestCheck(L"hca\\02.hlsl");
-  CodeGenTestCheck(L"hca\\03.hlsl");
-  CodeGenTestCheck(L"hca\\04.hlsl");
-  CodeGenTestCheck(L"hca\\05.hlsl");
-  CodeGenTestCheck(L"hca\\06.hlsl");
-  CodeGenTestCheck(L"hca\\07.hlsl");
-  CodeGenTestCheck(L"hca\\08.hlsl");
-  CodeGenTestCheck(L"hca\\09.hlsl");
-  CodeGenTestCheck(L"hca\\10.hlsl");
-  CodeGenTestCheck(L"hca\\11.hlsl");
-  CodeGenTestCheck(L"hca\\12.hlsl");
-  CodeGenTestCheck(L"hca\\13.hlsl");
-  CodeGenTestCheck(L"hca\\14.hlsl");
-  CodeGenTestCheck(L"hca\\15.ll");
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\hoist-constant-array");
 }
 
 TEST_F(CompilerTest, VecElemConstEval) {
@@ -5945,25 +5722,7 @@ TEST_F(CompilerTest, WhenSigMismatchPCFunctionThenFail) {
 
 TEST_F(CompilerTest, ViewID) {
   if (m_ver.SkipDxilVersion(1,1)) return;
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid01.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid02.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid03.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid04.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid05.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid06.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid07.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid08.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid09.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid10.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid11.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid12.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid13.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid14.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid15.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid16.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid17.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid18.hlsl");
-  CodeGenTestCheck(L"..\\CodeGenHLSL\\viewid\\viewid19.hlsl");
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\viewid");
 }
 
 TEST_F(CompilerTest, SubobjectCodeGenErrors) {
@@ -6003,17 +5762,12 @@ TEST_F(CompilerTest, SubobjectCodeGenErrors) {
   }
 }
 
+TEST_F(CompilerTest, Unroll) {
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\unroll");
+}
+
 TEST_F(CompilerTest, ShaderCompatSuite) {
-  using namespace WEX::TestExecution;
-  std::wstring suitePath = L"..\\CodeGenHLSL\\shader-compat-suite";
-
-  WEX::Common::String value;
-  if (!DXC_FAILED(RuntimeParameters::TryGetValue(L"SuitePath", value)))
-  {
-    suitePath = value;
-  }
-
-  CodeGenTestCheckBatchDir(suitePath);
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\shader-compat-suite");
 }
 
 TEST_F(CompilerTest, QuickTest) {
@@ -6025,19 +5779,48 @@ TEST_F(CompilerTest, QuickLlTest) {
 }
 
 #ifdef _WIN32
-TEST_F(CompilerTest, SingleFileCheckTest) {
+TEST_F(CompilerTest, ManualFileCheckTest) {
 #else
-TEST_F(CompilerTest, DISABLED_SingleFileCheckTest) {
+TEST_F(CompilerTest, DISABLED_ManualFileCheckTest) {
 #endif
   using namespace llvm;
   using namespace WEX::TestExecution;
+
   WEX::Common::String value;
-  VERIFY_SUCCEEDED(RuntimeParameters::TryGetValue(L"InputFile", value));
-  std::wstring filename = value;
-  CW2A pUtf8Filename(filename.c_str());
-  if (!llvm::sys::path::is_absolute(pUtf8Filename.m_psz)) {
-    filename = hlsl_test::GetPathToHlslDataFile(filename.c_str());
+  VERIFY_SUCCEEDED(RuntimeParameters::TryGetValue(L"InputPath", value));
+
+  std::wstring path = value;
+  if (!llvm::sys::path::is_absolute(CW2A(path.c_str()).m_psz)) {
+    path = hlsl_test::GetPathToHlslDataFile(path.c_str());
   }
 
-  CodeGenTestCheckBatch(filename.c_str(), 0);
+  bool isDirectory;
+  {
+    // Temporarily setup the filesystem for testing whether the path is a directory.
+    // If it is, CodeGenTestCheckBatchDir will create its own instance.
+    llvm::sys::fs::MSFileSystem *msfPtr;
+    VERIFY_SUCCEEDED(CreateMSFileSystemForDisk(&msfPtr));
+    std::unique_ptr<llvm::sys::fs::MSFileSystem> msf(msfPtr);
+    llvm::sys::fs::AutoPerThreadSystem pts(msf.get());
+    IFTLLVM(pts.error_code());
+    isDirectory = llvm::sys::fs::is_directory(CW2A(path.c_str()).m_psz);
+  }
+
+  if (isDirectory) {
+    CodeGenTestCheckBatchDir(path);
+  } else {
+    CodeGenTestCheckBatch(path.c_str(), 0);
+  }
+}
+
+TEST_F(CompilerTest, CodeGenDeclarations) {
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\declarations");
+}
+
+TEST_F(CompilerTest, CodeGenExpressions) {
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\expressions");
+}
+
+TEST_F(CompilerTest, CodeGenPreprocessor) {
+  CodeGenTestCheckBatchDir(L"..\\CodeGenHLSL\\preprocessor");
 }
diff --git a/tools/clang/unittests/HLSL/DxcTestUtils.h b/tools/clang/unittests/HLSL/DxcTestUtils.h
index 0144c8777..35bce687d 100644
--- a/tools/clang/unittests/HLSL/DxcTestUtils.h
+++ b/tools/clang/unittests/HLSL/DxcTestUtils.h
@@ -55,20 +55,12 @@ public:
 };
 
 class FileRunCommandPart {
-private:
-  void RunFileChecker(const FileRunCommandPart *Prior);
-  void RunStdErrChecker(const FileRunCommandPart *Prior);
-  void RunDxc(const FileRunCommandPart *Prior);
-  void RunDxv(const FileRunCommandPart *Prior);
-  void RunOpt(const FileRunCommandPart *Prior);
-  void RunD3DReflect(const FileRunCommandPart *Prior);
-  void RunTee(const FileRunCommandPart *Prior);
 public:
-  FileRunCommandPart(const FileRunCommandPart&) = default;
   FileRunCommandPart(const std::string &command, const std::string &arguments, LPCWSTR commandFileName);
-  FileRunCommandPart(FileRunCommandPart && other);
+  FileRunCommandPart(const FileRunCommandPart&) = default;
+  FileRunCommandPart(FileRunCommandPart&&) = default;
   
-  void Run(const FileRunCommandPart *Prior);
+  void Run(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior);
   
   void ReadOptsForDxc(hlsl::options::MainArgs &argStrings, hlsl::options::DxcOpts &Opts);
 
@@ -76,13 +68,20 @@ public:
   std::string Arguments;    // Arguments to command
   LPCWSTR CommandFileName;  // File name replacement for %s
 
-  dxc::DxcDllSupport *DllSupport; // DLL support to use for Run().
-
   // These fields are set after an invocation to Run().
   CComPtr<IDxcOperationResult> OpResult;  // The operation result, if any.
   int RunResult;                          // The exit code for the operation.
   std::string StdOut;                     // Standard output text.
   std::string StdErr;                     // Standard error text.
+
+private:
+  void RunFileChecker(const FileRunCommandPart *Prior);
+  void RunDxc(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior);
+  void RunDxv(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior);
+  void RunOpt(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior);
+  void RunD3DReflect(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior);
+  void RunTee(const FileRunCommandPart *Prior);
+  void RunXFail(const FileRunCommandPart *Prior);
 };
 
 void ParseCommandParts(LPCSTR commands, LPCWSTR fileName, std::vector<FileRunCommandPart> &parts);
diff --git a/tools/clang/unittests/HLSL/DxilContainerTest.cpp b/tools/clang/unittests/HLSL/DxilContainerTest.cpp
index 4f8331029..725a994c2 100644
--- a/tools/clang/unittests/HLSL/DxilContainerTest.cpp
+++ b/tools/clang/unittests/HLSL/DxilContainerTest.cpp
@@ -345,53 +345,14 @@ public:
   }
 #endif // _WIN32 - Reflection unsupported
 
-  void split(const wstring &s, wchar_t delim, vector<wstring> &elems) {
-    wstringstream ss(s);
-    wstring item;
-    while (getline(ss, item, delim)) {
-      elems.push_back(item);
-    }
-  }
-
-  vector<wstring> split(const wstring &s, char delim) {
-    vector<wstring> elems;
-    split(s, delim, elems);
-    return elems;
-  }
-
-  wstring SplitFilename(const wstring &str) {
-    size_t found;
-    found = str.find_last_of(L"/\\");
-    return str.substr(0, found);
-  }
-
-  void NameParseCommandPartsFromFile(LPCWSTR path, std::vector<FileRunCommandPart> &parts) {
-    vector<wstring> Parts = split(wstring(path), '+');
-    std::wstring Name = Parts[0];
-    std::wstring EntryPoint = Parts[1];
-    std::wstring ShaderModel = Parts[2];
-    std::wstring Arguments = L"-T ";
-    Arguments += ShaderModel;
-    Arguments += L" -E ";
-    Arguments += EntryPoint;
-    Arguments += L" %s";
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t> > w;
-    string ArgumentsNarrow = w.to_bytes(Arguments);
-
-    FileRunCommandPart P(string("%dxc"), ArgumentsNarrow, path);
-    parts.push_back(P);
-  }
-
 #ifdef _WIN32  // - Reflection unsupported
   HRESULT CompileFromFile(LPCWSTR path, bool useDXBC, IDxcBlob **ppBlob) {
     std::vector<FileRunCommandPart> parts;
-    //NameParseCommandPartsFromFile(path, parts);
     ParseCommandPartsFromFile(path, parts);
     VERIFY_IS_TRUE(parts.size() > 0);
     VERIFY_ARE_EQUAL_STR(parts[0].Command.c_str(), "%dxc");
     FileRunCommandPart &dxc = parts[0];
     m_dllSupport.Initialize();
-    dxc.DllSupport = &m_dllSupport;
 
     hlsl::options::MainArgs args;
     hlsl::options::DxcOpts opts;
@@ -411,7 +372,7 @@ public:
       IFR(pDxbcBlob.QueryInterface(ppBlob));
     }
     else {
-      dxc.Run(nullptr);
+      dxc.Run(m_dllSupport, nullptr);
       IFRBOOL(dxc.RunResult == 0, E_FAIL);
       IFR(dxc.OpResult->GetResult(ppBlob));
     }
diff --git a/tools/clang/unittests/HLSL/ExecutionTest.cpp b/tools/clang/unittests/HLSL/ExecutionTest.cpp
index cb5292a29..f00a85feb 100644
--- a/tools/clang/unittests/HLSL/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSL/ExecutionTest.cpp
@@ -301,7 +301,9 @@ public:
   END_TEST_METHOD()
 
   TEST_METHOD(BasicShaderModel61);
-  TEST_METHOD(BasicShaderModel63);
+  BEGIN_TEST_METHOD(BasicShaderModel63)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // Remove this line once warp supports this feature in Shader Model 6.3
+  END_TEST_METHOD()
 
   BEGIN_TEST_METHOD(WaveIntrinsicsActiveIntTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveIntTable")
@@ -410,16 +412,37 @@ public:
   
   TEST_METHOD(ComputeRawBufferLdStI32);
   TEST_METHOD(ComputeRawBufferLdStFloat);
-  TEST_METHOD(ComputeRawBufferLdStI64);
-  TEST_METHOD(ComputeRawBufferLdStDouble);
-  TEST_METHOD(ComputeRawBufferLdStI16);
-  TEST_METHOD(ComputeRawBufferLdStHalf);
+
+  BEGIN_TEST_METHOD(ComputeRawBufferLdStI64)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // Remove this line once warp supports this feature in Shader Model 6.3
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(ComputeRawBufferLdStDouble)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // Remove this line once warp supports this feature in Shader Model 6.3
+  END_TEST_METHOD()
+    
+  BEGIN_TEST_METHOD(ComputeRawBufferLdStI16)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // This test is disabled because of a bug in WARP; TODO: enable once the bug is fixed
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(ComputeRawBufferLdStHalf)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // This test is disabled because of a bug in WARP; TODO: enable once the bug is fixed
+  END_TEST_METHOD()
+
   TEST_METHOD(GraphicsRawBufferLdStI32);
   TEST_METHOD(GraphicsRawBufferLdStFloat);
-  TEST_METHOD(GraphicsRawBufferLdStI64);
-  TEST_METHOD(GraphicsRawBufferLdStDouble);
-  TEST_METHOD(GraphicsRawBufferLdStI16);
-  TEST_METHOD(GraphicsRawBufferLdStHalf);
+
+  BEGIN_TEST_METHOD(GraphicsRawBufferLdStI64)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // Remove this line once warp supports this feature in Shader Model 6.3
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(GraphicsRawBufferLdStDouble)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // Remove this line once warp supports this feature in Shader Model 6.3
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(GraphicsRawBufferLdStI16)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // This test is disabled because of a bug in WARP; TODO: enable once the bug is fixed
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(GraphicsRawBufferLdStHalf)
+    TEST_METHOD_PROPERTY(L"Priority", L"2") // This test is disabled because of a bug in WARP; TODO: enable once the bug is fixed
+  END_TEST_METHOD()
 
   // This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we only
   // require the Windows 10 SDK.
@@ -429,14 +452,19 @@ public:
     D3D_SHADER_MODEL_6_1 = 0x61,
     D3D_SHADER_MODEL_6_2 = 0x62,
     D3D_SHADER_MODEL_6_3 = 0x63,
+    D3D_SHADER_MODEL_6_4 = 0x64,
 } D3D_SHADER_MODEL;
 
 #if WDK_NTDDI_VERSION == NTDDI_WIN10_RS2
   static const D3D_SHADER_MODEL HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_0;
 #elif WDK_NTDDI_VERSION == NTDDI_WIN10_RS3
   static const D3D_SHADER_MODEL HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_1;
-#else
+#elif WDK_NTDDI_VERSION == NTDDI_WIN10_RS4
   static const D3D_SHADER_MODEL HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_2;
+#elif WDK_NTDDI_VERSION == NTDDI_WIN10_RS5
+  static const D3D_SHADER_MODEL HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_3;
+#else
+  static const D3D_SHADER_MODEL HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_4;
 #endif
 
   dxc::DxcDllSupport m_support;
@@ -480,6 +508,11 @@ public:
 
   void RunBasicShaderModelTest(D3D_SHADER_MODEL shaderModel);
 
+  void RunDotOp();
+  void RunDot2AddOp();
+  void RunDot4AddI8PackedOp();
+  void RunDot4AddU8PackedOp();
+
   enum class RawBufferLdStType {
      I32,
      Float,
@@ -489,7 +522,29 @@ public:
      Half
   };
 
-  void RunRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType, char *shaderOpName = nullptr);
+  template <class Ty>
+  struct RawBufferLdStTestData {
+    Ty v1, v2[2], v3[3], v4[4];
+  };
+
+  template <class Ty>
+  struct RawBufferLdStUavData {
+    RawBufferLdStTestData<Ty> input, output, srvOut;
+  };
+
+  template <class Ty>
+  void RunComputeRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
+                            const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData);
+
+  template <class Ty>
+  void RunGraphicsRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
+                            const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData);
+
+  template <class Ty>
+  void VerifyRawBufferLdStTestResults(const std::shared_ptr<st::ShaderOpTest> test, const RawBufferLdStTestData<Ty> &testData);
+                                      
+  bool SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType, CComPtr<ID3D12Device> &pDevice, 
+                              CComPtr<IStream> &pStream, char *&sTy, char *&additionalOptions);
 
   template <class Ty>
   void RunBasicShaderModelTest(CComPtr<ID3D12Device> pDevice, const char *pShaderModelStr, const char *pShader, Ty *pInputDataPairs, unsigned inputDataCount);
@@ -550,12 +605,16 @@ public:
   }
 
   bool CreateDevice(_COM_Outptr_ ID3D12Device **ppDevice,
-                    D3D_SHADER_MODEL testModel = D3D_SHADER_MODEL_6_0) {
+                    D3D_SHADER_MODEL testModel = D3D_SHADER_MODEL_6_0, bool skipUnsupported = true) {
     if (testModel > HIGHEST_SHADER_MODEL) {
       UINT minor = (UINT)testModel & 0x0f;
       LogCommentFmt(L"Installed SDK does not support "
           L"shader model 6.%1u", minor);
-      WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+
+      if (skipUnsupported) {
+        WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+      }
+
       return false;
     }
     const D3D_FEATURE_LEVEL FeatureLevelRequired = D3D_FEATURE_LEVEL_11_0;
@@ -572,7 +631,11 @@ public:
                                            IID_PPV_ARGS(&pDevice));
       if (FAILED(createHR)) {
         LogCommentFmt(L"The available version of WARP does not support d3d12.");
-        WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+
+        if (skipUnsupported) {
+          WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+        }
+
         return false;
       }
     } else {
@@ -614,7 +677,11 @@ public:
         UINT minor = (UINT)testModel & 0x0f;
         LogCommentFmt(L"The selected device does not support "
                       L"shader model 6.%1u", minor);
-        WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+
+        if (skipUnsupported) {
+          WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+        }
+
         return false;
       }
     }
@@ -2677,6 +2744,44 @@ struct SDotOp {
     float o_dot4;
 };
 
+struct Half2
+{
+    uint16_t x;
+    uint16_t y;
+
+    Half2() = default;
+
+    Half2(const Half2&) = default;
+    Half2& operator=(const Half2&) = default;
+
+    Half2(Half2&&) = default;
+    Half2& operator=(Half2&&) = default;
+
+    constexpr Half2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {}
+    explicit Half2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {}
+};
+
+struct SDot2AddOp {
+    Half2 input1;
+    Half2 input2;
+    float acc;
+    float result;
+};
+
+struct SDot4AddI8PackedOp {
+    uint32_t input1;
+    uint32_t input2;
+    int32_t acc;
+    int32_t result;
+};
+
+struct SDot4AddU8PackedOp {
+    uint32_t input1;
+    uint32_t input2;
+    uint32_t acc;
+    uint32_t result;
+};
+
 struct SMsad4 {
     unsigned int ref;
     XMUINT2 src;
@@ -3045,15 +3150,45 @@ static TableParameter TertiaryUint16OpParameters[] = {
 };
 
 static TableParameter DotOpParameters[] = {
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input1", TableParameter::STRING_TABLE, true },
-    { L"Validation.Input2", TableParameter::STRING_TABLE, true },
-    { L"Validation.Expected1", TableParameter::STRING_TABLE, true },
-    { L"Validation.Expected2", TableParameter::STRING_TABLE, true },
-    { L"Validation.Expected3", TableParameter::STRING_TABLE, true },
-    { L"Validation.Type", TableParameter::STRING, true },
-    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
+    { L"Dot.ShaderOp.Target", TableParameter::STRING, true },
+    { L"Dot.ShaderOp.Text", TableParameter::STRING, true },
+    { L"Dot.Validation.Input1", TableParameter::STRING_TABLE, true },
+    { L"Dot.Validation.Input2", TableParameter::STRING_TABLE, true },
+    { L"Dot.Validation.Expected1", TableParameter::STRING_TABLE, true },
+    { L"Dot.Validation.Expected2", TableParameter::STRING_TABLE, true },
+    { L"Dot.Validation.Expected3", TableParameter::STRING_TABLE, true },
+    { L"Dot.Validation.Type", TableParameter::STRING, true },
+    { L"Dot.Validation.Tolerance", TableParameter::DOUBLE, true },
+};
+
+static TableParameter Dot2AddOpParameters[] = {
+    { L"Dot2Add.ShaderOp.Target", TableParameter::STRING, true },
+    { L"Dot2Add.ShaderOp.Text", TableParameter::STRING, true },
+    { L"Dot2Add.ShaderOp.Arguments", TableParameter::STRING, true },
+    { L"Dot2Add.Validation.Input1", TableParameter::STRING_TABLE, true },
+    { L"Dot2Add.Validation.Input2", TableParameter::STRING_TABLE, true },
+    { L"Dot2Add.Validation.Input3", TableParameter::FLOAT_TABLE, true },
+    { L"Dot2Add.Validation.Expected1", TableParameter::FLOAT_TABLE, true },
+    { L"Dot2Add.Validation.Type", TableParameter::STRING, true },
+    { L"Dot2Add.Validation.Tolerance", TableParameter::DOUBLE, true },
+};
+
+static TableParameter Dot4AddI8PackedOpParameters[] = {
+    { L"Dot4AddI8Packed.ShaderOp.Target", TableParameter::STRING, true },
+    { L"Dot4AddI8Packed.ShaderOp.Text", TableParameter::STRING, true },
+    { L"Dot4AddI8Packed.Validation.Input1", TableParameter::UINT32_TABLE, true },
+    { L"Dot4AddI8Packed.Validation.Input2", TableParameter::UINT32_TABLE, true },
+    { L"Dot4AddI8Packed.Validation.Input3", TableParameter::INT32_TABLE, true },
+    { L"Dot4AddI8Packed.Validation.Expected1", TableParameter::INT32_TABLE, true },
+};
+
+static TableParameter Dot4AddU8PackedOpParameters[] = {
+    { L"Dot4AddU8Packed.ShaderOp.Target", TableParameter::STRING, true },
+    { L"Dot4AddU8Packed.ShaderOp.Text", TableParameter::STRING, true },
+    { L"Dot4AddU8Packed.Validation.Input1", TableParameter::UINT32_TABLE, true },
+    { L"Dot4AddU8Packed.Validation.Input2", TableParameter::UINT32_TABLE, true },
+    { L"Dot4AddU8Packed.Validation.Input3", TableParameter::UINT32_TABLE, true },
+    { L"Dot4AddU8Packed.Validation.Expected1", TableParameter::UINT32_TABLE, true },
 };
 
 static TableParameter Msad4OpParameters[] = {
@@ -3243,6 +3378,23 @@ static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) {
     return S_OK;
 }
 
+static HRESULT ParseDataToVectorHalf(PCWSTR str, uint16_t *ptr, size_t count) {
+    std::wstring wstr(str);
+    size_t curPosition = 0;
+    // Parse `count` comma-separated float values from the string and store each one as a 16-bit half
+    for (size_t i = 0; i < count; ++i) {
+        size_t nextPosition = wstr.find(L",", curPosition);
+        float floatValue;
+        if (FAILED(ParseDataToFloat(
+            wstr.substr(curPosition, nextPosition - curPosition).data(), floatValue))) {
+            return E_FAIL;
+        }
+        *(ptr + i) = ConvertFloat32ToFloat16(floatValue);
+        curPosition = nextPosition + 1;
+    }
+    return S_OK;
+}
+
 static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr, size_t count) {
     std::wstring wstr(str);
     size_t curPosition = 0;
@@ -3497,6 +3649,10 @@ static void VerifyOutputWithExpectedValueInt(int output, int ref, int tolerance)
     VERIFY_IS_TRUE(output - ref <= tolerance && ref - output <= tolerance);
 }
 
+static void VerifyOutputWithExpectedValueUInt(uint32_t output, uint32_t ref, uint32_t tolerance) {
+    VERIFY_IS_TRUE((output > ref ? output - ref : ref - output) <= tolerance); // |output - ref|; subtracting the smaller from the larger avoids unsigned wraparound
+}
+
 static void VerifyOutputWithExpectedValueFloat(
     float output, float ref, LPCWSTR type, double tolerance,
     hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {
@@ -4879,7 +5035,18 @@ TEST_F(ExecutionTest, TertiaryUint16OpTest) {
   }
 }
 
+// TODO: Split into 4 different tests after 19H1 when we're allowed to add new tests
 TEST_F(ExecutionTest, DotTest) {
+    RunDotOp();
+    RunDot2AddOp();
+    RunDot4AddI8PackedOp();
+    RunDot4AddU8PackedOp();
+}
+
+// Helper for the Dot operator, which is part of DotTest
+void ExecutionTest::RunDotOp() {
+    WEX::Logging::Log::Comment(L"\nRunning Dot Op tests:\n");
+
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
@@ -4893,22 +5060,22 @@ TEST_F(ExecutionTest, DotTest) {
     int tableSize = sizeof(DotOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(DotOpParameters, tableSize);
 
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+    CW2A Target(handler.GetTableParamByName(L"Dot.ShaderOp.Target")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"Dot.ShaderOp.Text")->m_str);
 
     std::vector<WEX::Common::String> *Validation_Input1 =
-        &handler.GetTableParamByName(L"Validation.Input1")->m_StringTable;
+        &handler.GetTableParamByName(L"Dot.Validation.Input1")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_Input2 =
-        &handler.GetTableParamByName(L"Validation.Input2")->m_StringTable;
+        &handler.GetTableParamByName(L"Dot.Validation.Input2")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_dot2 =
-        &handler.GetTableParamByName(L"Validation.Expected1")->m_StringTable;
+        &handler.GetTableParamByName(L"Dot.Validation.Expected1")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_dot3 =
-        &handler.GetTableParamByName(L"Validation.Expected2")->m_StringTable;
+        &handler.GetTableParamByName(L"Dot.Validation.Expected2")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_dot4 =
-        &handler.GetTableParamByName(L"Validation.Expected3")->m_StringTable;
+        &handler.GetTableParamByName(L"Dot.Validation.Expected3")->m_StringTable;
 
-    PCWSTR Validation_type = handler.GetTableParamByName(L"Validation.Type")->m_str;
-    double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
+    PCWSTR Validation_type = handler.GetTableParamByName(L"Dot.Validation.Type")->m_str;
+    double tolerance = handler.GetTableParamByName(L"Dot.Validation.Tolerance")->m_double;
     size_t count = Validation_Input1->size();
 
     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
@@ -4962,6 +5129,217 @@ TEST_F(ExecutionTest, DotTest) {
     }
 }
 
+// Helper for the Dot2Add operator, which is part of DotTest
+void ExecutionTest::RunDot2AddOp() {
+    WEX::Logging::Log::Comment(L"\nRunning Dot2Add Op tests:\n");
+
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
+        return;
+    }
+
+    if (!DoesDeviceSupportNative16bitOps(pDevice)) {
+        WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
+        // Don't skip this test for now, otherwise the entire DotTest would be skipped
+        // TODO: Skip the test once the Dot tests have been split in 4 different tests
+        return;
+    }
+
+    int tableSize = sizeof(Dot2AddOpParameters) / sizeof(TableParameter);
+    TableParameterHandler handler(Dot2AddOpParameters, tableSize);
+
+    CW2A Target(handler.GetTableParamByName(L"Dot2Add.ShaderOp.Target")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"Dot2Add.ShaderOp.Text")->m_str);
+    CW2A Arguments(handler.GetTableParamByName(L"Dot2Add.ShaderOp.Arguments")->m_str);
+
+    std::vector<WEX::Common::String> *validation_input1 =
+        &handler.GetTableParamByName(L"Dot2Add.Validation.Input1")->m_StringTable;
+    std::vector<WEX::Common::String> *validation_input2 =
+        &handler.GetTableParamByName(L"Dot2Add.Validation.Input2")->m_StringTable;
+    std::vector<float> *validation_acc = &handler.GetTableParamByName(L"Dot2Add.Validation.Input3")->m_floatTable;
+    std::vector<float> *validation_result = &handler.GetTableParamByName(L"Dot2Add.Validation.Expected1")->m_floatTable;
+
+    PCWSTR Validation_type = handler.GetTableParamByName(L"Dot2Add.Validation.Type")->m_str;
+    double tolerance = handler.GetTableParamByName(L"Dot2Add.Validation.Tolerance")->m_double;
+    size_t count = validation_input1->size();
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "Dot2AddOp",
+        // this callback is called when the test
+        // is creating the resource to run the test
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot2AddOp"));
+        size_t size = sizeof(SDot2AddOp) * count;
+        Data.resize(size);
+        SDot2AddOp *pPrimitives = (SDot2AddOp*)Data.data();
+        for (size_t i = 0; i < count; ++i) {
+            SDot2AddOp *p = &pPrimitives[i];
+            Half2 val1,val2;
+            VERIFY_SUCCEEDED(ParseDataToVectorHalf((*validation_input1)[i],
+                                                    (uint16_t *)&val1, 2));
+            VERIFY_SUCCEEDED(ParseDataToVectorHalf((*validation_input2)[i],
+                                                    (uint16_t *)&val2, 2));
+            p->input1 = val1;
+            p->input2 = val2;
+            p->acc = (*validation_acc)[i];
+        }
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Target = Target.m_psz;
+        pShaderOp->Shaders.at(0).Text = Text.m_psz;
+        pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
+    });
+
+    MappedData data;
+    test->Test->GetReadBackData("SDot2AddOp", &data);
+
+    SDot2AddOp *pPrimitives = (SDot2AddOp*)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve;
+    for (size_t i = 0; i < count; ++i) {
+        SDot2AddOp *p = &pPrimitives[i];
+        float expectedResult = (*validation_result)[i];
+        float input1x = ConvertFloat16ToFloat32(p->input1.x);
+        float input1y = ConvertFloat16ToFloat32(p->input1.y);
+        float input2x = ConvertFloat16ToFloat32(p->input2.x);
+        float input2y = ConvertFloat16ToFloat32(p->input2.y);
+        LogCommentFmt(
+            L"element #%u, input1 = (%f, %f), input2 = (%f, %f), acc = %f\n"
+            L"result = %f, result_expected = %f",
+            i, input1x, input1y, input2x, input2y, p->acc, p->result, expectedResult);
+        VerifyOutputWithExpectedValueFloat(p->result, expectedResult, Validation_type, tolerance);
+    }
+}
+
+// Helper for the Dot4AddI8Packed operator, which is part of DotTest
+void ExecutionTest::RunDot4AddI8PackedOp() {
+    WEX::Logging::Log::Comment(L"\nRunning Dot4AddI8Packed Op tests:\n");
+
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
+        return;
+    }
+
+    int tableSize = sizeof(Dot4AddI8PackedOpParameters) / sizeof(TableParameter);
+    TableParameterHandler handler(Dot4AddI8PackedOpParameters, tableSize);
+
+    CW2A Target(handler.GetTableParamByName(L"Dot4AddI8Packed.ShaderOp.Target")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"Dot4AddI8Packed.ShaderOp.Text")->m_str);
+
+    std::vector<uint32_t> *validation_input1 = &handler.GetTableParamByName(L"Dot4AddI8Packed.Validation.Input1")->m_uint32Table;
+    std::vector<uint32_t> *validation_input2 = &handler.GetTableParamByName(L"Dot4AddI8Packed.Validation.Input2")->m_uint32Table;
+    std::vector<int32_t> *validation_acc = &handler.GetTableParamByName(L"Dot4AddI8Packed.Validation.Input3")->m_int32Table;
+    std::vector<int32_t> *validation_result = &handler.GetTableParamByName(L"Dot4AddI8Packed.Validation.Expected1")->m_int32Table;
+
+    size_t count = validation_input1->size();
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "Dot4AddI8PackedOp",
+        // this callback is called when the test
+        // is creating the resource to run the test
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot4AddI8PackedOp"));
+        size_t size = sizeof(SDot4AddI8PackedOp) * count;
+        Data.resize(size);
+        SDot4AddI8PackedOp *pPrimitives = (SDot4AddI8PackedOp*)Data.data();
+        for (size_t i = 0; i < count; ++i) {
+            SDot4AddI8PackedOp *p = &pPrimitives[i];
+            p->input1 = (*validation_input1)[i];
+            p->input2 = (*validation_input2)[i];
+            p->acc = (*validation_acc)[i];
+        }
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Target = Target.m_psz;
+        pShaderOp->Shaders.at(0).Text = Text.m_psz;
+    });
+
+    MappedData data;
+    test->Test->GetReadBackData("SDot4AddI8PackedOp", &data);
+
+    SDot4AddI8PackedOp *pPrimitives = (SDot4AddI8PackedOp*)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve;
+    for (size_t i = 0; i < count; ++i) {
+        SDot4AddI8PackedOp *p = &pPrimitives[i];
+        int32_t expectedResult = (*validation_result)[i];
+        LogCommentFmt(
+            L"element #%u, input1 = %u, input2 = %u, acc = %d \n"
+            L"result = %d, result_expected = %d",
+            i, p->input1, p->input2, p->acc, p->result, expectedResult);
+        VerifyOutputWithExpectedValueInt(p->result, expectedResult, 0);
+    }
+}
+
+// Helper for the Dot4AddU8Packed operator, which is part of DotTest
+void ExecutionTest::RunDot4AddU8PackedOp() {
+    WEX::Logging::Log::Comment(L"\nRunning Dot4AddU8Packed Op tests\n");
+
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
+        return;
+    }
+
+    int tableSize = sizeof(Dot4AddU8PackedOpParameters) / sizeof(TableParameter);
+    TableParameterHandler handler(Dot4AddU8PackedOpParameters, tableSize);
+
+    CW2A Target(handler.GetTableParamByName(L"Dot4AddU8Packed.ShaderOp.Target")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"Dot4AddU8Packed.ShaderOp.Text")->m_str);
+
+    std::vector<uint32_t> *validation_input1 = &handler.GetTableParamByName(L"Dot4AddU8Packed.Validation.Input1")->m_uint32Table;
+    std::vector<uint32_t> *validation_input2 = &handler.GetTableParamByName(L"Dot4AddU8Packed.Validation.Input2")->m_uint32Table;
+    std::vector<uint32_t> *validation_acc = &handler.GetTableParamByName(L"Dot4AddU8Packed.Validation.Input3")->m_uint32Table;
+    std::vector<uint32_t> *validation_result = &handler.GetTableParamByName(L"Dot4AddU8Packed.Validation.Expected1")->m_uint32Table;
+
+    size_t count = validation_input1->size();
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "Dot4AddU8PackedOp",
+        // this callback is called when the test
+        // is creating the resource to run the test
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot4AddU8PackedOp"));
+        size_t size = sizeof(SDot4AddU8PackedOp) * count;
+        Data.resize(size);
+        SDot4AddU8PackedOp *pPrimitives = (SDot4AddU8PackedOp*)Data.data();
+        for (size_t i = 0; i < count; ++i) {
+            SDot4AddU8PackedOp *p = &pPrimitives[i];
+            p->input1 = (*validation_input1)[i];
+            p->input2 = (*validation_input2)[i];
+            p->acc = (*validation_acc)[i];
+        }
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Target = Target.m_psz;
+        pShaderOp->Shaders.at(0).Text = Text.m_psz;
+    });
+
+    MappedData data;
+    test->Test->GetReadBackData("SDot4AddU8PackedOp", &data);
+
+    SDot4AddU8PackedOp *pPrimitives = (SDot4AddU8PackedOp*)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve;
+    for (size_t i = 0; i < count; ++i) {
+        SDot4AddU8PackedOp *p = &pPrimitives[i];
+        uint32_t expectedResult = (*validation_result)[i];
+        LogCommentFmt(
+            L"element #%u, input1 = %u, input2 = %u, acc = %u \n"
+            L"result = %u, result_expected = %u, ",
+            i, p->input1, p->input2, p->acc, p->result, expectedResult);
+        VerifyOutputWithExpectedValueUInt(p->result, expectedResult, 0);
+    }
+}
+
 TEST_F(ExecutionTest, Msad4Test) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
@@ -5873,104 +6251,358 @@ TEST_F(ExecutionTest, BarycentricsTest) {
     //SavePixelsToFile(pPixels, DXGI_FORMAT_R32G32B32A32_FLOAT, width, height, L"barycentric.bmp");
 }
 
+static const char RawBufferTestShaderDeclarations[] =
+"// Note: COMPONENT_TYPE and COMPONENT_SIZE will be defined via compiler option -D\r\n"
+"typedef COMPONENT_TYPE scalar; \r\n"
+"typedef vector<COMPONENT_TYPE, 2> vector2; \r\n"
+"typedef vector<COMPONENT_TYPE, 3> vector3; \r\n"
+"typedef vector<COMPONENT_TYPE, 4> vector4; \r\n"
+"\r\n"
+"struct TestData { \r\n"
+"  scalar  v1; \r\n"
+"  vector2 v2; \r\n"
+"  vector3 v3; \r\n"
+"  vector4 v4; \r\n"
+"}; \r\n"
+"\r\n"
+"struct UavData {\r\n"
+"  TestData input; \r\n"
+"  TestData output; \r\n"
+"  TestData srvOut; \r\n"
+"}; \r\n"
+"\r\n"
+"ByteAddressBuffer           srv0 : register(t0); \r\n"
+"StructuredBuffer<TestData>  srv1 : register(t1); \r\n"
+"ByteAddressBuffer           srv2 : register(t2); \r\n"
+"StructuredBuffer<TestData>  srv3 : register(t3); \r\n"
+"\r\n"
+"RWByteAddressBuffer         uav0 : register(u0); \r\n"
+"RWStructuredBuffer<UavData> uav1 : register(u1); \r\n"
+"RWByteAddressBuffer         uav2 : register(u2); \r\n"
+"RWStructuredBuffer<UavData> uav3 : register(u3); \r\n";
+
+static const char RawBufferTestShaderBody[] =
+"  // offset of 'out' in 'UavData'\r\n"
+"  const int out_offset = COMPONENT_SIZE * 10; \r\n"
+"\r\n"
+"  // offset of 'srv_out' in 'UavData'\r\n"
+"  const int srv_out_offset = COMPONENT_SIZE * 10 * 2; \r\n"
+"\r\n"
+"  // offsets within the 'Data' struct\r\n"
+"  const int v1_offset = 0; \r\n"
+"  const int v2_offset = COMPONENT_SIZE; \r\n"
+"  const int v3_offset = COMPONENT_SIZE * 3; \r\n"
+"  const int v4_offset = COMPONENT_SIZE * 6; \r\n"
+"\r\n"
+"  uav0.Store(srv_out_offset + v1_offset, srv0.Load<scalar>(v1_offset)); \r\n"
+"  uav0.Store(srv_out_offset + v2_offset, srv0.Load<vector2>(v2_offset)); \r\n"
+"  uav0.Store(srv_out_offset + v3_offset, srv0.Load<vector3>(v3_offset)); \r\n"
+"  uav0.Store(srv_out_offset + v4_offset, srv0.Load<vector4>(v4_offset)); \r\n"
+"\r\n"
+"  uav1[0].srvOut.v1 = srv1[0].v1; \r\n"
+"  uav1[0].srvOut.v2 = srv1[0].v2; \r\n"
+"  uav1[0].srvOut.v3 = srv1[0].v3; \r\n"
+"  uav1[0].srvOut.v4 = srv1[0].v4; \r\n"
+"\r\n"
+"  uav2.Store(srv_out_offset + v1_offset, srv2.Load<scalar>(v1_offset)); \r\n"
+"  uav2.Store(srv_out_offset + v2_offset, srv2.Load<vector2>(v2_offset)); \r\n"
+"  uav2.Store(srv_out_offset + v3_offset, srv2.Load<vector3>(v3_offset)); \r\n"
+"  uav2.Store(srv_out_offset + v4_offset, srv2.Load<vector4>(v4_offset)); \r\n"
+"\r\n"
+"  uav3[0].srvOut.v1 = srv3[0].v1; \r\n"
+"  uav3[0].srvOut.v2 = srv3[0].v2; \r\n"
+"  uav3[0].srvOut.v3 = srv3[0].v3; \r\n"
+"  uav3[0].srvOut.v4 = srv3[0].v4; \r\n"
+"\r\n"
+"  uav0.Store(out_offset + v1_offset, uav0.Load<scalar>(v1_offset)); \r\n"
+"  uav0.Store(out_offset + v2_offset, uav0.Load<vector2>(v2_offset)); \r\n"
+"  uav0.Store(out_offset + v3_offset, uav0.Load<vector3>(v3_offset)); \r\n"
+"  uav0.Store(out_offset + v4_offset, uav0.Load<vector4>(v4_offset)); \r\n"
+"\r\n"
+"  uav1[0].output.v1 = uav1[0].input.v1; \r\n"
+"  uav1[0].output.v2 = uav1[0].input.v2; \r\n"
+"  uav1[0].output.v3 = uav1[0].input.v3; \r\n"
+"  uav1[0].output.v4 = uav1[0].input.v4; \r\n"
+"\r\n"
+"  uav2.Store(out_offset + v1_offset, uav2.Load<scalar>(v1_offset)); \r\n"
+"  uav2.Store(out_offset + v2_offset, uav2.Load<vector2>(v2_offset)); \r\n"
+"  uav2.Store(out_offset + v3_offset, uav2.Load<vector3>(v3_offset)); \r\n"
+"  uav2.Store(out_offset + v4_offset, uav2.Load<vector4>(v4_offset)); \r\n"
+"\r\n"
+"  uav3[0].output.v1 = uav3[0].input.v1; \r\n"
+"  uav3[0].output.v2 = uav3[0].input.v2; \r\n"
+"  uav3[0].output.v3 = uav3[0].input.v3; \r\n"
+"  uav3[0].output.v4 = uav3[0].input.v4; \r\n";
+
+
+static const char RawBufferTestComputeShaderTemplate[] =
+"%s\r\n" // <- RawBufferTestShaderDeclarations
+"[numthreads(1, 1, 1)]\r\n"
+"void main(uint GI : SV_GroupIndex) {\r\n"
+"%s\r\n" // <- RawBufferTestShaderBody
+"};";
+
+static const char RawBufferTestGraphicsPixelShaderTemplate[] =
+"%s\r\n" // <- RawBufferTestShaderDeclarations
+"struct PSInput { \r\n"
+"  float4 pos : SV_POSITION; \r\n"
+"}; \r\n"
+"uint4 main(PSInput input) : SV_TARGET{ \r\n"
+"  if (input.pos.x + input.pos.y == 1.0f) { // pixel { 0.5, 0.5, 0 } \r\n"
+"%s\r\n" // <- RawBufferTestShaderBody
+"  } \r\n"
+"  return uint4(1, 2, 3, 4); \r\n"
+"};";
+
 TEST_F(ExecutionTest, ComputeRawBufferLdStI32) {
-  RunRawBufferLdStTest(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I32, "ComputeRawBufferLdStI32");
+  RawBufferLdStTestData<int32_t> data = { { 1 }, { 2, -1 }, { 256, -10517, 980 }, { 465, 13, -89, MAXUINT32 / 2 } };
+  RunComputeRawBufferLdStTest<int32_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I32, "ComputeRawBufferLdSt32Bit", data);
 }
 
-TEST_F(ExecutionTest,    ComputeRawBufferLdStFloat)  {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Float, "ComputeRawBufferLdStFloat");
+TEST_F(ExecutionTest, ComputeRawBufferLdStFloat)  {
+  RawBufferLdStTestData<float> data = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, -105.17f, 980.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
+  RunComputeRawBufferLdStTest<float>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Float, "ComputeRawBufferLdSt32Bit", data);
 }
 
 TEST_F(ExecutionTest,  ComputeRawBufferLdStI64)  {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_3, RawBufferLdStType::I64, "ComputeRawBufferLdStI64");
+  RawBufferLdStTestData<int64_t> data = { { 1 }, { 2, -1 }, { 256, -105171532, 980 }, { 465, 13, -89, MAXUINT64 / 2 } };
+  RunComputeRawBufferLdStTest<int64_t>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::I64, "ComputeRawBufferLdSt64Bit", data);
 }
 
 TEST_F(ExecutionTest,  ComputeRawBufferLdStDouble)  {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_3, RawBufferLdStType::Double, "ComputeRawBufferLdStDouble");
+  RawBufferLdStTestData<double> data = { { 3e-10 }, { 1.5, -1.99988 }, { 256.0, -105.17, 980.0 }, { 465.1652, -1.5694e2, -0.8543e-2, 1333.5 } };
+  RunComputeRawBufferLdStTest<double>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::Double, "ComputeRawBufferLdSt64Bit", data); // Double, not I64: selects the double support check and COMPONENT_TYPE=double
 }
 
 TEST_F(ExecutionTest, ComputeRawBufferLdStI16) {
-  RunRawBufferLdStTest(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I16, "ComputeRawBufferLdStI16");
+  RawBufferLdStTestData<int16_t> data = { { 1 }, { 2, -1 }, { 256, -10517, 980 }, { 465, 13, -89, MAXUINT16 / 2 } };
+  RunComputeRawBufferLdStTest<int16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I16, "ComputeRawBufferLdSt16Bit", data);
 }
 
 TEST_F(ExecutionTest,  ComputeRawBufferLdStHalf)  {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Half, "ComputeRawBufferLdStHalf");
+  RawBufferLdStTestData<float> floatData = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, 105.17f, 980.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
+  RawBufferLdStTestData<uint16_t> halfData;
+  for (int i = 0; i < sizeof(floatData)/sizeof(float); i++) {
+    ((uint16_t*)&halfData)[i] = ConvertFloat32ToFloat16(((float*)&floatData)[i]);
+  }
+  RunComputeRawBufferLdStTest<uint16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Half, "ComputeRawBufferLdSt16Bit", halfData);
 }
 
 TEST_F(ExecutionTest,  GraphicsRawBufferLdStI32)  {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I32, "GraphicsRawBufferLdStI32");
+  RawBufferLdStTestData<int32_t> data = { { 1 }, { 2, -1 }, { 256, -10517, 980 }, { 465, 13, -89, MAXUINT32 / 2 } };
+  RunGraphicsRawBufferLdStTest<int32_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I32, "GraphicsRawBufferLdSt32Bit", data);
 }
 
 TEST_F(ExecutionTest,  GraphicsRawBufferLdStFloat)  {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Float, "GraphicsRawBufferLdStFloat");
+  RawBufferLdStTestData<float> data = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, -105.17f, 980.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
+  RunGraphicsRawBufferLdStTest<float>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Float, "GraphicsRawBufferLdSt32Bit", data);
 }
 
 TEST_F(ExecutionTest,  GraphicsRawBufferLdStI64)  {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_3, RawBufferLdStType::I64, "GraphicsRawBufferLdStI64");
+  RawBufferLdStTestData<int64_t> data = { { 1 }, { 2, -1 }, { 256, -105171532, 980 }, { 465, 13, -89, MAXUINT64 / 2 } };
+  RunGraphicsRawBufferLdStTest<int64_t>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::I64, "GraphicsRawBufferLdSt64Bit", data);
 }
 
 TEST_F(ExecutionTest,  GraphicsRawBufferLdStDouble)  {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_3, RawBufferLdStType::Double, "GraphicsRawBufferLdStDouble");
+  RawBufferLdStTestData<double> data = { { 3e-10 }, { 1.5, -1.99988 }, { 256.0, -105.17, 980.0 }, { 465.1652, -1.5694e2, -0.8543e-2, 1333.5 } };
+  RunGraphicsRawBufferLdStTest<double>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::Double, "GraphicsRawBufferLdSt64Bit", data);
 }
 
 TEST_F(ExecutionTest, GraphicsRawBufferLdStI16) {
-  RunRawBufferLdStTest(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I16, "GraphicsRawBufferLdStI16");
+  RawBufferLdStTestData<int16_t> data = { { 1 }, { 2, -1 }, { 256, -10517, 980 }, { 465, 13, -89, MAXUINT16 / 2 } };
+  RunGraphicsRawBufferLdStTest<int16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I16, "GraphicsRawBufferLdSt16Bit", data);
 }
 
 TEST_F(ExecutionTest, GraphicsRawBufferLdStHalf) {
-   RunRawBufferLdStTest(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Half, "GraphicsRawBufferLdStHalf");
+  RawBufferLdStTestData<float> floatData = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, 105.17f, 0.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
+  RawBufferLdStTestData<uint16_t> halfData;
+  for (int i = 0; i < sizeof(floatData) / sizeof(float); i++) {
+    ((uint16_t*)&halfData)[i] = ConvertFloat32ToFloat16(((float*)&floatData)[i]);
+  }
+  RunGraphicsRawBufferLdStTest<uint16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Half, "GraphicsRawBufferLdSt16Bit", halfData);
 }
 
-void ExecutionTest::RunRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType, char *shaderOpName) {
-   WEX::TestExecution::SetVerifyOutput verifySettings(
-   WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+// Creates the device, checks support for dataType, and picks the HLSL type name and extra compiler options.
+bool ExecutionTest::SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
+                                           CComPtr<ID3D12Device> &pDevice, CComPtr<IStream> &pStream,
+                                           char *&sTy, char *&additionalOptions) {
+  if (!CreateDevice(&pDevice, shaderModel)) {
+    return false;
+  }
+
+  additionalOptions = ""; // only the 16-bit types below need an extra compiler switch
+  switch (dataType) {
+  case RawBufferLdStType::I64:
+    if (!DoesDeviceSupportInt64(pDevice)) {
+      WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
+      WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+      return false;
+    }
+    sTy = "int64_t";
+    break;
+  case RawBufferLdStType::Double:
+    if (!DoesDeviceSupportDouble(pDevice)) {
+      WEX::Logging::Log::Comment(L"Device does not support double operations.");
+      WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+      return false;
+    }
+    sTy = "double";
+    break;
+  case RawBufferLdStType::I16:
+  case RawBufferLdStType::Half:
+    if (!DoesDeviceSupportNative16bitOps(pDevice)) {
+      WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
+      WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+      return false;
+    }
+    additionalOptions = "-enable-16bit-types";
+    sTy = (dataType == RawBufferLdStType::I16 ? "int16_t" : "half");
+    break;
+  case RawBufferLdStType::I32:
+    sTy = "int32_t";
+    break;
+  case RawBufferLdStType::Float:
+    sTy = "float";
+    break;
+  default:
+    DXASSERT_NOMSG(false && "Invalid RawBufferLdStType"); // a bare string literal is always true and would never fire
+    return false; // sTy was never assigned; callers must not go on to format it
+  }
+
+  // read shader config
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+  return true;
+}
+
+template <class Ty>
+void ExecutionTest::VerifyRawBufferLdStTestResults(const std::shared_ptr<st::ShaderOpTest> test, const RawBufferLdStTestData<Ty> &testData) {
+  // read buffers back & verify expected values
+  static const int UavBufferCount = 4;
+  char bufferName[11] = "UAVBufferX";
+
+  for (unsigned i = 0; i < UavBufferCount; i++) {
+    MappedData dataUav;
+    RawBufferLdStUavData<Ty> *pOutData;
+
+    bufferName[sizeof(bufferName) - 2] = (char)(i + '0');
+
+    test->GetReadBackData(bufferName, &dataUav);
+    VERIFY_ARE_EQUAL(sizeof(RawBufferLdStUavData<Ty>), dataUav.size());
+    pOutData = (RawBufferLdStUavData<Ty> *)dataUav.data();
+
+    LogCommentFmt(L"Verifying UAVBuffer%d Load -> UAVBuffer%d Store", i, i);
+    // scalar
+    VERIFY_ARE_EQUAL(pOutData->output.v1, testData.v1);
+    // vector 2
+    VERIFY_ARE_EQUAL(pOutData->output.v2[0], testData.v2[0]);
+    VERIFY_ARE_EQUAL(pOutData->output.v2[1], testData.v2[1]);
+    // vector 3
+    VERIFY_ARE_EQUAL(pOutData->output.v3[0], testData.v3[0]);
+    VERIFY_ARE_EQUAL(pOutData->output.v3[1], testData.v3[1]);
+    VERIFY_ARE_EQUAL(pOutData->output.v3[2], testData.v3[2]);
+    // vector 4
+    VERIFY_ARE_EQUAL(pOutData->output.v4[0], testData.v4[0]);
+    VERIFY_ARE_EQUAL(pOutData->output.v4[1], testData.v4[1]);
+    VERIFY_ARE_EQUAL(pOutData->output.v4[2], testData.v4[2]);
+    VERIFY_ARE_EQUAL(pOutData->output.v4[3], testData.v4[3]);
+
+    // verify SRV Store
+    LogCommentFmt(L"Verifying SRVBuffer%d Load -> UAVBuffer%d Store", i, i);
+    // scalar
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v1, testData.v1);
+    // vector 2
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v2[0], testData.v2[0]);
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v2[1], testData.v2[1]);
+    // vector 3
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v3[0], testData.v3[0]);
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v3[1], testData.v3[1]);
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v3[2], testData.v3[2]);
+    // vector 4
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v4[0], testData.v4[0]);
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v4[1], testData.v4[1]);
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v4[2], testData.v4[2]);
+    VERIFY_ARE_EQUAL(pOutData->srvOut.v4[3], testData.v4[3]);
+  }
+}
+
+template <class Ty>
+void ExecutionTest::RunComputeRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType, 
+                                                const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData) {
+   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
 
    CComPtr<ID3D12Device> pDevice;
-   if (!CreateDevice(&pDevice, shaderModel)) {
-     return;
-   }
-
-   switch (dataType) {
-   case RawBufferLdStType::I64:
-     if (!DoesDeviceSupportInt64(pDevice)) {
-       WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
-       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
-       return;
-     }
-     break;
-   case RawBufferLdStType::Double:
-     if (!DoesDeviceSupportDouble(pDevice)) {
-       WEX::Logging::Log::Comment(L"Device does not support double operations.");
-       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
-       return;
-     }
-     break;
-   case RawBufferLdStType::I16:
-   case RawBufferLdStType::Half:
-     if (!DoesDeviceSupportNative16bitOps(pDevice)) {
-       WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
-       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
-       return;
-     }
-     break;
-   case RawBufferLdStType::I32:
-   case RawBufferLdStType::Float:
-     break;
-   default:
-     DXASSERT_NOMSG("Invalid RawBufferLdStType");
-   }
-
-   if (shaderOpName == nullptr) {
-     // TODO: finish up all variations
-     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
-     return;
-   }
-
    CComPtr<IStream> pStream;
-   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+   char *sTy, *additionalOptions;
 
-   // TODO: finish up
-   WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
+   if (!SetupRawBufferLdStTest(shaderModel, dataType, pDevice, pStream, sTy, additionalOptions)) {
+     return;
+   }
+
+   // format shader source
+   char rawBufferTestShaderText[sizeof(RawBufferTestComputeShaderTemplate) + sizeof(RawBufferTestShaderDeclarations) + sizeof(RawBufferTestShaderBody)];
+   VERIFY_IS_TRUE(sprintf_s(rawBufferTestShaderText, sizeof(rawBufferTestShaderText), 
+                            RawBufferTestComputeShaderTemplate, RawBufferTestShaderDeclarations, RawBufferTestShaderBody) != -1);
+
+   // format compiler args
+   char compilerOptions[256];
+   VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions), "-D COMPONENT_TYPE=%s -D COMPONENT_SIZE=%d %s", sTy, (int)sizeof(Ty), additionalOptions) != -1);
+
+   // run the shader
+   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, shaderOpName,
+     [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+     VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) || (0 == strncmp(Name, "UAVBuffer", 9))) &&
+                    (Name[9] >= '0' && Name[9] <= '3'));
+     pShaderOp->Shaders.at(0).Arguments = compilerOptions;
+     pShaderOp->Shaders.at(0).Text = rawBufferTestShaderText;
+
+     VERIFY_IS_TRUE(sizeof(RawBufferLdStTestData<Ty>) <= Data.size());
+     RawBufferLdStTestData<Ty> *pInData = (RawBufferLdStTestData<Ty>*)Data.data();
+     memcpy(pInData, &testData, sizeof(RawBufferLdStTestData<Ty>));
+   });
+
+   // verify expected values
+   VerifyRawBufferLdStTestResults<Ty>(test->Test, testData);
+}
+
+template <class Ty>
+void ExecutionTest::RunGraphicsRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
+                                                 const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData) {
+
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  CComPtr<ID3D12Device> pDevice;
+  CComPtr<IStream> pStream;
+  char *sTy, *additionalOptions;
+
+  if (!SetupRawBufferLdStTest(shaderModel, dataType, pDevice, pStream, sTy, additionalOptions)) {
+    return;
+  }
+
+  // format shader source
+  char rawBufferTestPixelShaderText[sizeof(RawBufferTestGraphicsPixelShaderTemplate) + sizeof(RawBufferTestShaderDeclarations) + sizeof(RawBufferTestShaderBody)];
+  VERIFY_IS_TRUE(sprintf_s(rawBufferTestPixelShaderText, sizeof(rawBufferTestPixelShaderText),
+                           RawBufferTestGraphicsPixelShaderTemplate, RawBufferTestShaderDeclarations, RawBufferTestShaderBody) != -1);
+
+  // format compiler args
+  char compilerOptions[256];
+  VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions), "-D COMPONENT_TYPE=%s -D COMPONENT_SIZE=%d %s", sTy, (int)sizeof(Ty), additionalOptions) != -1);
+
+  // run the shader
+  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, shaderOpName,
+    [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+    VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) || (0 == strncmp(Name, "UAVBuffer", 9))) &&
+      (Name[9] >= '0' && Name[9] <= '3'));
+    // pixel shader is at index 1, vertex shader at index 0
+    pShaderOp->Shaders.at(1).Arguments = compilerOptions;
+    pShaderOp->Shaders.at(1).Text = rawBufferTestPixelShaderText;
+
+    VERIFY_IS_TRUE(sizeof(RawBufferLdStTestData<Ty>) <= Data.size());
+    RawBufferLdStTestData<Ty> *pInData = (RawBufferLdStTestData<Ty>*)Data.data();
+    memcpy(pInData, &testData, sizeof(RawBufferLdStTestData<Ty>));
+  });
+
+  // verify expected values
+  VerifyRawBufferLdStTestResults<Ty>(test->Test, testData);
 }
 
 #ifndef _HLK_CONF
diff --git a/tools/clang/unittests/HLSL/ExtensionTest.cpp b/tools/clang/unittests/HLSL/ExtensionTest.cpp
index ea7737f09..d70e7655c 100644
--- a/tools/clang/unittests/HLSL/ExtensionTest.cpp
+++ b/tools/clang/unittests/HLSL/ExtensionTest.cpp
@@ -111,6 +111,12 @@ static const HLSL_INTRINSIC_ARGUMENT TestIBFE[] = {
   { "val",    AR_QUAL_IN, 1, LITEMPLATE_SCALAR, 1, LICOMPTYPE_UINT, 1, 1},
 };
 
+// float2 = MySamplerOp(uint2 addr)
+static const HLSL_INTRINSIC_ARGUMENT TestMySamplerOp[] = {
+  { "MySamplerOp", AR_QUAL_OUT, 0, LITEMPLATE_VECTOR, 0, LICOMPTYPE_FLOAT, 1, 2 },
+  { "addr", AR_QUAL_IN, 1, LITEMPLATE_VECTOR, 1, LICOMPTYPE_UINT, 1, 2},
+};
+
 struct Intrinsic {
   LPCWSTR hlslName;
   const char *dxilName;
@@ -146,6 +152,11 @@ Intrinsic BufferIntrinsics[] = {
   {L"MyBufferOp",   "MyBufferOp",      "m", { 12, false, true, -1, countof(TestMyBufferOp), TestMyBufferOp}},
 };
 
+// Test adding a method to an object that normally has no methods (SamplerState will do).
+Intrinsic SamplerIntrinsics[] = {
+  {L"MySamplerOp",   "MySamplerOp",    "m", { 15, false, true, -1, countof(TestMySamplerOp), TestMySamplerOp}},
+};
+
 class IntrinsicTable {
 public:
   IntrinsicTable(const wchar_t *ns, Intrinsic *begin, Intrinsic *end)
@@ -214,6 +225,7 @@ public:
   TestIntrinsicTable() : m_dwRef(0) { 
     m_tables.push_back(IntrinsicTable(L"",       std::begin(Intrinsics), std::end(Intrinsics)));
     m_tables.push_back(IntrinsicTable(L"Buffer", std::begin(BufferIntrinsics), std::end(BufferIntrinsics)));
+    m_tables.push_back(IntrinsicTable(L"SamplerState", std::begin(SamplerIntrinsics), std::end(SamplerIntrinsics)));
   }
   DXC_MICROCOM_ADDREF_RELEASE_IMPL(m_dwRef)
   HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void** ppvObject) override {
@@ -441,6 +453,7 @@ public:
   TEST_METHOD(DxilLoweringVector1)
   TEST_METHOD(DxilLoweringVector2)
   TEST_METHOD(DxilLoweringScalar)
+  TEST_METHOD(SamplerExtensionIntrinsic)
 };
 
 TEST_F(ExtensionTest, DefineWhenRegisteredThenPreserved) {
@@ -840,3 +853,30 @@ TEST_F(ExtensionTest, DxilLoweringScalar) {
     disassembly.npos !=
     disassembly.find("call i32 @dx.op.tertiary.i32(i32 51"));
 }
+
+TEST_F(ExtensionTest, SamplerExtensionIntrinsic) {
+  // Test adding methods to objects that don't have any methods normally,
+  // and therefore have null default intrinsic table.
+  Compiler c(m_dllSupport);
+  c.RegisterIntrinsicTable(new TestIntrinsicTable());
+  auto result = c.Compile(
+    "SamplerState samp;"
+    "float2 main(uint2 v1 : V1) : SV_Target {\n"
+    "  return samp.MySamplerOp(uint2(1, 2));\n"
+    "}\n",
+    { L"/Vd" }, {}
+  );
+  CheckOperationResultMsgs(result, {}, true, false);
+  std::string disassembly = c.Disassemble();
+
+  // Things to check
+  // - works when SamplerState normally has no methods
+  // - return type is translated to dx.types.ResRet
+  // - buffer is translated to dx.types.Handle
+  // - vector is exploded
+  LPCSTR expected[] = {
+    "call %dx.types.ResRet.f32 @MySamplerOp\\(i32 15, %dx.types.Handle %.*, i32 1, i32 2\\)"
+  };
+  CheckMsgs(disassembly.c_str(), disassembly.length(), expected, 1, true);
+}
+
diff --git a/tools/clang/unittests/HLSL/FileCheckerTest.cpp b/tools/clang/unittests/HLSL/FileCheckerTest.cpp
index d0ff77d9a..4c846f1a6 100644
--- a/tools/clang/unittests/HLSL/FileCheckerTest.cpp
+++ b/tools/clang/unittests/HLSL/FileCheckerTest.cpp
@@ -41,12 +41,15 @@
 using namespace std;
 using namespace hlsl_test;
 
-static std::string strltrim(std::string value) {
-  return value.erase(0, value.find_first_not_of(" \t\r\n"));
+static constexpr char whitespaceChars[] = " \t\r\n";
+
+static std::string strltrim(const std::string &value) { // returns value without its leading whitespace
+  size_t first = value.find_first_not_of(whitespaceChars);
+  return first == string::npos ? std::string() : value.substr(first); // npos: nothing but whitespace, so the result is empty
+}
 
-static std::string strrtrim(std::string value) {
-  size_t last = value.find_last_not_of(" \t\r\n");
+static std::string strrtrim(const std::string &value) {
+  size_t last = value.find_last_not_of(whitespaceChars);
   return last == string::npos ? value : value.substr(0, last + 1);
 }
 
@@ -54,429 +57,431 @@ static std::string strtrim(const std::string &value) {
   return strltrim(strrtrim(value));
 }
 
-static string trim(string value) {
-  size_t leading = value.find_first_not_of(' ');
-  if (leading != std::string::npos) {
-    value.erase(0, leading);
+static bool strstartswith(const std::string& value, const char* pattern) {
+  for (size_t i = 0; ; ++i) {
+    if (pattern[i] == '\0') return true;
+    if (i == value.size() || value[i] != pattern[i]) return false;
   }
-  size_t trailing = value.find_last_not_of(' ');
-  if (leading != std::string::npos) {
-    value.erase(trailing + 1);
-  }
-  return value;
 }
 
-    FileRunCommandPart::FileRunCommandPart(const std::string &command, const std::string &arguments, LPCWSTR commandFileName) :
-      Command(command), Arguments(arguments), CommandFileName(commandFileName) { }
-    FileRunCommandPart::FileRunCommandPart(FileRunCommandPart && other) :
-      Command(std::move(other.Command)),
-      Arguments(std::move(other.Arguments)),
-      CommandFileName(other.CommandFileName),
-      RunResult(other.RunResult),
-      StdOut(std::move(other.StdOut)),
-      StdErr(std::move(other.StdErr)) { }
+static std::vector<std::string> strtok(const std::string &value, const char *delimiters = whitespaceChars) {
+  // Splits value into the maximal runs of non-delimiter characters, in order.
+  std::vector<std::string> pieces;
+  for (size_t begin = value.find_first_not_of(delimiters); begin != std::string::npos;) {
+    const size_t hit = value.find_first_of(delimiters, begin);
+    // No delimiter after begin: the last piece runs to the end of value.
+    const size_t end = (hit == std::string::npos) ? value.size() : hit;
+    pieces.push_back(value.substr(begin, end - begin));
+    // Skip the delimiter run; npos here ends the loop.
+    begin = value.find_first_not_of(delimiters, end);
+  }
+  return pieces;
+}
 
-    void FileRunCommandPart::Run(const FileRunCommandPart *Prior) {
-      bool isFileCheck =
-        0 == _stricmp(Command.c_str(), "FileCheck") ||
-        0 == _stricmp(Command.c_str(), "%FileCheck");
-      // For now, propagate errors.
-      if (Prior && Prior->RunResult) {
-        if (isFileCheck) {
-          RunFileChecker(Prior);
-        } else {
-          StdErr = Prior->StdErr;
-          RunResult = Prior->RunResult;
-        }
-        return;
-      }
+FileRunCommandPart::FileRunCommandPart(const std::string &command, const std::string &arguments, LPCWSTR commandFileName) :
+  Command(command), Arguments(arguments), CommandFileName(commandFileName) { }
 
-      // We would add support for 'not' and 'llc' here.
-      if (isFileCheck) {
-        RunFileChecker(Prior);
-      }
-      else if (0 == _stricmp(Command.c_str(), "StdErrCheck")) {
-        RunStdErrChecker(Prior);
-      }
-      else if (0 == _stricmp(Command.c_str(), "tee")) {
-        RunTee(Prior);
-      }
-      else if (0 == _stricmp(Command.c_str(), "%dxc")) {
-        RunDxc(Prior);
-      }
-      else if (0 == _stricmp(Command.c_str(), "%dxv")) {
-        RunDxv(Prior);
-      }
-      else if (0 == _stricmp(Command.c_str(), "%opt")) {
-        RunOpt(Prior);
-      }
-      else if (0 == _stricmp(Command.c_str(), "%D3DReflect")) {
-        RunD3DReflect(Prior);
-      }
-      else {
-        RunResult = 1;
-        StdErr = "Unrecognized command ";
-        StdErr += Command;
-      }
+void FileRunCommandPart::Run(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior) {
+  bool isFileCheck =
+    0 == _stricmp(Command.c_str(), "FileCheck") ||
+    0 == _stricmp(Command.c_str(), "%FileCheck");
+  bool isXFail = 0 == _stricmp(Command.c_str(), "xfail");
+
+  // For now, propagate errors.
+  if (Prior && Prior->RunResult && !isFileCheck && !isXFail) {
+    StdErr = Prior->StdErr;
+    RunResult = Prior->RunResult;
+    return;
+  }
+
+  // We would add support for 'not' and 'llc' here.
+  if (isFileCheck) {
+    RunFileChecker(Prior);
+  }
+  else if (isXFail) {
+    RunXFail(Prior);
+  }
+  else if (0 == _stricmp(Command.c_str(), "tee")) {
+    RunTee(Prior);
+  }
+  else if (0 == _stricmp(Command.c_str(), "%dxc")) {
+    RunDxc(DllSupport, Prior);
+  }
+  else if (0 == _stricmp(Command.c_str(), "%dxv")) {
+    RunDxv(DllSupport, Prior);
+  }
+  else if (0 == _stricmp(Command.c_str(), "%opt")) {
+    RunOpt(DllSupport, Prior);
+  }
+  else if (0 == _stricmp(Command.c_str(), "%D3DReflect")) {
+    RunD3DReflect(DllSupport, Prior);
+  }
+  else {
+    RunResult = 1;
+    StdErr = "Unrecognized command ";
+    StdErr += Command;
+  }
+}
+
+void FileRunCommandPart::RunFileChecker(const FileRunCommandPart *Prior) {
+  if (!Prior) {
+    StdErr = "Prior command required to generate stdin";
+    RunResult = 1;
+    return;
+  }
+
+  FileCheckForTest t;
+  t.CheckFilename = CW2A(CommandFileName, CP_UTF8);
+  if (Prior->RunResult)
+    t.InputForStdin = Prior->StdErr;
+  else
+    t.InputForStdin = Prior->StdOut;
+
+  // Parse command arguments
+  static constexpr char checkPrefixStr[] = "-check-prefix=";
+  bool hasInputFilename = false;
+  for (const std::string& arg : strtok(Arguments)) {
+    if (arg == "%s") hasInputFilename = true;
+    else if (arg == "-input=stderr") t.InputForStdin = Prior->StdErr;
+    else if (strstartswith(arg, checkPrefixStr))
+      t.CheckPrefixes.emplace_back(arg.substr(sizeof(checkPrefixStr) - 1));
+    else {
+      StdErr = "Invalid argument";
+      RunResult = 1;
+      return;
     }
+  }
+  if (!hasInputFilename) {
+    StdErr = "Missing input filename";
+    RunResult = 1;
+    return;
+  }
 
-    void FileRunCommandPart::RunFileChecker(const FileRunCommandPart *Prior) {
-      std::string args(strtrim(Arguments));
-      if (args != "%s") {
-        StdErr = "Only supported pattern is a plain input file";
-        RunResult = 1;
-        return;
-      }
-      if (!Prior) {
-        StdErr = "Prior command required to generate stdin";
-        RunResult = 1;
-        return;
-      }
+  // Run
+  RunResult = t.Run();
 
-      CW2A fileName(CommandFileName, CP_UTF8);
-      FileCheckForTest t;
-      t.CheckFilename = fileName;
-      if (Prior->RunResult)
-        t.InputForStdin = Prior->StdErr;
+  StdOut = t.test_outs;
+  StdErr = t.test_errs;
+  // Capture the input as well.
+  if (RunResult != 0 && Prior != nullptr) {
+    StdErr += "\n<full input to FileCheck>\n";
+    StdErr += t.InputForStdin;
+  }
+}
+
+void FileRunCommandPart::ReadOptsForDxc(hlsl::options::MainArgs &argStrings,
+                                        hlsl::options::DxcOpts &Opts) { // parses Arguments into argStrings/Opts; on failure sets RunResult and StdErr
+  std::string args(strtrim(Arguments));
+  const char *inputPos = strstr(args.c_str(), "%s"); // first "%s", the input-file argument
+  if (inputPos == nullptr) {
+    StdErr = "Only supported pattern includes input file as argument";
+    RunResult = 1;
+    return;
+  }
+  args.erase(inputPos - args.c_str(), strlen("%s")); // remove the "%s" itself; the rest is parsed as options
+
+  llvm::StringRef argsRef = args;
+  llvm::SmallVector<llvm::StringRef, 8> splitArgs;
+  argsRef.split(splitArgs, " "); // options are separated by single spaces
+  argStrings = hlsl::options::MainArgs(splitArgs);
+  std::string errorString;
+  llvm::raw_string_ostream errorStream(errorString);
+  RunResult = ReadDxcOpts(hlsl::options::getHlslOptTable(), /*flagsToInclude*/ 0,
+                          argStrings, Opts, errorStream);
+  errorStream.flush(); // flush before errorString is read below
+  if (RunResult) {
+    StdErr = errorString;
+  }
+}
+
+void FileRunCommandPart::RunDxc(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior) { // Compiles CommandFileName with the parsed options; fills StdOut, StdErr, RunResult and OpResult.
+  // Support piping stdin from prior if needed.
+  UNREFERENCED_PARAMETER(Prior);
+  hlsl::options::MainArgs args;
+  hlsl::options::DxcOpts opts;
+  ReadOptsForDxc(args, opts);
+  if (RunResult)  // opt parsing already failed; bail out before touching opts
+    return;
+
+  std::wstring entry =
+      Unicode::UTF8ToUTF16StringOrThrow(opts.EntryPoint.str().c_str());
+  std::wstring profile =
+      Unicode::UTF8ToUTF16StringOrThrow(opts.TargetProfile.str().c_str());
+  std::vector<LPCWSTR> flags;
+  if (opts.CodeGenHighLevel) {
+    flags.push_back(L"-fcgl");
+  }
+
+  std::vector<std::wstring> argWStrings; // owns the storage that the flags entries below point into
+  CopyArgsToWStrings(opts.Args, hlsl::options::CoreOption, argWStrings);
+  for (const std::wstring &a : argWStrings)
+    flags.push_back(a.data());
+
+  CComPtr<IDxcLibrary> pLibrary;
+  CComPtr<IDxcCompiler> pCompiler;
+  CComPtr<IDxcOperationResult> pResult;
+  CComPtr<IDxcBlobEncoding> pSource;
+  CComPtr<IDxcBlobEncoding> pDisassembly;
+  CComPtr<IDxcBlob> pCompiledBlob;
+  CComPtr<IDxcIncludeHandler> pIncludeHandler;
+
+  HRESULT resultStatus;
+
+  IFT(DllSupport.CreateInstance(CLSID_DxcLibrary, &pLibrary));
+  IFT(pLibrary->CreateBlobFromFile(CommandFileName, nullptr, &pSource));
+  IFT(pLibrary->CreateIncludeHandler(&pIncludeHandler));
+  IFT(DllSupport.CreateInstance(CLSID_DxcCompiler, &pCompiler));
+  IFT(pCompiler->Compile(pSource, CommandFileName, entry.c_str(), profile.c_str(),
+                          flags.data(), flags.size(), nullptr, 0, pIncludeHandler, &pResult));
+  IFT(pResult->GetStatus(&resultStatus));
+  if (SUCCEEDED(resultStatus)) {
+    IFT(pResult->GetResult(&pCompiledBlob));
+    if (!opts.AstDump) {
+      IFT(pCompiler->Disassemble(pCompiledBlob, &pDisassembly));
+      StdOut = BlobToUtf8(pDisassembly);
+    } else {
+      StdOut = BlobToUtf8(pCompiledBlob); // with AstDump the result blob is used as text as-is
+    }
+    CComPtr<IDxcBlobEncoding> pStdErr;
+    IFT(pResult->GetErrorBuffer(&pStdErr));
+    StdErr = BlobToUtf8(pStdErr);
+    RunResult = 0;
+  }
+  else {
+    CComPtr<IDxcBlobEncoding> pErrors; // dedicated holder for the compiler diagnostics
+    IFT(pResult->GetErrorBuffer(&pErrors));
+    StdErr = BlobToUtf8(pErrors);
+    RunResult = resultStatus;
+  }
+
+  OpResult = pResult;
+}
+
+void FileRunCommandPart::RunDxv(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior) { // Assembles CommandFileName to a container and validates it; Prior is not read here.
+  std::string args(strtrim(Arguments));
+  const char *inputPos = strstr(args.c_str(), "%s"); // "%s" is the input-file placeholder
+  if (inputPos == nullptr) {
+    StdErr = "Only supported pattern includes input file as argument";
+    RunResult = 1;
+    return;
+  }
+  args.erase(inputPos - args.c_str(), strlen("%s"));
+
+  llvm::StringRef argsRef = args;
+  llvm::SmallVector<llvm::StringRef, 8> splitArgs;
+  argsRef.split(splitArgs, " ");
+  IFTMSG(splitArgs.size()==1, "wrong arg num for dxv"); // presumably rejects any argument besides the input file - verify how an empty remainder splits
+      
+  CComPtr<IDxcLibrary> pLibrary;
+  CComPtr<IDxcAssembler> pAssembler;
+  CComPtr<IDxcValidator> pValidator;
+  CComPtr<IDxcOperationResult> pResult;
+
+  CComPtr<IDxcBlobEncoding> pSource;
+
+  CComPtr<IDxcBlob> pContainerBlob;
+  HRESULT resultStatus;
+
+  IFT(DllSupport.CreateInstance(CLSID_DxcLibrary, &pLibrary));
+  IFT(pLibrary->CreateBlobFromFile(CommandFileName, nullptr, &pSource));
+  IFT(DllSupport.CreateInstance(CLSID_DxcAssembler, &pAssembler));
+  IFT(pAssembler->AssembleToContainer(pSource, &pResult));
+  IFT(pResult->GetStatus(&resultStatus));
+  if (FAILED(resultStatus)) { // assembly failed: report its diagnostics and stop
+    CComPtr<IDxcBlobEncoding> pAssembleBlob;
+    IFT(pResult->GetErrorBuffer(&pAssembleBlob));
+    StdErr = BlobToUtf8(pAssembleBlob);
+    RunResult = resultStatus;
+    return;
+  }
+  IFT(pResult->GetResult(&pContainerBlob));
+
+  IFT(DllSupport.CreateInstance(CLSID_DxcValidator, &pValidator));
+  CComPtr<IDxcOperationResult> pValidationResult;
+  IFT(pValidator->Validate(pContainerBlob, DxcValidatorFlags_InPlaceEdit,
+                            &pValidationResult));
+  IFT(pValidationResult->GetStatus(&resultStatus));
+  if (resultStatus) { // any non-zero validation status: copy the validator's messages to StdOut
+    CComPtr<IDxcBlobEncoding> pValidateBlob;
+    IFT(pValidationResult->GetErrorBuffer(&pValidateBlob));
+    StdOut = BlobToUtf8(pValidateBlob);
+  }
+  RunResult = 0; // NOTE(review): stays 0 even when validation reported errors - presumably so a later check command can match StdOut; confirm
+}
+
+void FileRunCommandPart::RunOpt(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior) { // Runs the optimizer on the command file ("%s") or, when "%s" is absent, on Prior's StdOut.
+  std::string args(strtrim(Arguments));
+  const char *inputPos = strstr(args.c_str(), "%s");
+  if (inputPos == nullptr && Prior == nullptr) { // no input file and nothing piped in
+    StdErr = "Only supported patterns are input file as argument or prior "
+              "command with disassembly";
+    RunResult = 1;
+    return;
+  }
+
+  CComPtr<IDxcLibrary> pLibrary;
+  CComPtr<IDxcOptimizer> pOptimizer;
+  CComPtr<IDxcBlobEncoding> pSource;
+  CComPtr<IDxcBlobEncoding> pOutputText;
+  CComPtr<IDxcBlob> pOutputModule;
+
+  IFT(DllSupport.CreateInstance(CLSID_DxcLibrary, &pLibrary));
+  IFT(DllSupport.CreateInstance(CLSID_DxcOptimizer, &pOptimizer));
+
+  if (inputPos != nullptr) {
+    args.erase(inputPos - args.c_str(), strlen("%s")); // drop the placeholder; the rest are optimizer options
+    IFT(pLibrary->CreateBlobFromFile(CommandFileName, nullptr, &pSource));
+  }
+  else {
+    assert(Prior != nullptr && "else early check should have returned");
+    // The prior command's StdOut is handed to the optimizer as-is, so no
+    // assembler instance is created on this path.
+    IFT(pLibrary->CreateBlobWithEncodingFromPinned(
+        Prior->StdOut.c_str(), Prior->StdOut.size(), CP_UTF8,
+        &pSource));
+  }
+
+  args = strtrim(args);
+  llvm::StringRef argsRef = args;
+  llvm::SmallVector<llvm::StringRef, 8> splitArgs;
+  argsRef.split(splitArgs, " ");
+  std::vector<LPCWSTR> options;
+  std::vector<std::wstring> optionStrings;
+  for (llvm::StringRef S : splitArgs) {
+    optionStrings.push_back(
+        Unicode::UTF8ToUTF16StringOrThrow(strtrim(S.str()).c_str()));
+  }
+
+  // Add the options outside the above loop in case the vector is resized.
+  for (const std::wstring& str : optionStrings)
+    options.push_back(str.c_str());
+
+  IFT(pOptimizer->RunOptimizer(pSource, options.data(), options.size(),
+                                &pOutputModule, &pOutputText));
+  StdOut = BlobToUtf8(pOutputText); // only the textual output is kept; pOutputModule is not used further
+  RunResult = 0;
+}
+
+void FileRunCommandPart::RunD3DReflect(dxc::DxcDllSupport &DllSupport, const FileRunCommandPart *Prior) {
+  std::string args(strtrim(Arguments));
+  if (args != "%s") {
+    StdErr = "Only supported pattern is a plain input file";
+    RunResult = 1;
+    return;
+  }
+  if (!Prior) {
+    StdErr = "Prior command required to generate stdin";
+    RunResult = 1;
+    return;
+  }
+
+  CComPtr<IDxcLibrary> pLibrary;
+  CComPtr<IDxcBlobEncoding> pSource;
+  CComPtr<IDxcAssembler> pAssembler;
+  CComPtr<IDxcOperationResult> pResult;
+  CComPtr<ID3D12ShaderReflection> pShaderReflection;
+  CComPtr<ID3D12LibraryReflection> pLibraryReflection;
+  CComPtr<IDxcContainerReflection> containerReflection;
+  uint32_t partCount;
+  CComPtr<IDxcBlob> pContainerBlob;
+  HRESULT resultStatus;
+  bool blobFound = false;
+  std::ostringstream ss;
+  D3DReflectionDumper dumper(ss);
+
+  IFT(DllSupport.CreateInstance(CLSID_DxcLibrary, &pLibrary));
+  IFT(DllSupport.CreateInstance(CLSID_DxcAssembler, &pAssembler));
+
+  IFT(pLibrary->CreateBlobWithEncodingFromPinned(
+      (LPBYTE)Prior->StdOut.c_str(), Prior->StdOut.size(), CP_UTF8,
+      &pSource));
+
+  IFT(pAssembler->AssembleToContainer(pSource, &pResult));
+  IFT(pResult->GetStatus(&resultStatus));
+  if (FAILED(resultStatus)) {
+    CComPtr<IDxcBlobEncoding> pAssembleBlob;
+    IFT(pResult->GetErrorBuffer(&pAssembleBlob));
+    StdErr = BlobToUtf8(pAssembleBlob);
+    RunResult = resultStatus;
+    return;
+  }
+  IFT(pResult->GetResult(&pContainerBlob));
+
+  VERIFY_SUCCEEDED(DllSupport.CreateInstance(CLSID_DxcContainerReflection, &containerReflection));
+  VERIFY_SUCCEEDED(containerReflection->Load(pContainerBlob));
+  VERIFY_SUCCEEDED(containerReflection->GetPartCount(&partCount));
+
+  for (uint32_t i = 0; i < partCount; ++i) {
+    uint32_t kind;
+    VERIFY_SUCCEEDED(containerReflection->GetPartKind(i, &kind));
+    if (kind == (uint32_t)hlsl::DxilFourCC::DFCC_DXIL) {
+      blobFound = true;
+      CComPtr<IDxcBlob> pPart;
+      IFT(containerReflection->GetPartContent(i, &pPart));
+      const hlsl::DxilProgramHeader *pProgramHeader =
+        reinterpret_cast<const hlsl::DxilProgramHeader*>(pPart->GetBufferPointer());
+      VERIFY_IS_TRUE(IsValidDxilProgramHeader(pProgramHeader, (uint32_t)pPart->GetBufferSize()));
+      hlsl::DXIL::ShaderKind SK = hlsl::GetVersionShaderType(pProgramHeader->ProgramVersion);
+      if (SK == hlsl::DXIL::ShaderKind::Library)
+        VERIFY_SUCCEEDED(containerReflection->GetPartReflection(i, IID_PPV_ARGS(&pLibraryReflection)));
       else
-        t.InputForStdin = Prior->StdOut;
-      RunResult = t.Run();
-      StdOut = t.test_outs;
-      StdErr = t.test_errs;
-      // Capture the input as well.
-      if (RunResult != 0 && Prior != nullptr) {
-        StdErr += "\n<full input to FileCheck>\n";
-        StdErr += t.InputForStdin;
-      }
+        VERIFY_SUCCEEDED(containerReflection->GetPartReflection(i, IID_PPV_ARGS(&pShaderReflection)));
+      break;
     }
+  }
 
-    void FileRunCommandPart::RunStdErrChecker(const FileRunCommandPart *Prior) {
-      std::string args(strtrim(Arguments));
-      if (args != "%s") {
-        StdErr = "Only supported pattern is a plain input file";
-        RunResult = 1;
-        return;
-      }
-      if (!Prior) {
-        StdErr = "Prior command required to generate stdin";
-        RunResult = 1;
-        return;
-      }
+  if (!blobFound) {
+    StdErr = "Unable to find DXIL part";
+    RunResult = 1;
+    return;
+  } else if (pShaderReflection) {
+    dumper.Dump(pShaderReflection);
+  } else if (pLibraryReflection) {
+    dumper.Dump(pLibraryReflection);
+  }
 
-      CW2A fileName(CommandFileName, CP_UTF8);
-      FileCheckForTest t;
-      t.CheckFilename = fileName;
-      
-      t.InputForStdin = Prior->StdErr;
-      
-      RunResult = t.Run();
-      StdOut = t.test_outs;
-      StdErr = t.test_errs;
-      // Capture the input as well.
-      if (RunResult != 0 && Prior != nullptr) {
-        StdErr += "\n<full input to StdErrCheck>\n";
-        StdErr += t.InputForStdin;
-      }
-    }
+  ss.flush();
+  StdOut = ss.str();
+  RunResult = 0;
+}
 
-    void FileRunCommandPart::ReadOptsForDxc(hlsl::options::MainArgs &argStrings,
-                                            hlsl::options::DxcOpts &Opts) {
-      std::string args(strtrim(Arguments));
-      const char *inputPos = strstr(args.c_str(), "%s");
-      if (inputPos == nullptr) {
-        StdErr = "Only supported pattern includes input file as argument";
-        RunResult = 1;
-        return;
-      }
-      args.erase(inputPos - args.c_str(), strlen("%s"));
+void FileRunCommandPart::RunTee(const FileRunCommandPart *Prior) {
+  if (Prior == nullptr) {
+    StdErr = "tee requires a prior command";
+    RunResult = 1;
+    return;
+  }
 
-      llvm::StringRef argsRef = args;
-      llvm::SmallVector<llvm::StringRef, 8> splitArgs;
-      argsRef.split(splitArgs, " ");
-      argStrings = hlsl::options::MainArgs(splitArgs);
-      std::string errorString;
-      llvm::raw_string_ostream errorStream(errorString);
-      RunResult = ReadDxcOpts(hlsl::options::getHlslOptTable(), /*flagsToInclude*/ 0,
-                              argStrings, Opts, errorStream);
-      errorStream.flush();
-      if (RunResult) {
-        StdErr = errorString;
-      }
-    }
+  // Ignore commands for now - simply log out through test framework.
+  {
+    CA2W outWide(Prior->StdOut.c_str(), CP_UTF8);
+    WEX::Logging::Log::Comment(outWide.m_psz);
+  }
+  if (!Prior->StdErr.empty()) {
+    CA2W errWide(Prior->StdErr.c_str(), CP_UTF8);
+    WEX::Logging::Log::Comment(L"<stderr>");
+    WEX::Logging::Log::Comment(errWide.m_psz);
+  }
 
-    void FileRunCommandPart::RunDxc(const FileRunCommandPart *Prior) {
-      // Support piping stdin from prior if needed.
-      UNREFERENCED_PARAMETER(Prior);
-      hlsl::options::MainArgs args;
-      hlsl::options::DxcOpts opts;
-      ReadOptsForDxc(args, opts);
+  StdErr = Prior->StdErr;
+  StdOut = Prior->StdOut;
+  RunResult = Prior->RunResult;
+}
 
-      std::wstring entry =
-          Unicode::UTF8ToUTF16StringOrThrow(opts.EntryPoint.str().c_str());
-      std::wstring profile =
-          Unicode::UTF8ToUTF16StringOrThrow(opts.TargetProfile.str().c_str());
-      std::vector<LPCWSTR> flags;
-      if (opts.CodeGenHighLevel) {
-        flags.push_back(L"-fcgl");
-      }
+void FileRunCommandPart::RunXFail(const FileRunCommandPart *Prior) {
+  if (Prior == nullptr) {
+    StdErr = "XFail requires a prior command";
+    RunResult = 1;
+    return;
+  }
 
-      std::vector<std::wstring> argWStrings;
-      CopyArgsToWStrings(opts.Args, hlsl::options::CoreOption, argWStrings);
-      for (const std::wstring &a : argWStrings)
-        flags.push_back(a.data());
-
-      CComPtr<IDxcLibrary> pLibrary;
-      CComPtr<IDxcCompiler> pCompiler;
-      CComPtr<IDxcOperationResult> pResult;
-      CComPtr<IDxcBlobEncoding> pSource;
-      CComPtr<IDxcBlobEncoding> pDisassembly;
-      CComPtr<IDxcBlob> pCompiledBlob;
-      CComPtr<IDxcIncludeHandler> pIncludeHandler;
-
-      HRESULT resultStatus;
-
-      if (RunResult)  // opt parsing already failed
-        return;
-
-      IFT(DllSupport->CreateInstance(CLSID_DxcLibrary, &pLibrary));
-      IFT(pLibrary->CreateBlobFromFile(CommandFileName, nullptr, &pSource));
-      IFT(pLibrary->CreateIncludeHandler(&pIncludeHandler));
-      IFT(DllSupport->CreateInstance(CLSID_DxcCompiler, &pCompiler));
-      IFT(pCompiler->Compile(pSource, CommandFileName, entry.c_str(), profile.c_str(),
-                             flags.data(), flags.size(), nullptr, 0, pIncludeHandler, &pResult));
-      IFT(pResult->GetStatus(&resultStatus));
-      if (SUCCEEDED(resultStatus)) {
-        IFT(pResult->GetResult(&pCompiledBlob));
-        if (!opts.AstDump) {
-          IFT(pCompiler->Disassemble(pCompiledBlob, &pDisassembly));
-          StdOut = BlobToUtf8(pDisassembly);
-        } else {
-          StdOut = BlobToUtf8(pCompiledBlob);
-        }
-        CComPtr<IDxcBlobEncoding> pStdErr;
-        IFT(pResult->GetErrorBuffer(&pStdErr));
-        StdErr = BlobToUtf8(pStdErr);
-        RunResult = 0;
-      }
-      else {
-        IFT(pResult->GetErrorBuffer(&pDisassembly));
-        StdErr = BlobToUtf8(pDisassembly);
-        RunResult = resultStatus;
-      }
-
-      OpResult = pResult;
-    }
-
-    void FileRunCommandPart::RunDxv(const FileRunCommandPart *Prior) {
-      std::string args(strtrim(Arguments));
-      const char *inputPos = strstr(args.c_str(), "%s");
-      if (inputPos == nullptr) {
-        StdErr = "Only supported pattern includes input file as argument";
-        RunResult = 1;
-        return;
-      }
-      args.erase(inputPos - args.c_str(), strlen("%s"));
-
-      llvm::StringRef argsRef = args;
-      llvm::SmallVector<llvm::StringRef, 8> splitArgs;
-      argsRef.split(splitArgs, " ");
-      IFTMSG(splitArgs.size()==1, "wrong arg num for dxv");
-      
-      CComPtr<IDxcLibrary> pLibrary;
-      CComPtr<IDxcAssembler> pAssembler;
-      CComPtr<IDxcValidator> pValidator;
-      CComPtr<IDxcOperationResult> pResult;
-
-      CComPtr<IDxcBlobEncoding> pSource;
-
-      CComPtr<IDxcBlob> pContainerBlob;
-      HRESULT resultStatus;
-
-      IFT(DllSupport->CreateInstance(CLSID_DxcLibrary, &pLibrary));
-      IFT(pLibrary->CreateBlobFromFile(CommandFileName, nullptr, &pSource));
-      IFT(DllSupport->CreateInstance(CLSID_DxcAssembler, &pAssembler));
-      IFT(pAssembler->AssembleToContainer(pSource, &pResult));
-      IFT(pResult->GetStatus(&resultStatus));
-      if (FAILED(resultStatus)) {
-        CComPtr<IDxcBlobEncoding> pAssembleBlob;
-        IFT(pResult->GetErrorBuffer(&pAssembleBlob));
-        StdErr = BlobToUtf8(pAssembleBlob);
-        RunResult = resultStatus;
-        return;
-      }
-      IFT(pResult->GetResult(&pContainerBlob));
-
-      IFT(DllSupport->CreateInstance(CLSID_DxcValidator, &pValidator));
-      CComPtr<IDxcOperationResult> pValidationResult;
-      IFT(pValidator->Validate(pContainerBlob, DxcValidatorFlags_InPlaceEdit,
-                               &pValidationResult));
-      IFT(pValidationResult->GetStatus(&resultStatus));
-      if (resultStatus) {
-        CComPtr<IDxcBlobEncoding> pValidateBlob;
-        IFT(pValidationResult->GetErrorBuffer(&pValidateBlob));
-        StdOut = BlobToUtf8(pValidateBlob);
-      }
-      RunResult = 0;
-    }
-
-    void FileRunCommandPart::RunOpt(const FileRunCommandPart *Prior) {
-      std::string args(strtrim(Arguments));
-      const char *inputPos = strstr(args.c_str(), "%s");
-      if (inputPos == nullptr && Prior == nullptr) {
-        StdErr = "Only supported patterns are input file as argument or prior "
-                 "command with disassembly";
-        RunResult = 1;
-        return;
-      }
-
-      CComPtr<IDxcLibrary> pLibrary;
-      CComPtr<IDxcOptimizer> pOptimizer;
-      CComPtr<IDxcBlobEncoding> pSource;
-      CComPtr<IDxcBlobEncoding> pOutputText;
-      CComPtr<IDxcBlob> pOutputModule;
-
-      IFT(DllSupport->CreateInstance(CLSID_DxcLibrary, &pLibrary));
-      IFT(DllSupport->CreateInstance(CLSID_DxcOptimizer, &pOptimizer));
-
-      if (inputPos != nullptr) {
-        args.erase(inputPos - args.c_str(), strlen("%s"));
-        IFT(pLibrary->CreateBlobFromFile(CommandFileName, nullptr, &pSource));
-      }
-      else {
-        assert(Prior != nullptr && "else early check should have returned");
-        CComPtr<IDxcAssembler> pAssembler;
-        IFT(DllSupport->CreateInstance(CLSID_DxcAssembler, &pAssembler));
-        IFT(pLibrary->CreateBlobWithEncodingFromPinned(
-            Prior->StdOut.c_str(), Prior->StdOut.size(), CP_UTF8,
-            &pSource));
-      }
-
-      args = trim(args);
-      llvm::StringRef argsRef = args;
-      llvm::SmallVector<llvm::StringRef, 8> splitArgs;
-      argsRef.split(splitArgs, " ");
-      std::vector<LPCWSTR> options;
-      std::vector<std::wstring> optionStrings;
-      for (llvm::StringRef S : splitArgs) {
-        optionStrings.push_back(
-            Unicode::UTF8ToUTF16StringOrThrow(trim(S.str()).c_str()));
-      }
-
-      // Add the options outside the above loop in case the vector is resized.
-      for (const std::wstring& str : optionStrings)
-        options.push_back(str.c_str());
-
-      IFT(pOptimizer->RunOptimizer(pSource, options.data(), options.size(),
-                                   &pOutputModule, &pOutputText));
-      StdOut = BlobToUtf8(pOutputText);
-      RunResult = 0;
-    }
-
-    void FileRunCommandPart::RunD3DReflect(const FileRunCommandPart *Prior) {
-      std::string args(strtrim(Arguments));
-      if (args != "%s") {
-        StdErr = "Only supported pattern is a plain input file";
-        RunResult = 1;
-        return;
-      }
-      if (!Prior) {
-        StdErr = "Prior command required to generate stdin";
-        RunResult = 1;
-        return;
-      }
-
-      CComPtr<IDxcLibrary> pLibrary;
-      CComPtr<IDxcBlobEncoding> pSource;
-      CComPtr<IDxcAssembler> pAssembler;
-      CComPtr<IDxcOperationResult> pResult;
-      CComPtr<ID3D12ShaderReflection> pShaderReflection;
-      CComPtr<ID3D12LibraryReflection> pLibraryReflection;
-      CComPtr<IDxcContainerReflection> containerReflection;
-      uint32_t partCount;
-      CComPtr<IDxcBlob> pContainerBlob;
-      HRESULT resultStatus;
-      bool blobFound = false;
-      std::ostringstream ss;
-      D3DReflectionDumper dumper(ss);
-
-      IFT(DllSupport->CreateInstance(CLSID_DxcLibrary, &pLibrary));
-      IFT(DllSupport->CreateInstance(CLSID_DxcAssembler, &pAssembler));
-
-      IFT(pLibrary->CreateBlobWithEncodingFromPinned(
-          (LPBYTE)Prior->StdOut.c_str(), Prior->StdOut.size(), CP_UTF8,
-          &pSource));
-
-      IFT(pAssembler->AssembleToContainer(pSource, &pResult));
-      IFT(pResult->GetStatus(&resultStatus));
-      if (FAILED(resultStatus)) {
-        CComPtr<IDxcBlobEncoding> pAssembleBlob;
-        IFT(pResult->GetErrorBuffer(&pAssembleBlob));
-        StdErr = BlobToUtf8(pAssembleBlob);
-        RunResult = resultStatus;
-        return;
-      }
-      IFT(pResult->GetResult(&pContainerBlob));
-
-      VERIFY_SUCCEEDED(DllSupport->CreateInstance(CLSID_DxcContainerReflection, &containerReflection));
-      VERIFY_SUCCEEDED(containerReflection->Load(pContainerBlob));
-      VERIFY_SUCCEEDED(containerReflection->GetPartCount(&partCount));
-
-      for (uint32_t i = 0; i < partCount; ++i) {
-        uint32_t kind;
-        VERIFY_SUCCEEDED(containerReflection->GetPartKind(i, &kind));
-        if (kind == (uint32_t)hlsl::DxilFourCC::DFCC_DXIL) {
-          blobFound = true;
-          CComPtr<IDxcBlob> pPart;
-          IFT(containerReflection->GetPartContent(i, &pPart));
-          const hlsl::DxilProgramHeader *pProgramHeader =
-            reinterpret_cast<const hlsl::DxilProgramHeader*>(pPart->GetBufferPointer());
-          VERIFY_IS_TRUE(IsValidDxilProgramHeader(pProgramHeader, (uint32_t)pPart->GetBufferSize()));
-          hlsl::DXIL::ShaderKind SK = hlsl::GetVersionShaderType(pProgramHeader->ProgramVersion);
-          if (SK == hlsl::DXIL::ShaderKind::Library)
-            VERIFY_SUCCEEDED(containerReflection->GetPartReflection(i, IID_PPV_ARGS(&pLibraryReflection)));
-          else
-            VERIFY_SUCCEEDED(containerReflection->GetPartReflection(i, IID_PPV_ARGS(&pShaderReflection)));
-          break;
-        }
-      }
-
-      if (!blobFound) {
-        StdErr = "Unable to find DXIL part";
-        RunResult = 1;
-        return;
-      } else if (pShaderReflection) {
-        dumper.Dump(pShaderReflection);
-      } else if (pLibraryReflection) {
-        dumper.Dump(pLibraryReflection);
-      }
-
-      ss.flush();
-      StdOut = ss.str();
-      RunResult = 0;
-    }
-
-    void FileRunCommandPart::RunTee(const FileRunCommandPart *Prior) {
-      if (Prior == nullptr) {
-        StdErr = "tee requires a prior command";
-        RunResult = 1;
-        return;
-      }
-
-      // Ignore commands for now - simply log out through test framework.
-      {
-        CA2W outWide(Prior->StdOut.c_str(), CP_UTF8);
-        WEX::Logging::Log::Comment(outWide.m_psz);
-      }
-      if (!Prior->StdErr.empty()) {
-        CA2W errWide(Prior->StdErr.c_str(), CP_UTF8);
-        WEX::Logging::Log::Comment(L"<stderr>");
-        WEX::Logging::Log::Comment(errWide.m_psz);
-      }
-
-      StdErr = Prior->StdErr;
-      StdOut = Prior->StdOut;
-      RunResult = Prior->RunResult;
-    }
+  if (Prior->RunResult == 0) {
+    StdErr = "XFail expected a failure from previous command";
+    RunResult = 1;
+  } else {
+    RunResult = 0;
+  }
+}
 
 class FileRunTestResultImpl : public FileRunTestResult {
   dxc::DxcDllSupport &m_support;
@@ -486,8 +491,7 @@ class FileRunTestResultImpl : public FileRunTestResult {
     ParseCommandParts(commands, fileName, parts);
     FileRunCommandPart *prior = nullptr;
     for (FileRunCommandPart & part : parts) {
-      part.DllSupport = &m_support;
-      part.Run(prior);
+      part.Run(m_support, prior);
       prior = &part;
     }
     if (prior == nullptr) {
diff --git a/tools/clang/unittests/HLSL/RewriterTest.cpp b/tools/clang/unittests/HLSL/RewriterTest.cpp
index 93a081747..0087f5f85 100644
--- a/tools/clang/unittests/HLSL/RewriterTest.cpp
+++ b/tools/clang/unittests/HLSL/RewriterTest.cpp
@@ -60,6 +60,7 @@ public:
   TEST_METHOD(RunIndexingOperator);
   TEST_METHOD(RunIntrinsicExamples);
   TEST_METHOD(RunMatrixAssignments);
+  TEST_METHOD(RunMatrixPackOrientation);
   TEST_METHOD(RunMatrixSyntax);
   TEST_METHOD(RunPackReg);
   TEST_METHOD(RunScalarAssignments);
@@ -332,6 +333,10 @@ TEST_F(RewriterTest, RunMatrixAssignments) {
     CheckVerifiesHLSL(L"rewriter\\matrix-assignments_noerr.hlsl", L"rewriter\\correct_rewrites\\matrix-assignments_gold.hlsl");
 }
 
+TEST_F(RewriterTest, RunMatrixPackOrientation) {
+  CheckVerifiesHLSL(L"rewriter\\matrix-pack-orientation.hlsl", L"rewriter\\correct_rewrites\\matrix-pack-orientation_gold.hlsl");
+}
+
 TEST_F(RewriterTest, RunMatrixSyntax) {
     CheckVerifiesHLSL(L"rewriter\\matrix-syntax_noerr.hlsl", L"rewriter\\correct_rewrites\\matrix-syntax_gold.hlsl");
 }
diff --git a/tools/clang/unittests/HLSL/ShaderOpArithTable.xml b/tools/clang/unittests/HLSL/ShaderOpArithTable.xml
index 3c18024ea..7a645ca8d 100644
--- a/tools/clang/unittests/HLSL/ShaderOpArithTable.xml
+++ b/tools/clang/unittests/HLSL/ShaderOpArithTable.xml
@@ -4704,23 +4704,53 @@
             <Parameter Name="ShaderOp.Arguments">-enable-16bit-types</Parameter>
         </Row>
     </Table>
+    <!-- TODO: Split into 4 separate tables after 19H1 -->
     <Table Id="DotOpTable">
         <ParameterTypes>
-            <ParameterType Name="ShaderOp.Target">String</ParameterType>
-            <ParameterType Name="ShaderOp.Arguments">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.Type">String</ParameterType>
-            <ParameterType Name="Validation.Tolerance">double</ParameterType>
-            <ParameterType Array="true" Name="Validation.Input1">String</ParameterType>
-            <ParameterType Array="true" Name="Validation.Input2">String</ParameterType>
-            <ParameterType Array="true" Name="Validation.Expected1">String</ParameterType>
-            <ParameterType Array="true" Name="Validation.Expected2">String</ParameterType>
-            <ParameterType Array="true" Name="Validation.Expected3">String</ParameterType>
+            <!-- Dot Parameters -->
+            <ParameterType Name="Dot.ShaderOp.Target">String</ParameterType>
+            <ParameterType Name="Dot.ShaderOp.Arguments">String</ParameterType>
+            <ParameterType Name="Dot.ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Dot.Validation.Type">String</ParameterType>
+            <ParameterType Name="Dot.Validation.Tolerance">double</ParameterType>
+            <ParameterType Array="true" Name="Dot.Validation.Input1">String</ParameterType>
+            <ParameterType Array="true" Name="Dot.Validation.Input2">String</ParameterType>
+            <ParameterType Array="true" Name="Dot.Validation.Expected1">String</ParameterType>
+            <ParameterType Array="true" Name="Dot.Validation.Expected2">String</ParameterType>
+            <ParameterType Array="true" Name="Dot.Validation.Expected3">String</ParameterType>
+
+            <!-- Dot2Add Parameters -->
+            <ParameterType Name="Dot2Add.ShaderOp.Target">String</ParameterType>
+            <ParameterType Name="Dot2Add.ShaderOp.Arguments">String</ParameterType>
+            <ParameterType Name="Dot2Add.ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Dot2Add.Validation.Type">String</ParameterType>
+            <ParameterType Name="Dot2Add.Validation.Tolerance">double</ParameterType>
+            <ParameterType Array="true" Name="Dot2Add.Validation.Input1">String</ParameterType>
+            <ParameterType Array="true" Name="Dot2Add.Validation.Input2">String</ParameterType>
+            <ParameterType Array="true" Name="Dot2Add.Validation.Input3">String</ParameterType>
+            <ParameterType Array="true" Name="Dot2Add.Validation.Expected1">String</ParameterType>
+
+            <!-- Dot4AddI8Packed Parameters -->
+            <ParameterType Name="Dot4AddI8Packed.ShaderOp.Target">String</ParameterType>
+            <ParameterType Name="Dot4AddI8Packed.ShaderOp.Text">String</ParameterType>
+            <ParameterType Array="true" Name="Dot4AddI8Packed.Validation.Input1">String</ParameterType>
+            <ParameterType Array="true" Name="Dot4AddI8Packed.Validation.Input2">String</ParameterType>
+            <ParameterType Array="true" Name="Dot4AddI8Packed.Validation.Input3">String</ParameterType>
+            <ParameterType Array="true" Name="Dot4AddI8Packed.Validation.Expected1">String</ParameterType>
+
+            <!-- Dot4AddU8Packed Parameters -->
+            <ParameterType Name="Dot4AddU8Packed.ShaderOp.Target">String</ParameterType>
+            <ParameterType Name="Dot4AddU8Packed.ShaderOp.Text">String</ParameterType>
+            <ParameterType Array="true" Name="Dot4AddU8Packed.Validation.Input1">String</ParameterType>
+            <ParameterType Array="true" Name="Dot4AddU8Packed.Validation.Input2">String</ParameterType>
+            <ParameterType Array="true" Name="Dot4AddU8Packed.Validation.Input3">String</ParameterType>
+            <ParameterType Array="true" Name="Dot4AddU8Packed.Validation.Expected1">String</ParameterType>
         </ParameterTypes>
         <Row Name="Dot">
-            <Parameter Name="Validation.Type">epsilon</Parameter>
-            <Parameter Name="Validation.Tolerance">0.008</Parameter>
-            <Parameter Name="ShaderOp.Text"> struct SDotOp {
+            <!-- Dot Parameters -->
+            <Parameter Name="Dot.Validation.Type">epsilon</Parameter>
+            <Parameter Name="Dot.Validation.Tolerance">0.008</Parameter>
+            <Parameter Name="Dot.ShaderOp.Text"> struct SDotOp {
                    float4 input1;
                    float4 input2;
                    float o_dot2;
@@ -4736,8 +4766,8 @@
                     l.o_dot4 = dot(l.input1.xyzw, l.input2.xyzw);
                     g_buf[GI] = l;
                 };</Parameter>
-            <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-            <Parameter Name="Validation.Input1">
+            <Parameter Name="Dot.ShaderOp.Target">cs_6_0</Parameter>
+            <Parameter Name="Dot.Validation.Input1">
                 <Value>NaN,NaN,NaN,NaN</Value>
                 <Value>-Inf,-Inf,-Inf,-Inf</Value>
                 <Value>-denorm,-denorm,-denorm,-denorm</Value>
@@ -4749,7 +4779,7 @@
                 <Value>-10,0,0,10</Value>
                 <Value>Inf,Inf,Inf,-Inf</Value>
             </Parameter>
-            <Parameter Name="Validation.Input2">
+            <Parameter Name="Dot.Validation.Input2">
                 <Value>NaN,NaN,NaN,NaN</Value>
                 <Value>-Inf,-Inf,-Inf,-Inf</Value>
                 <Value>-denorm,-denorm,-denorm,-denorm</Value>
@@ -4761,7 +4791,7 @@
                 <Value>10,0,0,10</Value>
                 <Value>Inf,Inf,Inf,Inf</Value>
             </Parameter>
-            <Parameter Name="Validation.Expected1">
+            <Parameter Name="Dot.Validation.Expected1">
                 <Value>nan</Value>
                 <Value>inf</Value>
                 <Value>0</Value>
@@ -4773,7 +4803,7 @@
                 <Value>-100</Value>
                 <Value>inf</Value>
             </Parameter>
-            <Parameter Name="Validation.Expected2">
+            <Parameter Name="Dot.Validation.Expected2">
                 <Value>nan</Value>
                 <Value>inf</Value>
                 <Value>0</Value>
@@ -4785,7 +4815,7 @@
                 <Value>-100</Value>
                 <Value>inf</Value>
             </Parameter>
-            <Parameter Name="Validation.Expected3">
+            <Parameter Name="Dot.Validation.Expected3">
                 <Value>nan</Value>
                 <Value>inf</Value>
                 <Value>0</Value>
@@ -4797,6 +4827,209 @@
                 <Value>0</Value>
                 <Value>nan</Value>
             </Parameter>
+
+            <!-- Dot2Add Parameters -->
+            <Parameter Name="Dot2Add.Validation.Type">epsilon</Parameter>
+            <Parameter Name="Dot2Add.Validation.Tolerance">0.008</Parameter>
+            <Parameter Name="Dot2Add.ShaderOp.Text"> struct SDot2AddOp {
+                   half2 input1;
+                   half2 input2;
+                   float acc;
+                   float result;
+                };
+                RWStructuredBuffer&lt;SDot2AddOp&gt; g_buf : register(u0);
+                [numthreads(8,8,1)]
+                void main(uint GI : SV_GroupIndex) {
+                    SDot2AddOp l = g_buf[GI];
+                    l.result = dot2add(l.input1, l.input2, l.acc);
+                    g_buf[GI] = l;
+                };</Parameter>
+            <Parameter Name="Dot2Add.ShaderOp.Target">cs_6_4</Parameter>
+            <Parameter Name="Dot2Add.Validation.Input1">
+                <Value>1,2</Value>
+                <Value>1,-2</Value>
+                <Value>1,2</Value>
+                <Value>-1,2</Value>
+                <Value>1,2</Value>
+                <Value>-1,2</Value>
+                <Value>1,2</Value>
+                <Value>-1,-2</Value>
+                <Value>65504,1</Value>
+                <Value>-65504,1</Value>
+                <Value>1,65504</Value>
+                <Value>1,-65504</Value>
+                <Value>65504,65504</Value>
+                <Value>inf,inf</Value>
+                <Value>denorm,denorm</Value>
+                <Value>-denorm,-denorm</Value>
+                <Value>nan,nan</Value>
+            </Parameter>
+            <Parameter Name="Dot2Add.Validation.Input2">
+                <Value>3,4</Value>
+                <Value>-3,4</Value>
+                <Value>3,4</Value>
+                <Value>3,-4</Value>
+                <Value>3,4</Value>
+                <Value>-3,4</Value>
+                <Value>3,4</Value>
+                <Value>-3,-4</Value>
+                <Value>1,65504</Value>
+                <Value>1,-65504</Value>
+                <Value>65504,1</Value>
+                <Value>-65504,1</Value>
+                <Value>65504,65504</Value>
+                <Value>inf,inf</Value>
+                <Value>denorm,denorm</Value>
+                <Value>-denorm,-denorm</Value>
+                <Value>nan,nan</Value>
+            </Parameter>
+            <Parameter Name="Dot2Add.Validation.Input3">
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>10</Value>
+                <Value>10</Value>
+                <Value>-5</Value>
+                <Value>-5</Value>
+                <Value>-30</Value>
+                <Value>-30</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>10000000</Value>
+                <Value>-10000000</Value>
+                <Value>0</Value>
+                <Value>inf</Value>
+                <Value>denorm</Value>
+                <Value>-denorm</Value>
+                <Value>nan</Value>
+            </Parameter>
+            <Parameter Name="Dot2Add.Validation.Expected1">
+                <Value>11</Value>
+                <Value>-11</Value>
+                <Value>21</Value>
+                <Value>-1</Value>
+                <Value>6</Value>
+                <Value>6</Value>
+                <Value>-19</Value>
+                <Value>-19</Value>
+                <Value>131008</Value>
+                <Value>-131008</Value>
+                <Value>10131008</Value>
+                <Value>-10131008</Value>
+                <Value>inf</Value>
+                <Value>inf</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>nan</Value>
+            </Parameter>
+            <Parameter Name="Dot2Add.ShaderOp.Arguments">-enable-16bit-types</Parameter>
+
+            <!-- Dot4AddI8Packed Parameters -->
+            <Parameter Name="Dot4AddI8Packed.ShaderOp.Text"> struct SDot4AddI8PackedOp {
+                   dword input1;
+                   dword input2;
+                   int acc;
+                   int result;
+                };
+                RWStructuredBuffer&lt;SDot4AddI8PackedOp&gt; g_buf : register(u0);
+                [numthreads(8,8,1)]
+                void main(uint GI : SV_GroupIndex) {
+                    SDot4AddI8PackedOp l = g_buf[GI];
+                    l.result = dot4add_i8packed(l.input1, l.input2, l.acc);
+                    g_buf[GI] = l;
+                };</Parameter>
+            <Parameter Name="Dot4AddI8Packed.ShaderOp.Target">cs_6_4</Parameter>
+            <Parameter Name="Dot4AddI8Packed.Validation.Input1">
+                <Value>0x00000102</Value>
+                <Value>0x00000102</Value>
+                <Value>0x00000102</Value>
+                <Value>0x00000102</Value>
+                <Value>0XFFFFFFFF</Value>
+                <Value>0x80808080</Value>
+                <Value>0x80808080</Value>
+                <Value>0x807F807F</Value>
+                <Value>0x7F7F7F7F</Value>
+                <Value>0x80808080</Value>
+            </Parameter>
+            <Parameter Name="Dot4AddI8Packed.Validation.Input2">
+                <Value>0x00000304</Value>
+                <Value>0x00000304</Value>
+                <Value>0x00000304</Value>
+                <Value>0x00000304</Value>
+                <Value>0xFFFFFFFF</Value>
+                <Value>0x01010101</Value>
+                <Value>0x7F7F7F7F</Value>
+                <Value>0x807F807F</Value>
+                <Value>0x7F7F7F7F</Value>
+                <Value>0x80808080</Value>
+            </Parameter>
+            <Parameter Name="Dot4AddI8Packed.Validation.Input3">
+                <Value>0</Value>
+                <Value>10</Value>
+                <Value>-5</Value>
+                <Value>-30</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Dot4AddI8Packed.Validation.Expected1">
+                <Value>11</Value>
+                <Value>21</Value>
+                <Value>6</Value>
+                <Value>-19</Value>
+                <Value>4</Value>
+                <Value>-512</Value>
+                <Value>-65024</Value>
+                <Value>65026</Value>
+                <Value>64516</Value>
+                <Value>65536</Value>
+            </Parameter>
+
+            <!-- Dot4AddU8Packed Parameters -->
+            <Parameter Name="Dot4AddU8Packed.ShaderOp.Text"> struct SDot4AddU8PackedOp {
+                   dword input1;
+                   dword input2;
+                   dword acc;
+                   dword result;
+                };
+                RWStructuredBuffer&lt;SDot4AddU8PackedOp&gt; g_buf : register(u0);
+                [numthreads(8,8,1)]
+                void main(uint GI : SV_GroupIndex) {
+                    SDot4AddU8PackedOp l = g_buf[GI];
+                    l.result = dot4add_u8packed(l.input1, l.input2, l.acc);
+                    g_buf[GI] = l;
+                };</Parameter>
+            <Parameter Name="Dot4AddU8Packed.ShaderOp.Target">cs_6_4</Parameter>
+            <Parameter Name="Dot4AddU8Packed.Validation.Input1">
+                <Value>0x00000102</Value>
+                <Value>0x00000102</Value>
+                <Value>0x01234567</Value>
+                <Value>0xFFFFFFFF</Value>
+                <Value>0xFFFFFFFF</Value>
+            </Parameter>
+            <Parameter Name="Dot4AddU8Packed.Validation.Input2">
+                <Value>0x00000304</Value>
+                <Value>0x00000304</Value>
+                <Value>0x23456789</Value>
+                <Value>0xFFFFFFFF</Value>
+                <Value>0xFFFFFFFF</Value>
+            </Parameter>
+            <Parameter Name="Dot4AddU8Packed.Validation.Input3">
+                <Value>0</Value>
+                <Value>10</Value>
+                <Value>10000</Value>
+                <Value>0</Value>
+                <Value>3000000000</Value>
+            </Parameter>
+            <Parameter Name="Dot4AddU8Packed.Validation.Expected1">
+                <Value>11</Value>
+                <Value>21</Value>
+                <Value>33668</Value>
+                <Value>260100</Value>
+                <Value>3000260100</Value>
+            </Parameter>
         </Row>
     </Table>
     <Table Id="Msad4Table">
diff --git a/tools/clang/unittests/HLSL/ValidationTest.cpp b/tools/clang/unittests/HLSL/ValidationTest.cpp
index 4745bf382..e5ae07c1f 100644
--- a/tools/clang/unittests/HLSL/ValidationTest.cpp
+++ b/tools/clang/unittests/HLSL/ValidationTest.cpp
@@ -1016,11 +1016,11 @@ TEST_F(ValidationTest, LocalResCopy) {
 }
 
 TEST_F(ValidationTest, WhenIncorrectModelThenFail) {
-  TestCheck(L"val-failures.hlsl");
+  TestCheck(L"..\\CodeGenHLSL\\val-failures.hlsl");
 }
 
 TEST_F(ValidationTest, WhenIncorrectPSThenFail) {
-  TestCheck(L"val-failures-ps.hlsl");
+  TestCheck(L"..\\CodeGenHLSL\\val-failures-ps.hlsl");
 }
 
 TEST_F(ValidationTest, WhenSmUnknownThenFail) {
@@ -1569,7 +1569,7 @@ TEST_F(ValidationTest, AddUint64Odd) {
 }
 
 TEST_F(ValidationTest, WhenWaveAffectsGradientThenFail) {
-  TestCheck(L"val-wave-failures-ps.hlsl");
+  TestCheck(L"..\\CodeGenHLSL\\val-wave-failures-ps.hlsl");
 }
 
 TEST_F(ValidationTest, WhenMetaFlagsUsageThenFail) {
diff --git a/tools/clang/unittests/HLSL/VerifierTest.cpp b/tools/clang/unittests/HLSL/VerifierTest.cpp
index 2cc1ea423..b16c77c59 100644
--- a/tools/clang/unittests/HLSL/VerifierTest.cpp
+++ b/tools/clang/unittests/HLSL/VerifierTest.cpp
@@ -39,6 +39,7 @@ public:
   TEST_METHOD(RunConstExpr)
   TEST_METHOD(RunConstAssign)
   TEST_METHOD(RunConstDefault)
+  TEST_METHOD(RunConversionsBetweenTypeShapes)
   TEST_METHOD(RunCppErrors)
   TEST_METHOD(RunCppErrorsHV2015)
   TEST_METHOD(RunCXX11Attributes)
@@ -154,6 +155,10 @@ TEST_F(VerifierTest, RunConstDefault) {
   CheckVerifiesHLSL(L"const-default.hlsl");
 }
 
+TEST_F(VerifierTest, RunConversionsBetweenTypeShapes) {
+  CheckVerifiesHLSL(L"conversions-between-type-shapes.hlsl");
+}
+
 TEST_F(VerifierTest, RunCppErrors) {
   CheckVerifiesHLSL(L"cpp-errors.hlsl");
 }
diff --git a/tools/clang/unittests/SPIRV/CodeGenSPIRVTest.cpp b/tools/clang/unittests/SPIRV/CodeGenSPIRVTest.cpp
index 8ae2578fe..ec8ec4ecc 100644
--- a/tools/clang/unittests/SPIRV/CodeGenSPIRVTest.cpp
+++ b/tools/clang/unittests/SPIRV/CodeGenSPIRVTest.cpp
@@ -88,6 +88,7 @@ TEST_F(FileTest, StructuredBufferType) {
   runFileTest("type.structured-buffer.hlsl");
 }
 TEST_F(FileTest, StructuredByteBufferArray) {
+  setRelaxLogicalPointer();
   runFileTest("type.structured-buffer.array.hlsl");
 }
 TEST_F(FileTest, StructuredByteBufferArrayError) {
@@ -1335,6 +1336,11 @@ TEST_F(FileTest, SpirvLegalizationStructuredBufferCounterInMethod) {
   setRelaxLogicalPointer();
   runFileTest("spirv.legal.sbuffer.counter.method.hlsl");
 }
+TEST_F(FileTest,
+       SpirvLegalizationCounterVarAssignAcrossDifferentNestedStructLevel) {
+  setRelaxLogicalPointer();
+  runFileTest("spirv.legal.counter.nested-struct.hlsl");
+}
 TEST_F(FileTest, SpirvLegalizationStructuredBufferInStruct) {
   setRelaxLogicalPointer();
   runFileTest("spirv.legal.sbuffer.struct.hlsl");
@@ -1750,4 +1756,9 @@ TEST_F(FileTest, LegalizationExample21) {
   runFileTest("legal-examples/21-combined-ok.hlsl");
 }
 
+TEST_F(FileTest, PreprocessorError) {
+  // Tests that preprocessor error is surfaced
+  runFileTest("preprocess.error.hlsl", Expect::Failure);
+}
+
 } // namespace
diff --git a/tools/dxexp/dxexp.cpp b/tools/dxexp/dxexp.cpp
index 73d18a6bf..4ea4cf4a6 100644
--- a/tools/dxexp/dxexp.cpp
+++ b/tools/dxexp/dxexp.cpp
@@ -97,7 +97,47 @@ typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS4
     _Out_ D3D12_SHARED_RESOURCE_COMPATIBILITY_TIER SharedResourceCompatibilityTier;
     _Out_ BOOL Native16BitShaderOpsSupported;
 } D3D12_FEATURE_DATA_D3D12_OPTIONS4;
+#endif
 
+#ifndef NTDDI_WIN10_RS4
+#define NTDDI_WIN10_RS4 0x0A000005
+#endif
+
+#if WDK_NTDDI_VERSION <= NTDDI_WIN10_RS4
+#define D3D_SHADER_MODEL_6_3 ((D3D_SHADER_MODEL)0x63)
+#define D3D12_FEATURE_D3D12_OPTIONS5 ((D3D12_FEATURE)27)
+typedef enum D3D12_RENDER_PASS_TIER
+{
+    D3D12_RENDER_PASS_TIER_0  = 0,
+    D3D12_RENDER_PASS_TIER_1  = 1,
+    D3D12_RENDER_PASS_TIER_2  = 2
+}   D3D12_RENDER_PASS_TIER;
+
+typedef enum D3D12_RAYTRACING_TIER
+{
+    D3D12_RAYTRACING_TIER_NOT_SUPPORTED = 0,
+    D3D12_RAYTRACING_TIER_1_0 = 10
+}   D3D12_RAYTRACING_TIER;
+
+typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS5
+{
+    _Out_  BOOL SRVOnlyTiledResourceTier3;
+    _Out_  D3D12_RENDER_PASS_TIER RenderPassesTier;
+    _Out_  D3D12_RAYTRACING_TIER RaytracingTier;
+}   D3D12_FEATURE_DATA_D3D12_OPTIONS5;
+#endif
+
+#ifndef NTDDI_WIN10_RS5
+#define NTDDI_WIN10_RS5 0x0A000006
+#endif
+
+#if WDK_NTDDI_VERSION <= NTDDI_WIN10_RS5
+#define D3D_SHADER_MODEL_6_4 ((D3D_SHADER_MODEL)0x64)
+#endif
+
+// TODO: Place under new version once available
+#if WDK_NTDDI_VERSION <= NTDDI_WIN10_RS5
+#define D3D_SHADER_MODEL_6_5 ((D3D_SHADER_MODEL)0x65)
 #endif
 
 static char *BoolToStrJson(bool value) {
@@ -120,6 +160,9 @@ static char *ShaderModelToStr(D3D_SHADER_MODEL SM) {
   case D3D_SHADER_MODEL_6_0: return "6.0";
   case D3D_SHADER_MODEL_6_1: return "6.1";
   case D3D_SHADER_MODEL_6_2: return "6.2";
+  case D3D_SHADER_MODEL_6_3: return "6.3";
+  case D3D_SHADER_MODEL_6_4: return "6.4";
+  case D3D_SHADER_MODEL_6_5: return "6.5";
   default: return "ERROR";
   }
 }
@@ -134,6 +177,25 @@ static char *ViewInstancingTierToStr(D3D12_VIEW_INSTANCING_TIER Tier) {
   }
 }
 
+static char *RaytracingTierToStr(D3D12_RAYTRACING_TIER Tier) {
+  switch (Tier) {
+  case D3D12_RAYTRACING_TIER_NOT_SUPPORTED: return "NO";
+  case D3D12_RAYTRACING_TIER_1_0: return "1.0";
+  default: return "ERROR";
+  }
+}
+
+static HRESULT GetHighestShaderModel(ID3D12Device *pDevice, D3D12_FEATURE_DATA_SHADER_MODEL &DeviceSM) {
+  HRESULT hr = E_INVALIDARG;
+  D3D_SHADER_MODEL SM = D3D_SHADER_MODEL_6_5;
+  while (hr == E_INVALIDARG && SM >= D3D_SHADER_MODEL_6_0) {
+    DeviceSM.HighestShaderModel = SM;
+    hr = pDevice->CheckFeatureSupport(D3D12_FEATURE_SHADER_MODEL, &DeviceSM, sizeof(DeviceSM));
+    SM = (D3D_SHADER_MODEL)((UINT32)SM - 1);
+  }
+  return hr;
+}
+
 static HRESULT PrintAdapters() {
   HRESULT hr = S_OK;
   char comma = ' ';
@@ -153,28 +215,22 @@ static HRESULT PrintAdapters() {
       D3D12_FEATURE_DATA_D3D12_OPTIONS1 DeviceOptions;
       D3D12_FEATURE_DATA_D3D12_OPTIONS3 DeviceOptions3;
       D3D12_FEATURE_DATA_D3D12_OPTIONS4 DeviceOptions4;
+      D3D12_FEATURE_DATA_D3D12_OPTIONS5 DeviceOptions5;
       memset(&DeviceOptions, 0, sizeof(DeviceOptions));
       memset(&DeviceOptions3, 0, sizeof(DeviceOptions3));
       memset(&DeviceOptions4, 0, sizeof(DeviceOptions4));
+      memset(&DeviceOptions5, 0, sizeof(DeviceOptions5));
       D3D12_FEATURE_DATA_SHADER_MODEL DeviceSM;
       AtlCheck(pAdapter->GetDesc1(&AdapterDesc));
       AtlCheck(D3D12CreateDevice(pAdapter, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&pDevice)));
       AtlCheck(pDevice->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS1, &DeviceOptions, sizeof(DeviceOptions)));
-      DeviceSM.HighestShaderModel = D3D_SHADER_MODEL_6_0;
-      // CheckFeatureSupport with D3D12_FEATURE_D3D12_OPTIONS3 will fail on Creators Update,
-      // but succeed on newer versions of Windows.  Use this to control the initial value
-      // for highest shader model.
-      if (SUCCEEDED(pDevice->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &DeviceOptions3, sizeof(DeviceOptions3))))
-        DeviceSM.HighestShaderModel = D3D_SHADER_MODEL_6_1;
-      // CheckFeatureSupport with D3D12_FEATURE_D3D12_OPTIONS3 will fail on Fall Creators Update,
-      // but succeed on newer versions of Windows.  Use this to control the initial value
-      // for highest shader model.
-      if (SUCCEEDED(pDevice->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS4, &DeviceOptions4, sizeof(DeviceOptions4))))
-        DeviceSM.HighestShaderModel = D3D_SHADER_MODEL_6_2;
-      AtlCheck(pDevice->CheckFeatureSupport(D3D12_FEATURE_SHADER_MODEL, &DeviceSM, sizeof(DeviceSM)));
+      pDevice->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &DeviceOptions3, sizeof(DeviceOptions3));
+      pDevice->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS4, &DeviceOptions4, sizeof(DeviceOptions4));
+      pDevice->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS5, &DeviceOptions5, sizeof(DeviceOptions5));
+      AtlCheck(GetHighestShaderModel(pDevice, DeviceSM));
       const char *Format = IsOutputJson ?
-        "%c { \"name\": \"%S\", \"sm\": \"%s\", \"wave\": %s, \"i64\": %s, \"bary\": %s, \"view-inst\": \"%s\" }\n" :
-        "%c %S - Highest SM [%s] Wave [%s] I64 [%s] Barycentrics [%s] View Instancing [%s] 16bit Support [%s]\n";
+        "%c { \"name\": \"%S\", \"sm\": \"%s\", \"wave\": %s, \"i64\": %s, \"bary\": %s, \"view-inst\": \"%s\", \"16bit\": %s, \"raytracing\": \"%s\" }\n" :
+        "%c %S - Highest SM [%s] Wave [%s] I64 [%s] Barycentrics [%s] View Instancing [%s] 16bit Support [%s] Raytracing [%s]\n";
       printf(Format,
              comma,
              AdapterDesc.Description,
@@ -183,7 +239,8 @@ static HRESULT PrintAdapters() {
              BoolToStr(DeviceOptions.Int64ShaderOps),
              BoolToStr(DeviceOptions3.BarycentricsSupported),
              ViewInstancingTierToStr(DeviceOptions3.ViewInstancingTier),
-             BoolToStr(DeviceOptions4.Native16BitShaderOpsSupported)
+             BoolToStr(DeviceOptions4.Native16BitShaderOpsSupported),
+             RaytracingTierToStr(DeviceOptions5.RaytracingTier)
             );
       AdapterIndex++;
       comma = IsOutputJson ? ',' : ' ';
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index 380ff927f..f3299f1ee 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -83,7 +83,7 @@ int<4> [[rn]] D3DCOLORtoUBYTE4(in $match<0, 1> float<4> x) : d3dcolortoubyte4;
 uint [[rn]]  GetRenderTargetSampleCount() : rtsampleinfo;
 float<2> [[rn]] GetRenderTargetSamplePosition(in int s) : rtsamplepos;
 void [[]]   abort();
-$type1 [[rn]] abs(in numeric<> x);
+$type1 [[rn,unsigned_op=uabs]] abs(in numeric<> x);
 $type1 [[rn]] acos(in float_like<> x);
 bool [[rn]] all(in any<> x);
 void [[]] AllMemoryBarrier() : syncallmemory_ug;
@@ -201,7 +201,7 @@ $type1 [[rn]] reversebits(in any_int<> x);
 $type1 [[rn]] round(in float_like<> x);
 $type1 [[rn]] rsqrt(in float_like<> x);
 $type1 [[rn]] saturate(in any_float<> x);
-$match<1, 0> int<> [[rn]] sign(in numeric<> x);
+$match<1, 0> int<> [[rn,unsigned_op=usign,overload=0]] sign(in numeric<> x);
 $type1 [[rn]] sin(in float_like<> x);
 void [[]] sincos(in float_like<> x, out $type1 s, out $type1 c);
 $type1 [[rn]] sinh(in float_like<> x);
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index fbfcf51aa..f371a7473 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -1483,6 +1483,7 @@ class db_dxil(object):
         # UseNewSROA is used by PassManagerBuilder::populateFunctionPassManager, not a pass per se.
         add_pass("sroa", "SROA", "Scalar Replacement Of Aggregates", [
             {'n':'RequiresDomTree', 't':'bool', 'c':1},
+            {'n':'SkipHLSLMat', 't':'bool', 'c':1},
             {'n':'force-ssa-updater', 'i':'ForceSSAUpdater', 't':'bool', 'd':'Force the pass to not use DomTree and mem2reg, insteadforming SSA values through the SSAUpdater infrastructure.'},
             {'n':'sroa-random-shuffle-slices', 'i':'SROARandomShuffleSlices', 't':'bool', 'd':'Enable randomly shuffling the slices to help uncover instability in their order.'},
             {'n':'sroa-strict-inbounds', 'i':'SROAStrictInbounds', 't':'bool', 'd':'Experiment with completely strict handling of inbounds GEPs.'}])
@@ -1538,6 +1539,7 @@ class db_dxil(object):
             {'n':'parameter0','t':'int','c':1},
             {'n':'parameter1','t':'int','c':1},
             {'n':'parameter2','t':'int','c':1}])
+        add_pass('dxil-annotate-with-virtual-regs', 'DxilAnnotateWithVirtualRegister', 'Annotates each instruction in the DXIL module with a virtual register number', [])
         add_pass('hlsl-dxil-reduce-msaa-to-single', 'DxilReduceMSAAToSingleSample', 'HLSL DXIL Reduce all MSAA reads to single-sample reads', [])
 
         category_lib="dxil_gen"
@@ -1562,6 +1564,7 @@ class db_dxil(object):
         add_pass('simplify-inst', 'SimplifyInst', 'Simplify Instructions', [])
         add_pass('hlsl-dxil-precise', 'DxilPrecisePropagatePass', 'DXIL precise attribute propagate', [])
         add_pass('dxil-legalize-sample-offset', 'DxilLegalizeSampleOffsetPass', 'DXIL legalize sample offset', [])
+        add_pass('dxil-gvn-hoist', 'DxilSimpleGVNHoist', 'DXIL simple gvn hoist', [])
         add_pass('hlsl-hlensure', 'HLEnsureMetadata', 'HLSL High-Level Metadata Ensure', [])
         add_pass('multi-dim-one-dim', 'MultiDimArrayToOneDimArray', 'Flatten multi-dim array into one-dim array', [])
         add_pass('resource-handle', 'ResourceToHandle', 'Lower resource into handle', [])
@@ -1615,6 +1618,7 @@ class db_dxil(object):
         # C:\nobackup\work\HLSLonLLVM\lib\Transforms\IPO\PassManagerBuilder.cpp:353
         add_pass('indvars', 'IndVarSimplify', "Induction Variable Simplification", [])
         add_pass('loop-idiom', 'LoopIdiomRecognize', "Recognize loop idioms", [])
+        add_pass('dxil-loop-unroll', 'DxilLoopUnroll', 'DxilLoopUnroll', [])
         add_pass('loop-deletion', 'LoopDeletion', "Delete dead loops", [])
         add_pass('loop-interchange', 'LoopInterchange', 'Interchanges loops for cache reuse', [])
         add_pass('loop-unroll', 'LoopUnroll', 'Unroll loops', [
diff --git a/utils/hct/hcttest.cmd b/utils/hct/hcttest.cmd
index 96e45c9dd..8501019c7 100644
--- a/utils/hct/hcttest.cmd
+++ b/utils/hct/hcttest.cmd
@@ -19,8 +19,8 @@ set TEST_CLANG_FILTER= /select: "@Priority<1"
 set TEST_EXEC_FILTER=ExecutionTest::*
 set LOG_FILTER=/logOutput:LowWithConsoleBuffering
 set TEST_COMPAT_SUITE=0
-set COMPAT_SUIT_PATH=
-set TEST_SINGLE_FILE_CHECK=0
+set MANUAL_FILE_CHECK_PATH=
+set TEST_MANUAL_FILE_CHECK=0
 set SINGLE_FILE_CHECK_NAME=0
 
 rem Begin SPIRV change
@@ -78,15 +78,10 @@ if "%1"=="-clean" (
   set TEST_CLANG=1
   set TEST_CLANG_FILTER= /name:%2
   shift /1
-) else if "%1"=="compat-suite" (
-  set TEST_ALL=0
-  set TEST_COMPAT_SUITE=1
-  set COMPAT_SUIT_PATH= /p:"SuitePath=%~2"
-  shift /1
 ) else if "%1"=="file-check" (
   set TEST_ALL=0
-  set TEST_SINGLE_FILE_CHECK=1
-  set COMPAT_SUIT_PATH= /p:"InputFile=%~2"
+  set TEST_MANUAL_FILE_CHECK=1
+  set MANUAL_FILE_CHECK_PATH=%~2
   shift /1
 ) else if "%1"=="v" (
   set TEST_ALL=0
@@ -281,13 +276,8 @@ if exist "%HCT_EXTRAS%\hcttest-after.cmd" (
   set RES_HCTTEST_AFTER=!ERRORLEVEL!
 )
 
-if "%TEST_SINGLE_FILE_CHECK%"=="1" (
-  call :runte clang-hlsl-tests.dll /p:"HlslDataDir=%HLSL_SRC_DIR%\tools\clang\test\HLSL" /name:CompilerTest::SingleFileCheckTest /runIgnoredTests %COMPAT_SUIT_PATH%
-  set RES_EXEC=!ERRORLEVEL!
-)
-
-if "%TEST_COMPAT_SUITE%"=="1" (
-  call :runte clang-hlsl-tests.dll /p:"HlslDataDir=%HLSL_SRC_DIR%\tools\clang\test\HLSL" /name:CompilerTest::ShaderCompatSuite %COMPAT_SUIT_PATH%
+if "%TEST_MANUAL_FILE_CHECK%"=="1" (
+  call :runte clang-hlsl-tests.dll /p:"HlslDataDir=%HLSL_SRC_DIR%\tools\clang\test\HLSL" /name:CompilerTest::ManualFileCheckTest /runIgnoredTests /p:"InputPath=%MANUAL_FILE_CHECK_PATH%"
   set RES_EXEC=!ERRORLEVEL!
 )
 
diff --git a/utils/hct/pkgesbuild/SignConfig.xml b/utils/hct/pkgesbuild/SignConfig.xml
new file mode 100644
index 000000000..17fa225c4
--- /dev/null
+++ b/utils/hct/pkgesbuild/SignConfig.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<SignConfigXML>
+ <job platform="x86" certSubject="CN=Microsoft Corporation, O=Microsoft Corporation, L=Redmond, S=Washington, C=US" configuration="Release" dest="__OUTPATHROOT__" jobname="DirectX Shader Compiler" approvers="">
+    <file signType="Authenticode" src="__INPATHROOT__\dxcompiler.dll" />
+    <file signType="Authenticode" src="__INPATHROOT__\dxc.exe" />
+    <file signType="Authenticode" src="__INPATHROOT__\dxrfallbackcompiler.dll"/>
+  </job>
+ <job platform="x64" certSubject="CN=Microsoft Corporation, O=Microsoft Corporation, L=Redmond, S=Washington, C=US" configuration="Release" dest="__OUTPATHROOT__" jobname="DirectX Shader Compiler" approvers="">
+    <file signType="Authenticode" src="__INPATHROOT__\dxcompiler.dll" />
+    <file signType="Authenticode" src="__INPATHROOT__\dxc.exe" />
+    <file signType="Authenticode" src="__INPATHROOT__\dxrfallbackcompiler.dll"/>
+  </job>
+ <job platform="arm64" certSubject="CN=Microsoft Corporation, O=Microsoft Corporation, L=Redmond, S=Washington, C=US" configuration="Release" dest="__OUTPATHROOT__" jobname="DirectX Shader Compiler" approvers="">
+    <file signType="Authenticode" src="__INPATHROOT__\dxcompiler.dll" />
+    <file signType="Authenticode" src="__INPATHROOT__\dxc.exe" />
+    <file signType="Authenticode" src="__INPATHROOT__\dxrfallbackcompiler.dll"/>
+  </job>
+</SignConfigXML>
\ No newline at end of file
diff --git a/utils/hct/pkgesbuild/common_setup.cmd b/utils/hct/pkgesbuild/common_setup.cmd
new file mode 100644
index 000000000..26bfb7a0c
--- /dev/null
+++ b/utils/hct/pkgesbuild/common_setup.cmd
@@ -0,0 +1,18 @@
+echo off
+REM Shared environment setup; called by pre_build_setup.cmd and pre_test_setup.cmd.
+REM %1 - $(BuildPlatform)
+REM %2 - $(Build.SourcesDirectory)
+REM %3 - $(Build.BinariesDirectory)
+REM The ~f modifier below expands arguments 2 and 3 to fully qualified paths.
+set BUILD_PLATFORM=%1
+set HLSL_SRC_DIR=%~f2
+set HLSL_BLD_DIR=%~f3\%BUILD_PLATFORM%
+
+rem Set the Windows 10 SDK path/version (FindD3D12.cmake reads both from the environment) and prepend the SDK to PATH
+set WIN10_SDK_PATH=%HLSL_SRC_DIR%\Packages\MS.Uwp.RS5RLS.Native.10.0.17763.1\l
+set WIN10_SDK_VERSION=10.0.17763
+set PATH=%WIN10_SDK_PATH%;%PATH%;
+
+rem Append the Visual Studio shared Python (Python36_64) and VS 2017 Enterprise CMake directories to PATH
+set PATH=%PATH%;C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python36_64\
+set PATH=%PATH%;C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\;
diff --git a/utils/hct/pkgesbuild/nuget.config b/utils/hct/pkgesbuild/nuget.config
new file mode 100644
index 000000000..9a350915e
--- /dev/null
+++ b/utils/hct/pkgesbuild/nuget.config
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<configuration>
+  <packageSources>
+    <add key="nuget.org" value="https://api.nuget.org/v3/index.json" />
+    <add key="msftnuget" value="https://microsoft.pkgs.visualstudio.com/_packaging/MSFTNuget/nuget/v3/index.json" />
+    <add key="Taef" value="https://microsoft.pkgs.visualstudio.com/_packaging/Taef/nuget/v3/index.json" />
+  </packageSources>
+</configuration>
diff --git a/utils/hct/pkgesbuild/packages.config b/utils/hct/pkgesbuild/packages.config
new file mode 100644
index 000000000..64c900bb9
--- /dev/null
+++ b/utils/hct/pkgesbuild/packages.config
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="utf-8"?>
+<packages>
+  <!-- Windows 10 SDK package -->
+  <package id="MS.Uwp.rs5rls.Native" version="10.0.17763.1" /> 
+  <!-- TAEF package -->
+  <package id="Taef.Redist" version="10.33.181113003-develop" />
+</packages>
diff --git a/utils/hct/pkgesbuild/pre_build_setup.cmd b/utils/hct/pkgesbuild/pre_build_setup.cmd
new file mode 100644
index 000000000..557e8352b
--- /dev/null
+++ b/utils/hct/pkgesbuild/pre_build_setup.cmd
@@ -0,0 +1,14 @@
+@echo off
+REM Pre-build setup: loads the shared environment and prints it to the build log.
+REM %1 - $(BuildPlatform)
+REM %2 - $(Build.SourcesDirectory)
+REM %3 - $(Build.BinariesDirectory)
+
+echo Running pre_build_setup.cmd
+REM ~dp0 yields this script's drive and directory (with trailing backslash); quoted so paths with spaces work.
+call "%~dp0common_setup.cmd" %1 %2 %3
+
+echo Build platform: %BUILD_PLATFORM%
+echo HLSL source directory: %HLSL_SRC_DIR%
+echo HLSL build directory: %HLSL_BLD_DIR%
+echo SDK path: %WIN10_SDK_PATH%
\ No newline at end of file
diff --git a/utils/hct/pkgesbuild/pre_test_setup.cmd b/utils/hct/pkgesbuild/pre_test_setup.cmd
new file mode 100644
index 000000000..00ea351ce
--- /dev/null
+++ b/utils/hct/pkgesbuild/pre_test_setup.cmd
@@ -0,0 +1,17 @@
+@echo off
+REM Pre-test setup: loads the shared environment, then appends the TAEF binaries to PATH.
+REM %1 - $(BuildPlatform)
+REM %2 - $(Build.SourcesDirectory)
+REM %3 - $(Build.BinariesDirectory)
+
+echo Running pre_test_setup.cmd
+REM ~dp0 yields this script's drive and directory (with trailing backslash); quoted so paths with spaces work.
+call "%~dp0common_setup.cmd" %1 %2 %3
+
+echo Build platform: %BUILD_PLATFORM%
+echo HLSL build directory: %HLSL_BLD_DIR%
+
+rem Add the TAEF package to PATH (the x64 binaries are used regardless of BUILD_PLATFORM)
+set TAEF_PATH=%HLSL_SRC_DIR%\Packages\Taef.Redist.10.33.181113003-develop\build\Binaries\Release\x64
+set PATH=%PATH%;%TAEF_PATH%
+echo TAEF path: %TAEF_PATH%
\ No newline at end of file