From 8226092fd7ff27069f75ef8f1a45f493712c1834 Mon Sep 17 00:00:00 2001
From: Tristan Labelle <github@tristanlabelle.com>
Date: Mon, 3 Dec 2018 13:28:22 -0800
Subject: [PATCH] Change in-memory representation of bool vectors to use i32's
 (#1742)

This change fixes an inconsistency between scalar and vector bools, where the formers were stored as i32s in memory (as opposed to i1s in registers) while the latter remained <n x i1>s. This lead to a number of crashes in more complex bool scenarios where scalar and vector element types were expected to match.

The solution is to extend Clang's strategy for scalar bools to vector of bools. Clang relies on three key functions to deal with differences between register and memory type representations which are special-cased for bools and now also for bool vectors. Several HLSL code additions did not properly leverage those functions and this was addressed wherever possible, in some cases removing the need for special cases for bools.

To deal with matrices, a similar concept of register/memory representation was introduced in the matrix lowering code and the lowering passes were updated accordingly.
---
 include/dxc/HLSL/HLMatrixLowerHelper.h        |  14 +-
 lib/HLSL/HLMatrixLowerPass.cpp                |  94 +++++++++++--
 lib/HLSL/HLOperationLower.cpp                 |  31 +++--
 lib/HLSL/HLSignatureLower.cpp                 | 124 ++++++++----------
 tools/clang/lib/AST/ASTContext.cpp            |   1 -
 tools/clang/lib/Basic/Targets.cpp             |   4 +-
 tools/clang/lib/CodeGen/CGExpr.cpp            |  45 ++++---
 tools/clang/lib/CodeGen/CGHLSLMS.cpp          |  47 ++++---
 tools/clang/lib/CodeGen/CodeGenTypes.cpp      |  23 +++-
 .../clang/test/CodeGenHLSL/RValSubscript.hlsl |   4 +-
 .../bool_loadbuf_storebuf_memrepr.hlsl        | 110 ++++++++++++++++
 .../CodeGenHLSL/quick-test/bool_memrepr.hlsl  |  52 ++++++++
 .../quick-test/bool_scalar_swizzle.hlsl       |  15 +++
 .../CodeGenHLSL/quick-test/bool_stress.hlsl   |  54 ++++++++
 .../quick-test/global-var-write-test04.hlsl   |   2 +-
 15 files changed, 482 insertions(+), 138 deletions(-)
 create mode 100644 tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl
 create mode 100644 tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl
 create mode 100644 tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
 create mode 100644 tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
diff --git a/include/dxc/HLSL/HLMatrixLowerHelper.h b/include/dxc/HLSL/HLMatrixLowerHelper.h
index 39a432319..de61be469 100644
--- a/include/dxc/HLSL/HLMatrixLowerHelper.h
+++ b/include/dxc/HLSL/HLMatrixLowerHelper.h
@@ -31,17 +31,27 @@ bool IsMatrixType(llvm::Type *Ty);
 DxilFieldAnnotation *FindAnnotationFromMatUser(llvm::Value *Mat,
                                                DxilTypeSystem &typeSys);
 // Translate matrix type to vector type.
-llvm::Type *LowerMatrixType(llvm::Type *Ty);
+llvm::Type *LowerMatrixType(llvm::Type *Ty, bool forMem = false);
 // TODO: use type annotation.
 llvm::Type *GetMatrixInfo(llvm::Type *Ty, unsigned &col, unsigned &row);
 // TODO: use type annotation.
 bool IsMatrixArrayPointer(llvm::Type *Ty);
 // Translate matrix array pointer type to vector array pointer type.
-llvm::Type *LowerMatrixArrayPointer(llvm::Type *Ty);
+llvm::Type *LowerMatrixArrayPointer(llvm::Type *Ty, bool forMem = false);
 
 llvm::Value *BuildVector(llvm::Type *EltTy, unsigned size,
                          llvm::ArrayRef<llvm::Value *> elts,
                          llvm::IRBuilder<> &Builder);
+
+llvm::Value *VecMatrixMemToReg(llvm::Value *VecVal, llvm::Type *MatType,
+                               llvm::IRBuilder<> &Builder);
+llvm::Value *VecMatrixRegToMem(llvm::Value* VecVal, llvm::Type *MatType,
+                               llvm::IRBuilder<> &Builder);
+llvm::Instruction *CreateVecMatrixLoad(llvm::Value *VecPtr,
+                                       llvm::Type *MatType, llvm::IRBuilder<> &Builder);
+llvm::Instruction *CreateVecMatrixStore(llvm::Value* VecVal, llvm::Value *VecPtr,
+                                        llvm::Type *MatType, llvm::IRBuilder<> &Builder);
+
 // For case like mat[i][j].
 // IdxList is [i][0], [i][1], [i][2],[i][3].
 // Idx is j.
diff --git a/lib/HLSL/HLMatrixLowerPass.cpp b/lib/HLSL/HLMatrixLowerPass.cpp
index 40c35e8fe..4c736a9e3 100644
--- a/lib/HLSL/HLMatrixLowerPass.cpp
+++ b/lib/HLSL/HLMatrixLowerPass.cpp
@@ -69,7 +69,7 @@ DxilFieldAnnotation *FindAnnotationFromMatUser(Value *Mat,
 }
 
 // Translate matrix type to vector type.
-Type *LowerMatrixType(Type *Ty) {
+Type *LowerMatrixType(Type *Ty, bool forMem) {
   // Only translate matrix type and function type which use matrix type.
   // Not translate struct has matrix or matrix pointer.
   // Struct should be flattened before.
@@ -84,6 +84,8 @@ Type *LowerMatrixType(Type *Ty) {
   } else if (IsMatrixType(Ty)) {
     unsigned row, col;
     Type *EltTy = GetMatrixInfo(Ty, col, row);
+    if (forMem && EltTy->isIntegerTy(1))
+      EltTy = Type::getInt32Ty(Ty->getContext());
     return VectorType::get(EltTy, row * col);
   } else {
     return Ty;
@@ -122,7 +124,7 @@ bool IsMatrixArrayPointer(llvm::Type *Ty) {
     Ty = Ty->getArrayElementType();
   return IsMatrixType(Ty);
 }
-Type *LowerMatrixArrayPointer(Type *Ty) {
+Type *LowerMatrixArrayPointer(Type *Ty, bool forMem) {
   unsigned addrSpace = Ty->getPointerAddressSpace();
   Ty = Ty->getPointerElementType();
   std::vector<unsigned> arraySizeList;
@@ -130,7 +132,7 @@ Type *LowerMatrixArrayPointer(Type *Ty) {
     arraySizeList.push_back(Ty->getArrayNumElements());
     Ty = Ty->getArrayElementType();
   }
-  Ty = LowerMatrixType(Ty);
+  Ty = LowerMatrixType(Ty, forMem);
 
   for (auto arraySize = arraySizeList.rbegin();
        arraySize != arraySizeList.rend(); arraySize++)
@@ -155,13 +157,69 @@ Type *LowerMatrixArrayPointerToOneDimArray(Type *Ty) {
   return PointerType::get(Ty, addrSpace);
 }
 Value *BuildVector(Type *EltTy, unsigned size, ArrayRef<llvm::Value *> elts,
-                   IRBuilder<> &Builder) {
+  IRBuilder<> &Builder) {
   Value *Vec = UndefValue::get(VectorType::get(EltTy, size));
   for (unsigned i = 0; i < size; i++)
     Vec = Builder.CreateInsertElement(Vec, elts[i], i);
   return Vec;
 }
 
+llvm::Value *VecMatrixMemToReg(llvm::Value *VecVal, llvm::Type *MatType,
+  llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatRegTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/false);
+  if (VecVal->getType() == VecMatRegTy) {
+    return VecVal;
+  }
+
+  DXASSERT(VecMatRegTy->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix mem to reg type mismatch should only happen for bools.");
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  return Builder.CreateICmpNE(VecVal, Constant::getNullValue(VecMatMemTy));
+}
+
+llvm::Value *VecMatrixRegToMem(llvm::Value* VecVal, llvm::Type *MatType,
+  llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  if (VecVal->getType() == VecMatMemTy) {
+    return VecVal;
+  }
+
+  DXASSERT(VecVal->getType()->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix reg to mem type mismatch should only happen for bools.");
+  return Builder.CreateZExt(VecVal, VecMatMemTy);
+}
+
+llvm::Instruction *CreateVecMatrixLoad(
+  llvm::Value *VecPtr, llvm::Type *MatType, llvm::IRBuilder<> &Builder)
+{
+  llvm::Instruction *VecVal = Builder.CreateLoad(VecPtr);
+  return cast<llvm::Instruction>(VecMatrixMemToReg(VecVal, MatType, Builder));
+}
+
+llvm::Instruction *CreateVecMatrixStore(llvm::Value* VecVal, llvm::Value *VecPtr,
+  llvm::Type *MatType, llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  if (VecVal->getType() == VecMatMemTy) {
+    return Builder.CreateStore(VecVal, VecPtr);
+  }
+
+  // We need to convert to the memory representation, and we want to return
+  // the conversion instruction rather than the store since that's what
+  // accepts the register-typed i1 values.
+
+  // Do not use VecMatrixRegToMem as it may constant fold the conversion
+  // instruction, which is what we want to return.
+  DXASSERT(VecVal->getType()->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix reg to mem type mismatch should only happen for bools.");
+
+  llvm::Instruction *ConvInst = Builder.Insert(new ZExtInst(VecVal, VecMatMemTy));
+  Builder.CreateStore(ConvInst, VecPtr);
+  return ConvInst;
+}
+
 Value *LowerGEPOnMatIndexListToIndex(
     llvm::GetElementPtrInst *GEP, ArrayRef<Value *> IdxList) {
   IRBuilder<> Builder(GEP);
@@ -508,7 +566,7 @@ Instruction *HLMatrixLowerPass::MatLdStToVec(CallInst *CI) {
     if (isa<AllocaInst>(matPtr) || GetIfMatrixGEPOfUDTAlloca(matPtr) ||
         GetIfMatrixGEPOfUDTArg(matPtr, *m_pHLModule)) {
       Value *vecPtr = matToVecMap[cast<Instruction>(matPtr)];
-      result = Builder.CreateLoad(vecPtr);
+      result = CreateVecMatrixLoad(vecPtr, matPtr->getType()->getPointerElementType(), Builder);
     } else
       result = MatIntrinsicToVec(CI);
   } break;
@@ -519,9 +577,8 @@ Instruction *HLMatrixLowerPass::MatLdStToVec(CallInst *CI) {
         GetIfMatrixGEPOfUDTArg(matPtr, *m_pHLModule)) {
       Value *vecPtr = matToVecMap[cast<Instruction>(matPtr)];
       Value *matVal = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      Value *vecVal =
-          UndefValue::get(HLMatrixLower::LowerMatrixType(matVal->getType()));
-      result = Builder.CreateStore(vecVal, vecPtr);
+      Value *vecVal = UndefValue::get(HLMatrixLower::LowerMatrixType(matVal->getType()));
+      result = CreateVecMatrixStore(vecVal, vecPtr, matVal->getType(), Builder);
     } else
       result = MatIntrinsicToVec(CI);
   } break;
@@ -905,11 +962,11 @@ void HLMatrixLowerPass::lowerToVec(Instruction *matInst) {
     
     IRBuilder<> AllocaBuilder(AI);
     if (Ty->isArrayTy()) {
-      Type *vecTy = HLMatrixLower::LowerMatrixArrayPointer(AI->getType());
+      Type *vecTy = HLMatrixLower::LowerMatrixArrayPointer(AI->getType(), /*forMem*/ true);
       vecTy = vecTy->getPointerElementType();
       vecVal = AllocaBuilder.CreateAlloca(vecTy, nullptr, AI->getName());
     } else {
-      Type *vecTy = HLMatrixLower::LowerMatrixType(matTy);
+      Type *vecTy = HLMatrixLower::LowerMatrixType(matTy, /*forMem*/ true);
       vecVal = AllocaBuilder.CreateAlloca(vecTy, nullptr, AI->getName());
     }
     // Update debug info.
@@ -2059,7 +2116,8 @@ void HLMatrixLowerPass::TranslateMatArrayGEP(Value *matInst,
           // Skip the vector version.
           if (useCall->getType()->isVectorTy())
             continue;
-          Value *newLd = Builder.CreateLoad(newGEP);
+          Type *matTy = useCall->getType();
+          Value *newLd = CreateVecMatrixLoad(newGEP, matTy, Builder);
           DXASSERT(matToVecMap.count(useCall), "must have vec version");
           Value *oldLd = matToVecMap[useCall];
           // Delete the oldLd.
@@ -2082,7 +2140,7 @@ void HLMatrixLowerPass::TranslateMatArrayGEP(Value *matInst,
 
           DXASSERT(matToVecMap.count(matInst), "must have vec version");
           Value *vecVal = matToVecMap[matInst];
-          Builder.CreateStore(vecVal, vecPtr);
+          CreateVecMatrixStore(vecVal, vecPtr, matVal->getType(), Builder);
         } break;
         }
       } break;
@@ -2174,9 +2232,17 @@ void HLMatrixLowerPass::replaceMatWithVec(Value *matVal,
           // Load Already translated in lowerToVec.
           // Store val operand will be set by the val use.
           // Do nothing here.
-        } else if (StoreInst *stInst = dyn_cast<StoreInst>(vecUser))
+        } else if (StoreInst *stInst = dyn_cast<StoreInst>(vecUser)) {
+          DXASSERT(vecVal->getType() == stInst->getValueOperand()->getType(),
+            "Mismatched vector matrix store value types.");
           stInst->setOperand(0, vecVal);
-        else
+        } else if (ZExtInst *zextInst = dyn_cast<ZExtInst>(vecUser)) {
+          // This happens when storing bool matrices,
+          // which must first undergo conversion from i1's to i32's.
+          DXASSERT(vecVal->getType() == zextInst->getOperand(0)->getType(),
+            "Mismatched vector matrix store value types.");
+          zextInst->setOperand(0, vecVal);
+        } else
           TrivialMatReplace(matVal, vecVal, useCall);
 
       } break;
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index b0426f501..b2db0646a 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4988,7 +4988,8 @@ Value *TranslateConstBufMatLd(Type *matType, Value *handle, Value *offset,
                               bool colMajor, OP *OP, const DataLayout &DL,
                               IRBuilder<> &Builder) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
   unsigned matSize = col * row;
   std::vector<Value *> elts(matSize);
   Value *EltByteSize = ConstantInt::get(
@@ -5001,7 +5002,9 @@ Value *TranslateConstBufMatLd(Type *matType, Value *handle, Value *offset,
     baseOffset = Builder.CreateAdd(baseOffset, EltByteSize);
   }
 
-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value* Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }
 
 void TranslateCBGep(GetElementPtrInst *GEP, Value *handle, Value *baseOffset,
@@ -5417,10 +5420,11 @@ Value *GenerateCBLoadLegacy(Value *handle, Value *legacyIdx,
 
 Value *TranslateConstBufMatLdLegacy(Type *matType, Value *handle,
                                     Value *legacyIdx, bool colMajor, OP *OP,
-                                    const DataLayout &DL,
+                                    bool memElemRepr, const DataLayout &DL,
                                     IRBuilder<> &Builder) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/memElemRepr)->getVectorElementType();
 
   unsigned matSize = col * row;
   std::vector<Value *> elts(matSize);
@@ -5506,8 +5510,9 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
       Type *matType = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
                           ->getType()
                           ->getPointerElementType();
+      // This will replace a call, so we should use the register representation of elements
       Value *newLd = TranslateConstBufMatLdLegacy(
-          matType, handle, legacyIdx, colMajor, hlslOP, DL, Builder);
+          matType, handle, legacyIdx, colMajor, hlslOP, /*memElemRepr*/false, DL, Builder);
       CI->replaceAllUsesWith(newLd);
       CI->eraseFromParent();
     } else if (group == HLOpcodeGroup::HLSubscript) {
@@ -5534,8 +5539,9 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
 
       Value *ldData = UndefValue::get(resultType);
       if (!dynamicIndexing) {
+        // This will replace a load or GEP, so we should use the memory representation of elements
         Value *matLd = TranslateConstBufMatLdLegacy(
-            matType, handle, legacyIdx, colMajor, hlslOP, DL, Builder);
+            matType, handle, legacyIdx, colMajor, hlslOP, /*memElemRepr*/true, DL, Builder);
         // The matLd is keep original layout, just use the idx calc in
         // EmitHLSLMatrixElement and EmitHLSLMatrixSubscript.
         switch (subOp) {
@@ -6022,7 +6028,8 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
                                Value *bufIdx, Value *baseOffset,
                                bool colMajor, const DataLayout &DL) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
   unsigned  EltSize = DL.getTypeAllocSize(EltTy);
   Constant* alignment = OP->GetI32Const(EltSize);
 
@@ -6054,14 +6061,20 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 
-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value *Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }
 
 void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                              hlsl::OP *OP, Value *bufIdx, Value *baseOffset,
                              Value *val, bool colMajor, const DataLayout &DL) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
+
+  val = HLMatrixLower::VecMatrixRegToMem(val, matType, Builder);
+
   unsigned EltSize = DL.getTypeAllocSize(EltTy);
   Constant *Alignment = OP->GetI32Const(EltSize);
   Value *offset = baseOffset;
diff --git a/lib/HLSL/HLSignatureLower.cpp b/lib/HLSL/HLSignatureLower.cpp
index 648d775db..3228d487c 100644
--- a/lib/HLSL/HLSignatureLower.cpp
+++ b/lib/HLSL/HLSignatureLower.cpp
@@ -912,80 +912,54 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
     DXASSERT_NOMSG(group == HLOpcodeGroup::HLMatLoadStore);
     HLMatLoadStoreOpcode matOp = static_cast<HLMatLoadStoreOpcode>(opcode);
     switch (matOp) {
-    case HLMatLoadStoreOpcode::ColMatLoad: {
-      IRBuilder<> LocalBuilder(CI);
-      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
-                        ->getType()
-                        ->getPointerElementType();
-      unsigned col, row;
-      Type *EltTy = HLMatrixLower::GetMatrixInfo(matTy, col, row);
-      std::vector<Value *> matElts(col * row);
-      for (unsigned c = 0; c < col; c++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(c);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-        for (unsigned r = 0; r < row; r++) {
-          SmallVector<Value *, 4> args = {OpArg, ID, rowIdx, columnConsts[r]};
-          if (vertexID)
-            args.emplace_back(vertexID);
-
-          Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-          unsigned matIdx = c * row + r;
-          matElts[matIdx] = input;
-        }
-      }
-      Value *newVec =
-          HLMatrixLower::BuildVector(EltTy, col * row, matElts, LocalBuilder);
-      CI->replaceAllUsesWith(newVec);
-      CI->eraseFromParent();
-    } break;
+    case HLMatLoadStoreOpcode::ColMatLoad:
     case HLMatLoadStoreOpcode::RowMatLoad: {
       IRBuilder<> LocalBuilder(CI);
       Type *matTy = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
                         ->getType()
                         ->getPointerElementType();
       unsigned col, row;
-      Type *EltTy = HLMatrixLower::GetMatrixInfo(matTy, col, row);
+      HLMatrixLower::GetMatrixInfo(matTy, col, row);
       std::vector<Value *> matElts(col * row);
-      for (unsigned r = 0; r < row; r++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(r);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-        for (unsigned c = 0; c < col; c++) {
-          SmallVector<Value *, 4> args = {OpArg, ID, rowIdx, columnConsts[c]};
-          if (vertexID)
-            args.emplace_back(vertexID);
 
-          Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-          unsigned matIdx = r * col + c;
-          matElts[matIdx] = input;
+      if (matOp == HLMatLoadStoreOpcode::ColMatLoad) {
+        for (unsigned c = 0; c < col; c++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(c);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned r = 0; r < row; r++) {
+            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[r] };
+            if (vertexID)
+              args.emplace_back(vertexID);
+
+            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+            unsigned matIdx = c * row + r;
+            matElts[matIdx] = input;
+          }
+        }
+      } else {
+        for (unsigned r = 0; r < row; r++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(r);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned c = 0; c < col; c++) {
+            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[c] };
+            if (vertexID)
+              args.emplace_back(vertexID);
+
+            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+            unsigned matIdx = r * col + c;
+            matElts[matIdx] = input;
+          }
         }
       }
+
       Value *newVec =
-          HLMatrixLower::BuildVector(EltTy, col * row, matElts, LocalBuilder);
+          HLMatrixLower::BuildVector(matElts[0]->getType(), col * row, matElts, LocalBuilder);
+      newVec = HLMatrixLower::VecMatrixMemToReg(newVec, matTy, LocalBuilder);
+
       CI->replaceAllUsesWith(newVec);
       CI->eraseFromParent();
     } break;
-    case HLMatLoadStoreOpcode::ColMatStore: {
-      IRBuilder<> LocalBuilder(CI);
-      Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx)
-                        ->getType()
-                        ->getPointerElementType();
-      unsigned col, row;
-      HLMatrixLower::GetMatrixInfo(matTy, col, row);
-
-      for (unsigned c = 0; c < col; c++) {
-        Constant *constColIdx = LocalBuilder.getInt32(c);
-        Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
-
-        for (unsigned r = 0; r < row; r++) {
-          unsigned matIdx = HLMatrixLower::GetColMajorIdx(r, c, row);
-          Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-          LocalBuilder.CreateCall(ldStFunc,
-                                  {OpArg, ID, colIdx, columnConsts[r], Elt});
-        }
-      }
-      CI->eraseFromParent();
-    } break;
+    case HLMatLoadStoreOpcode::ColMatStore:
     case HLMatLoadStoreOpcode::RowMatStore: {
       IRBuilder<> LocalBuilder(CI);
       Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
@@ -995,14 +969,30 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
       unsigned col, row;
       HLMatrixLower::GetMatrixInfo(matTy, col, row);
 
-      for (unsigned r = 0; r < row; r++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(r);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+      Val = HLMatrixLower::VecMatrixRegToMem(Val, matTy, LocalBuilder);
+
+      if (matOp == HLMatLoadStoreOpcode::ColMatStore) {
         for (unsigned c = 0; c < col; c++) {
-          unsigned matIdx = HLMatrixLower::GetRowMajorIdx(r, c, col);
-          Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-          LocalBuilder.CreateCall(ldStFunc,
-                                  {OpArg, ID, rowIdx, columnConsts[c], Elt});
+          Constant *constColIdx = LocalBuilder.getInt32(c);
+          Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
+
+          for (unsigned r = 0; r < row; r++) {
+            unsigned matIdx = HLMatrixLower::GetColMajorIdx(r, c, row);
+            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+            LocalBuilder.CreateCall(ldStFunc,
+              { OpArg, ID, colIdx, columnConsts[r], Elt });
+          }
+        }
+      } else {
+        for (unsigned r = 0; r < row; r++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(r);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned c = 0; c < col; c++) {
+            unsigned matIdx = HLMatrixLower::GetRowMajorIdx(r, c, col);
+            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+            LocalBuilder.CreateCall(ldStFunc,
+              { OpArg, ID, rowIdx, columnConsts[c], Elt });
+          }
         }
       }
       CI->eraseFromParent();
diff --git a/tools/clang/lib/AST/ASTContext.cpp b/tools/clang/lib/AST/ASTContext.cpp
index a695e9da5..af88bfa50 100644
--- a/tools/clang/lib/AST/ASTContext.cpp
+++ b/tools/clang/lib/AST/ASTContext.cpp
@@ -1568,7 +1568,6 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     // Vector align to its element.
     if (getLangOpts().HLSL) {
       Align = EltInfo.Align;
-      Width = Align * VT->getNumElements();
     }
     // HLSL Change Ends.
     // If the alignment is not a power of 2, round up to the next power of 2.
diff --git a/tools/clang/lib/Basic/Targets.cpp b/tools/clang/lib/Basic/Targets.cpp
index fb9af0323..71e08a4e7 100644
--- a/tools/clang/lib/Basic/Targets.cpp
+++ b/tools/clang/lib/Basic/Targets.cpp
@@ -6991,9 +6991,7 @@ public:
     LongWidth = LongAlign = 32;
     LongDoubleWidth = LongDoubleAlign = 64;
     LongDoubleFormat = &llvm::APFloat::IEEEdouble;
-    BoolWidth = 32;
-    // To avoid member for alignment.
-    BoolAlign = 8;
+    BoolWidth = BoolAlign = 32;
 
     // using the Microsoft ABI.
     TheCXXABI.set(TargetCXXABI::Microsoft);
diff --git a/tools/clang/lib/CodeGen/CGExpr.cpp b/tools/clang/lib/CodeGen/CGExpr.cpp
index f23aa37f0..422feab4a 100644
--- a/tools/clang/lib/CodeGen/CGExpr.cpp
+++ b/tools/clang/lib/CodeGen/CGExpr.cpp
@@ -1082,6 +1082,14 @@ static bool hasBooleanRepresentation(QualType Ty) {
   return false;
 }
 
+// HLSL Change Begin.
+static bool hasBooleanScalarOrVectorRepresentation(QualType Ty) {
+  if (hlsl::IsHLSLVecType(Ty))
+    return hasBooleanRepresentation(hlsl::GetElementTypeOrType(Ty));
+  return hasBooleanRepresentation(Ty);
+}
+// HLSL Change End.
+
 static bool getRangeForType(CodeGenFunction &CGF, QualType Ty,
                             llvm::APInt &Min, llvm::APInt &End,
                             bool StrictEnums) {
@@ -1233,30 +1241,31 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(llvm::Value *Addr, bool Volatile,
 }
 
 llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) {
-  // Bool has a different representation in memory than in registers.
-  if (hasBooleanRepresentation(Ty)) {
+  // HLSL Change Begin.
+  // Bool scalar and vectors have a different representation in memory than in registers.
+  if (hasBooleanScalarOrVectorRepresentation(Ty)) {
     // This should really always be an i1, but sometimes it's already
     // an i8, and it's awkward to track those cases down.
-    if (Value->getType()->isIntegerTy(1))
+    llvm::Type *ValTy = Value->getType();
+    llvm::Type *VecElemTy = ValTy->isVectorTy() ? ValTy->getVectorElementType() : ValTy;
+    if (VecElemTy->isIntegerTy(1))
       return Builder.CreateZExt(Value, ConvertTypeForMem(Ty), "frombool");
-    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
-           "wrong value rep of bool");
   }
+  // HLSL Change End.
 
   return Value;
 }
 
 llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) {
-  // Bool has a different representation in memory than in registers.
-  if (hasBooleanRepresentation(Ty)) {
-    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
-           "wrong value rep of bool");
-    // HLSL Change Begin.
+  // HLSL Change Begin.
+  // Bool scalar and vectors have a different representation in memory than in registers.
+  if (hasBooleanScalarOrVectorRepresentation(Ty)) {
+    llvm::Type *ValTy = Value->getType();
     // Use ne v, 0 to convert to i1 instead of trunc.
     return Builder.CreateICmpNE(
-        Value, llvm::ConstantInt::get(Value->getType(), 0), "tobool");
-    // HLSL Change End.
+        Value, llvm::ConstantVector::getNullValue(ValTy), "tobool");
   }
+  // HLSL Change End.
 
   return Value;
 }
@@ -1475,6 +1484,8 @@ RValue CodeGenFunction::EmitLoadOfExtVectorElementLValue(LValue LV) {
   Load->setAlignment(LV.getAlignment().getQuantity());
   llvm::Value *Vec = Load;
 
+  Vec = EmitFromMemory(Vec, LV.getType()); // HLSL Change
+
   const llvm::Constant *Elts = LV.getExtVectorElts();
 
   // If the result of the expression is a non-vector type, we must be extracting
@@ -1748,7 +1759,10 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src,
   const llvm::Constant *Elts = Dst.getExtVectorElts();
 
   llvm::Value *SrcVal = Src.getScalarVal();
+
   // HLSL Change Starts
+  SrcVal = EmitToMemory(SrcVal, Dst.getType());
+
   const VectorType *VTy = Dst.getType()->getAs<VectorType>();
   if (VTy == nullptr && getContext().getLangOpts().HLSL)
     VTy =
@@ -2918,11 +2932,12 @@ CodeGenFunction::EmitHLSLVectorElementExpr(const HLSLVectorElementExpr *E) {
     assert(hlsl::IsHLSLVecType(E->getBase()->getType()) &&
            "Result must be a vector");
     llvm::Value *Vec = EmitScalarExpr(E->getBase());
+    Vec = EmitToMemory(Vec, E->getBase()->getType());
 
     // Store the vector to memory (because LValue wants an address).
-    llvm::Value *VecMem = CreateMemTemp(E->getBase()->getType());
-    Builder.CreateStore(Vec, VecMem);
-    Base = MakeAddrLValue(VecMem, E->getBase()->getType());
+    llvm::Value *VecMemPtr = CreateMemTemp(E->getBase()->getType());
+    Builder.CreateStore(Vec, VecMemPtr);
+    Base = MakeAddrLValue(VecMemPtr, E->getBase()->getType());
   }
 
   QualType type =
diff --git a/tools/clang/lib/CodeGen/CGHLSLMS.cpp b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
index b71a9983d..66c0d79d0 100644
--- a/tools/clang/lib/CodeGen/CGHLSLMS.cpp
+++ b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
@@ -3838,6 +3838,15 @@ static Value *CastLdValue(Value *Ptr, llvm::Type *FromTy, llvm::Type *ToTy, IRBu
       // Change scalar into vec1.
       Value *Vec1 = UndefValue::get(ToTy);
       return Builder.CreateInsertElement(Vec1, V, (uint64_t)0);
+    } else if (vecSize == 1 && FromTy->isIntegerTy()
+      && ToTy->getVectorElementType()->isIntegerTy(1)) {
+      // load(bitcast i32* to <1 x i1>*)
+      // Rewrite to
+      // insertelement(icmp ne (load i32*), 0)
+      Value *IntV = Builder.CreateLoad(Ptr);
+      Value *BoolV = Builder.CreateICmpNE(IntV, ConstantInt::get(IntV->getType(), 0), "tobool");
+      Value *Vec1 = UndefValue::get(ToTy);
+      return Builder.CreateInsertElement(Vec1, BoolV, (uint64_t)0);
     } else if (FromTy->isVectorTy() && vecSize == 1) {
       Value *V = Builder.CreateLoad(Ptr);
       // VectorTrunc
@@ -5468,15 +5477,20 @@ static void AddMissingCastOpsInInitList(SmallVector<Value *, 4> &elts, SmallVect
 
 static void StoreInitListToDestPtr(Value *DestPtr,
                                    SmallVector<Value *, 4> &elts, unsigned &idx,
-                                   QualType Type, CodeGenTypes &Types, bool bDefaultRowMajor,
-                                   CGBuilderTy &Builder, llvm::Module &M) {
+                                   QualType Type, bool bDefaultRowMajor,
+                                   CodeGenFunction &CGF, llvm::Module &M) {
+  CodeGenTypes &Types = CGF.getTypes();
+  CGBuilderTy &Builder = CGF.Builder;
+
   llvm::Type *Ty = DestPtr->getType()->getPointerElementType();
   llvm::Type *i32Ty = llvm::Type::getInt32Ty(Ty->getContext());
 
   if (Ty->isVectorTy()) {
-    Value *Result = UndefValue::get(Ty);
-    for (unsigned i = 0; i < Ty->getVectorNumElements(); i++)
+    llvm::Type *RegTy = CGF.ConvertType(Type);
+    Value *Result = UndefValue::get(RegTy);
+    for (unsigned i = 0; i < RegTy->getVectorNumElements(); i++)
       Result = Builder.CreateInsertElement(Result, elts[idx + i], i);
+    Result = CGF.EmitToMemory(Result, Type);
     Builder.CreateStore(Result, DestPtr);
     idx += Ty->getVectorNumElements();
   } else if (HLMatrixLower::IsMatrixType(Ty)) {
@@ -5541,8 +5555,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
             unsigned i = RL.getNonVirtualBaseLLVMFieldNo(BaseDecl);
             Constant *gepIdx = ConstantInt::get(i32Ty, i);
             Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-            StoreInitListToDestPtr(GEP, elts, idx, parentTy, Types,
-                                   bDefaultRowMajor, Builder, M);
+            StoreInitListToDestPtr(GEP, elts, idx, parentTy,
+                                   bDefaultRowMajor, CGF, M);
           }
         }
       }
@@ -5550,8 +5564,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
         unsigned i = RL.getLLVMFieldNo(field);
         Constant *gepIdx = ConstantInt::get(i32Ty, i);
         Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-        StoreInitListToDestPtr(GEP, elts, idx, field->getType(), Types,
-                               bDefaultRowMajor, Builder, M);
+        StoreInitListToDestPtr(GEP, elts, idx, field->getType(),
+                               bDefaultRowMajor, CGF, M);
       }
     }
   } else if (Ty->isArrayTy()) {
@@ -5560,8 +5574,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
     for (unsigned i = 0; i < Ty->getArrayNumElements(); i++) {
       Constant *gepIdx = ConstantInt::get(i32Ty, i);
       Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-      StoreInitListToDestPtr(GEP, elts, idx, EltType, Types, bDefaultRowMajor,
-                             Builder, M);
+      StoreInitListToDestPtr(GEP, elts, idx, EltType, bDefaultRowMajor,
+                             CGF, M);
     }
   } else {
     DXASSERT(Ty->isSingleValueType(), "invalid type");
@@ -5741,8 +5755,8 @@ Value *CGMSHLSLRuntime::EmitHLSLInitListExpr(CodeGenFunction &CGF, InitListExpr
     ParamList.append(EltValList.begin(), EltValList.end());
     idx = 0;
     bool bDefaultRowMajor = m_pHLModule->GetHLOptions().bDefaultRowMajor;
-    StoreInitListToDestPtr(DestPtr, EltValList, idx, ResultTy, CGF.getTypes(),
-                           bDefaultRowMajor, CGF.Builder, TheModule);
+    StoreInitListToDestPtr(DestPtr, EltValList, idx, ResultTy,
+                           bDefaultRowMajor, CGF, TheModule);
     return nullptr;
   }
 
@@ -7077,15 +7091,10 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionInit(
     BasicBlock *InsertBlock = CGF.Builder.GetInsertBlock();
     Function *F = InsertBlock->getParent();
 
-    if (ParamTy->isBooleanType()) {
-      // Create i32 for bool.
-      ParamTy = CGM.getContext().IntTy;
-    }
     // Make sure the alloca is in entry block to stop inline create stacksave.
     IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(F));
-    tmpArgAddr = AllocaBuilder.CreateAlloca(CGF.ConvertType(ParamTy));
+    tmpArgAddr = AllocaBuilder.CreateAlloca(CGF.ConvertTypeForMem(ParamTy));
 
-      
     // add it to local decl map
     TmpArgMap(tmpArg, tmpArgAddr);
 
@@ -7164,6 +7173,8 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionCopyBack(
         else
           outVal = EmitHLSLMatrixLoad(CGF, tmpArgAddr, ParamTy);
 
+        outVal = CGF.EmitFromMemory(outVal, ParamTy);
+
         llvm::Type *ToTy = CGF.ConvertType(ArgTy);
         llvm::Type *FromTy = outVal->getType();
         Value *castVal = outVal;
diff --git a/tools/clang/lib/CodeGen/CodeGenTypes.cpp b/tools/clang/lib/CodeGen/CodeGenTypes.cpp
index 425c35ff2..5641cfc8a 100644
--- a/tools/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/tools/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -109,15 +109,26 @@ void CodeGenTypes::addRecordTypeName(const RecordDecl *RD,
 /// a type.  For example, the scalar representation for _Bool is i1, but the
 /// memory representation is usually i8 or i32, depending on the target.
 llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) {
+  // HLSL Change Starts
+  if (hlsl::IsHLSLVecType(T)) {
+    // Vectors of bools in memory should become vectors of
+    // the memory representation of the elements.
+    // Clang doesn't do this for plain VectorTypes,
+    // which is fine otherwise a bool1x1 matrix would become
+    // [n x <m x i32>] since array elements always have memory representation.
+    QualType ElemT = hlsl::GetElementTypeOrType(T);
+    return llvm::VectorType::get(ConvertTypeForMem(ElemT), hlsl::GetHLSLVecSize(T));
+  }
+
   llvm::Type *R = ConvertType(T);
 
-  // If this is a non-bool type, don't map it.
-  if (!R->isIntegerTy(1))
-    return R;
+  if (R->isIntegerTy(1)) {
+    // Bools have a different representation in memory
+    return llvm::IntegerType::get(getLLVMContext(), (unsigned)Context.getTypeSize(T));
+  }
 
-  // Otherwise, return an integer of the target-specified size.
-  return llvm::IntegerType::get(getLLVMContext(),
-                                (unsigned)Context.getTypeSize(T));
+  return R;
+  // HLSL Change Ends
 }
 
 
diff --git a/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl b/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
index 231a55cb8..51432e370 100644
--- a/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
+++ b/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
@@ -5,7 +5,7 @@
 // CHECK: i32 5)
 // CHECK: extractvalue
 // CHECK: , 2
-// CHECK: icmp eq
+// CHECK: icmp ne
 // CHECK 0
 
 // For (x4 < 3)[1]
@@ -47,7 +47,7 @@
 // CHECK: fcmp fast oeq
 // CHECK: fcmp fast oeq
 // CHECK: fcmp fast oeq
-// CHECK: alloca [16 x i1]
+// CHECK: alloca [16 x i32]
 
 
 float4x4 xt;
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl
new file mode 100644
index 000000000..e566a1f0e
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl
@@ -0,0 +1,110 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// Ensure that bools are converted from/to their mem representation when loaded/stored in buffers
+
+// Constant buffer loads
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+// Structured buffer loads
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+// Structured buffer stores
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+  
+
+struct AllTheBools
+{
+    bool2x2 m;
+    bool2 v;
+    bool s;
+    bool2x2 ma[2];
+    bool2 va[2];
+    bool sa[2];
+};
+
+ConstantBuffer<AllTheBools> cb;
+StructuredBuffer<AllTheBools> sb;
+RWStructuredBuffer<AllTheBools> rwsb;
+
+float main(int i : I) : SV_Target
+{
+    float result = 0;
+
+    // Constant buffer loads
+    if (cb.m._22 && cb.v.y && cb.s
+        && cb.ma[1]._22 && cb.va[1].y && cb.sa[1])
+    {
+        result++;
+    }
+    
+    // Structured buffer loads
+    if (sb[0].m._22 && sb[0].v.y && sb[0].s
+        && sb[0].ma[1]._22 && sb[0].va[1].y && sb[0].sa[1])
+    {
+        result++;
+    }
+
+    // Structured buffer stores
+    if (result >= 1.0f)
+    {
+        rwsb[0].m._22 = i == 42;
+        rwsb[0].v.y = i == 42;
+        rwsb[0].s = i == 42;
+        rwsb[0].ma[1]._22 = i == 42;
+        rwsb[0].va[1].y = i == 42;
+        rwsb[0].sa[1] = i == 42;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl
new file mode 100644
index 000000000..20d7ce08f
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl
@@ -0,0 +1,52 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// Ensure that bools are converted from/to their memory representation when loaded/stored
+
+// Local variables should never be i1s
+// CHECK-not: alloca {{.*}}i1
+
+// Test stores
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+
+// Test loads
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+float main(int i : I) : SV_Target
+{
+    bool s = i == 42;
+    bool1 v = i == 42;
+    bool1x1 m = i == 42;
+    bool sa[1] = { i == 42 };
+    bool1 va[1] = { i == 42 };
+    bool1x1 ma[1] = { i == 42 };
+
+    return s && v.x && m._11 && sa[0] && va[0].x && ma[0]._11 ? 1.0f : 2.0f;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
new file mode 100644
index 000000000..9499f3ddc
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// This is mostly a regression test for a bug where a bitcast
+// from i32* to i1* was emitted.
+
+// CHECK: alloca i32
+// CHECK: alloca [2 x i32]
+// CHECK-NOT: bitcast
+
+float main() : SV_Target
+{
+    bool b = true;
+    bool2 b2 = b.xx;
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
new file mode 100644
index 000000000..aacf9fd85
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
@@ -0,0 +1,54 @@
+// RUN: %dxc -E main -T vs_6_0 -O0 %s
+
+// Regression test for compiler crashes in complex bool cases
+
+struct AllTheBools
+{
+    bool b : B;
+    bool ba2[2] :BA2;
+    bool1 b1 : B1;
+    bool3 b3 : B3;
+    bool3 b3a2[2] : B3A2;
+    bool1x1 b1x1 : B1X1;
+    bool2x3 b2x3 : B2X3;
+    row_major bool2x3 rmb2x3 : RMB2X3;
+    bool2x3 b2x3a2[2] : B2X3A2;
+};
+
+ConstantBuffer<AllTheBools> cb;
+StructuredBuffer<AllTheBools> sb;
+
+void not(in out bool value) { value = !value; }
+
+void not(in out bool2 value)
+{
+  value = !value;
+  not(value.x);
+  not(value.y);
+}
+
+void not(in out bool3 value)
+{
+  not(value.xz);
+  not(value.y);
+}
+
+AllTheBools main(AllTheBools input, float f : F)
+{
+    AllTheBools output;
+    output.b = input.b ? cb.b : sb[0].b;
+    output.ba2[1] = input.b;
+    output.ba2[0] = input.ba2[1];
+    output.b1 = input.b3.y;
+    output.b3 = input.b.xxx;
+    output.b3a2 = sb[0].b3a2;
+    if (sb[0].b) return cb;
+
+    output.b1x1 = cb.b2x3._22;
+    output.b2x3 = bool2x3(sb[0].b3, bool3(f > 2, input.b, false));
+    output.rmb2x3 = input.b2x3;
+    not(output.rmb2x3[0]);
+    output.b2x3a2[1] = cb.b2x3;
+    output.b2x3a2[0] = input.b2x3;
+    return output;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
index a0440c733..a3392a0b5 100644
--- a/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
+++ b/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
@@ -7,7 +7,7 @@
 // CHECK: {{.*g_v.*}} = external constant <4 x float>, align 4
 // CHECK: {{.*g_m1.*}} = external constant %class.matrix.int.2.2, align 4
 // CHECK: {{.*g_m2.*}} = external constant %class.matrix.int.2.2, align 4
-// CHECK: {{.*g_b.*}} = external constant i32, align 1
+// CHECK: {{.*g_b.*}} = external constant i32, align 4
 // CHECK: {{.*g_a.*}} = external constant [5 x i32], align 4
 // CHECK: {{.*g_a2d.*}} = external constant [3 x [2 x i32]], align 4
 // CHECK-NOT: {{(.*g_s1.*)(.*static.copy.*)}} = internal global float 0.000000e+00