diff --git a/include/dxc/HLSL/HLMatrixLowerHelper.h b/include/dxc/HLSL/HLMatrixLowerHelper.h
index 39a432319..de61be469 100644
--- a/include/dxc/HLSL/HLMatrixLowerHelper.h
+++ b/include/dxc/HLSL/HLMatrixLowerHelper.h
@@ -31,17 +31,27 @@ bool IsMatrixType(llvm::Type *Ty);
 DxilFieldAnnotation *FindAnnotationFromMatUser(llvm::Value *Mat,
                                                DxilTypeSystem &typeSys);
 // Translate matrix type to vector type.
-llvm::Type *LowerMatrixType(llvm::Type *Ty);
+llvm::Type *LowerMatrixType(llvm::Type *Ty, bool forMem = false);
 // TODO: use type annotation.
 llvm::Type *GetMatrixInfo(llvm::Type *Ty, unsigned &col, unsigned &row);
 // TODO: use type annotation.
 bool IsMatrixArrayPointer(llvm::Type *Ty);
 // Translate matrix array pointer type to vector array pointer type.
-llvm::Type *LowerMatrixArrayPointer(llvm::Type *Ty);
+llvm::Type *LowerMatrixArrayPointer(llvm::Type *Ty, bool forMem = false);
 
 llvm::Value *BuildVector(llvm::Type *EltTy, unsigned size,
                          llvm::ArrayRef<llvm::Value *> elts,
                          llvm::IRBuilder<> &Builder);
+
+llvm::Value *VecMatrixMemToReg(llvm::Value *VecVal, llvm::Type *MatType,
+                               llvm::IRBuilder<> &Builder);
+llvm::Value *VecMatrixRegToMem(llvm::Value* VecVal, llvm::Type *MatType,
+                               llvm::IRBuilder<> &Builder);
+llvm::Instruction *CreateVecMatrixLoad(llvm::Value *VecPtr,
+                                       llvm::Type *MatType, llvm::IRBuilder<> &Builder);
+llvm::Instruction *CreateVecMatrixStore(llvm::Value* VecVal, llvm::Value *VecPtr,
+                                        llvm::Type *MatType, llvm::IRBuilder<> &Builder);
+
 // For case like mat[i][j].
 // IdxList is [i][0], [i][1], [i][2],[i][3].
 // Idx is j.
diff --git a/lib/HLSL/HLMatrixLowerPass.cpp b/lib/HLSL/HLMatrixLowerPass.cpp
index 40c35e8fe..4c736a9e3 100644
--- a/lib/HLSL/HLMatrixLowerPass.cpp
+++ b/lib/HLSL/HLMatrixLowerPass.cpp
@@ -69,7 +69,7 @@ DxilFieldAnnotation *FindAnnotationFromMatUser(Value *Mat,
 }
 
 // Translate matrix type to vector type.
-Type *LowerMatrixType(Type *Ty) {
+Type *LowerMatrixType(Type *Ty, bool forMem) {
   // Only translate matrix type and function type which use matrix type.
   // Not translate struct has matrix or matrix pointer.
   // Struct should be flattened before.
@@ -84,6 +84,8 @@ Type *LowerMatrixType(Type *Ty) {
   } else if (IsMatrixType(Ty)) {
     unsigned row, col;
     Type *EltTy = GetMatrixInfo(Ty, col, row);
+    if (forMem && EltTy->isIntegerTy(1))
+      EltTy = Type::getInt32Ty(Ty->getContext());
     return VectorType::get(EltTy, row * col);
   } else {
     return Ty;
@@ -122,7 +124,7 @@ bool IsMatrixArrayPointer(llvm::Type *Ty) {
     Ty = Ty->getArrayElementType();
   return IsMatrixType(Ty);
 }
-Type *LowerMatrixArrayPointer(Type *Ty) {
+Type *LowerMatrixArrayPointer(Type *Ty, bool forMem) {
   unsigned addrSpace = Ty->getPointerAddressSpace();
   Ty = Ty->getPointerElementType();
   std::vector<unsigned> arraySizeList;
@@ -130,7 +132,7 @@ Type *LowerMatrixArrayPointer(Type *Ty) {
     arraySizeList.push_back(Ty->getArrayNumElements());
     Ty = Ty->getArrayElementType();
   }
-  Ty = LowerMatrixType(Ty);
+  Ty = LowerMatrixType(Ty, forMem);
 
   for (auto arraySize = arraySizeList.rbegin();
        arraySize != arraySizeList.rend(); arraySize++)
@@ -155,13 +157,69 @@ Type *LowerMatrixArrayPointerToOneDimArray(Type *Ty) {
   return PointerType::get(Ty, addrSpace);
 }
 Value *BuildVector(Type *EltTy, unsigned size, ArrayRef<llvm::Value *> elts,
-                   IRBuilder<> &Builder) {
+  IRBuilder<> &Builder) {
   Value *Vec = UndefValue::get(VectorType::get(EltTy, size));
   for (unsigned i = 0; i < size; i++)
     Vec = Builder.CreateInsertElement(Vec, elts[i], i);
   return Vec;
 }
 
+llvm::Value *VecMatrixMemToReg(llvm::Value *VecVal, llvm::Type *MatType,
+  llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatRegTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/false);
+  if (VecVal->getType() == VecMatRegTy) {
+    return VecVal;
+  }
+
+  DXASSERT(VecMatRegTy->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix mem to reg type mismatch should only happen for bools.");
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  return Builder.CreateICmpNE(VecVal, Constant::getNullValue(VecMatMemTy));
+}
+
+llvm::Value *VecMatrixRegToMem(llvm::Value* VecVal, llvm::Type *MatType,
+  llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  if (VecVal->getType() == VecMatMemTy) {
+    return VecVal;
+  }
+
+  DXASSERT(VecVal->getType()->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix reg to mem type mismatch should only happen for bools.");
+  return Builder.CreateZExt(VecVal, VecMatMemTy);
+}
+
+llvm::Instruction *CreateVecMatrixLoad(
+  llvm::Value *VecPtr, llvm::Type *MatType, llvm::IRBuilder<> &Builder)
+{
+  llvm::Instruction *VecVal = Builder.CreateLoad(VecPtr);
+  return cast<llvm::Instruction>(VecMatrixMemToReg(VecVal, MatType, Builder));
+}
+
+llvm::Instruction *CreateVecMatrixStore(llvm::Value* VecVal, llvm::Value *VecPtr,
+  llvm::Type *MatType, llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  if (VecVal->getType() == VecMatMemTy) {
+    return Builder.CreateStore(VecVal, VecPtr);
+  }
+
+  // We need to convert to the memory representation, and we want to return
+  // the conversion instruction rather than the store since that's what
+  // accepts the register-typed i1 values.
+
+  // Do not use VecMatrixRegToMem as it may constant fold the conversion
+  // instruction, which is what we want to return.
+  DXASSERT(VecVal->getType()->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix reg to mem type mismatch should only happen for bools.");
+
+  llvm::Instruction *ConvInst = Builder.Insert(new ZExtInst(VecVal, VecMatMemTy));
+  Builder.CreateStore(ConvInst, VecPtr);
+  return ConvInst;
+}
+
 Value *LowerGEPOnMatIndexListToIndex(
     llvm::GetElementPtrInst *GEP, ArrayRef<Value *> IdxList) {
   IRBuilder<> Builder(GEP);
@@ -508,7 +566,7 @@ Instruction *HLMatrixLowerPass::MatLdStToVec(CallInst *CI) {
     if (isa<AllocaInst>(matPtr) || GetIfMatrixGEPOfUDTAlloca(matPtr) ||
         GetIfMatrixGEPOfUDTArg(matPtr, *m_pHLModule)) {
       Value *vecPtr = matToVecMap[cast<Instruction>(matPtr)];
-      result = Builder.CreateLoad(vecPtr);
+      result = CreateVecMatrixLoad(vecPtr, matPtr->getType()->getPointerElementType(), Builder);
     } else
       result = MatIntrinsicToVec(CI);
   } break;
@@ -519,9 +577,8 @@ Instruction *HLMatrixLowerPass::MatLdStToVec(CallInst *CI) {
         GetIfMatrixGEPOfUDTArg(matPtr, *m_pHLModule)) {
       Value *vecPtr = matToVecMap[cast<Instruction>(matPtr)];
       Value *matVal = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      Value *vecVal =
-          UndefValue::get(HLMatrixLower::LowerMatrixType(matVal->getType()));
-      result = Builder.CreateStore(vecVal, vecPtr);
+      Value *vecVal = UndefValue::get(HLMatrixLower::LowerMatrixType(matVal->getType()));
+      result = CreateVecMatrixStore(vecVal, vecPtr, matVal->getType(), Builder);
     } else
       result = MatIntrinsicToVec(CI);
   } break;
@@ -905,11 +962,11 @@ void HLMatrixLowerPass::lowerToVec(Instruction *matInst) {
     
     IRBuilder<> AllocaBuilder(AI);
     if (Ty->isArrayTy()) {
-      Type *vecTy = HLMatrixLower::LowerMatrixArrayPointer(AI->getType());
+      Type *vecTy = HLMatrixLower::LowerMatrixArrayPointer(AI->getType(), /*forMem*/ true);
       vecTy = vecTy->getPointerElementType();
       vecVal = AllocaBuilder.CreateAlloca(vecTy, nullptr, AI->getName());
     } else {
-      Type *vecTy = HLMatrixLower::LowerMatrixType(matTy);
+      Type *vecTy = HLMatrixLower::LowerMatrixType(matTy, /*forMem*/ true);
       vecVal = AllocaBuilder.CreateAlloca(vecTy, nullptr, AI->getName());
     }
     // Update debug info.
@@ -2059,7 +2116,8 @@ void HLMatrixLowerPass::TranslateMatArrayGEP(Value *matInst,
           // Skip the vector version.
           if (useCall->getType()->isVectorTy())
             continue;
-          Value *newLd = Builder.CreateLoad(newGEP);
+          Type *matTy = useCall->getType();
+          Value *newLd = CreateVecMatrixLoad(newGEP, matTy, Builder);
           DXASSERT(matToVecMap.count(useCall), "must have vec version");
           Value *oldLd = matToVecMap[useCall];
           // Delete the oldLd.
@@ -2082,7 +2140,7 @@ void HLMatrixLowerPass::TranslateMatArrayGEP(Value *matInst,
 
           DXASSERT(matToVecMap.count(matInst), "must have vec version");
           Value *vecVal = matToVecMap[matInst];
-          Builder.CreateStore(vecVal, vecPtr);
+          CreateVecMatrixStore(vecVal, vecPtr, matVal->getType(), Builder);
         } break;
         }
       } break;
@@ -2174,9 +2232,17 @@ void HLMatrixLowerPass::replaceMatWithVec(Value *matVal,
           // Load Already translated in lowerToVec.
           // Store val operand will be set by the val use.
           // Do nothing here.
-        } else if (StoreInst *stInst = dyn_cast<StoreInst>(vecUser))
+        } else if (StoreInst *stInst = dyn_cast<StoreInst>(vecUser)) {
+          DXASSERT(vecVal->getType() == stInst->getValueOperand()->getType(),
+            "Mismatched vector matrix store value types.");
           stInst->setOperand(0, vecVal);
-        else
+        } else if (ZExtInst *zextInst = dyn_cast<ZExtInst>(vecUser)) {
+          // This happens when storing bool matrices,
+          // which must first undergo conversion from i1's to i32's.
+          DXASSERT(vecVal->getType() == zextInst->getOperand(0)->getType(),
+            "Mismatched vector matrix store value types.");
+          zextInst->setOperand(0, vecVal);
+        } else
           TrivialMatReplace(matVal, vecVal, useCall);
 
       } break;
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index b0426f501..b2db0646a 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4988,7 +4988,8 @@ Value *TranslateConstBufMatLd(Type *matType, Value *handle, Value *offset,
                               bool colMajor, OP *OP, const DataLayout &DL,
                               IRBuilder<> &Builder) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
   unsigned matSize = col * row;
   std::vector<Value *> elts(matSize);
   Value *EltByteSize = ConstantInt::get(
@@ -5001,7 +5002,9 @@ Value *TranslateConstBufMatLd(Type *matType, Value *handle, Value *offset,
     baseOffset = Builder.CreateAdd(baseOffset, EltByteSize);
   }
 
-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value* Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }
 
 void TranslateCBGep(GetElementPtrInst *GEP, Value *handle, Value *baseOffset,
@@ -5417,10 +5420,11 @@ Value *GenerateCBLoadLegacy(Value *handle, Value *legacyIdx,
 
 Value *TranslateConstBufMatLdLegacy(Type *matType, Value *handle,
                                     Value *legacyIdx, bool colMajor, OP *OP,
-                                    const DataLayout &DL,
+                                    bool memElemRepr, const DataLayout &DL,
                                     IRBuilder<> &Builder) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/memElemRepr)->getVectorElementType();
 
   unsigned matSize = col * row;
   std::vector<Value *> elts(matSize);
@@ -5506,8 +5510,9 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
       Type *matType = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
                           ->getType()
                           ->getPointerElementType();
+      // This will replace a call, so we should use the register representation of elements
       Value *newLd = TranslateConstBufMatLdLegacy(
-          matType, handle, legacyIdx, colMajor, hlslOP, DL, Builder);
+          matType, handle, legacyIdx, colMajor, hlslOP, /*memElemRepr*/false, DL, Builder);
       CI->replaceAllUsesWith(newLd);
       CI->eraseFromParent();
     } else if (group == HLOpcodeGroup::HLSubscript) {
@@ -5534,8 +5539,9 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
 
       Value *ldData = UndefValue::get(resultType);
       if (!dynamicIndexing) {
+        // This will replace a load or GEP, so we should use the memory representation of elements
         Value *matLd = TranslateConstBufMatLdLegacy(
-            matType, handle, legacyIdx, colMajor, hlslOP, DL, Builder);
+            matType, handle, legacyIdx, colMajor, hlslOP, /*memElemRepr*/true, DL, Builder);
         // The matLd is keep original layout, just use the idx calc in
         // EmitHLSLMatrixElement and EmitHLSLMatrixSubscript.
         switch (subOp) {
@@ -6022,7 +6028,8 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
                                Value *bufIdx, Value *baseOffset,
                                bool colMajor, const DataLayout &DL) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
   unsigned  EltSize = DL.getTypeAllocSize(EltTy);
   Constant* alignment = OP->GetI32Const(EltSize);
 
@@ -6054,14 +6061,20 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 
-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value *Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }
 
 void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                              hlsl::OP *OP, Value *bufIdx, Value *baseOffset,
                              Value *val, bool colMajor, const DataLayout &DL) {
   unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
+
+  val = HLMatrixLower::VecMatrixRegToMem(val, matType, Builder);
+
   unsigned EltSize = DL.getTypeAllocSize(EltTy);
   Constant *Alignment = OP->GetI32Const(EltSize);
   Value *offset = baseOffset;
diff --git a/lib/HLSL/HLSignatureLower.cpp b/lib/HLSL/HLSignatureLower.cpp
index 648d775db..3228d487c 100644
--- a/lib/HLSL/HLSignatureLower.cpp
+++ b/lib/HLSL/HLSignatureLower.cpp
@@ -912,80 +912,54 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
     DXASSERT_NOMSG(group == HLOpcodeGroup::HLMatLoadStore);
     HLMatLoadStoreOpcode matOp = static_cast<HLMatLoadStoreOpcode>(opcode);
     switch (matOp) {
-    case HLMatLoadStoreOpcode::ColMatLoad: {
-      IRBuilder<> LocalBuilder(CI);
-      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
-                        ->getType()
-                        ->getPointerElementType();
-      unsigned col, row;
-      Type *EltTy = HLMatrixLower::GetMatrixInfo(matTy, col, row);
-      std::vector<Value *> matElts(col * row);
-      for (unsigned c = 0; c < col; c++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(c);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-        for (unsigned r = 0; r < row; r++) {
-          SmallVector<Value *, 4> args = {OpArg, ID, rowIdx, columnConsts[r]};
-          if (vertexID)
-            args.emplace_back(vertexID);
-
-          Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-          unsigned matIdx = c * row + r;
-          matElts[matIdx] = input;
-        }
-      }
-      Value *newVec =
-          HLMatrixLower::BuildVector(EltTy, col * row, matElts, LocalBuilder);
-      CI->replaceAllUsesWith(newVec);
-      CI->eraseFromParent();
-    } break;
+    case HLMatLoadStoreOpcode::ColMatLoad:
     case HLMatLoadStoreOpcode::RowMatLoad: {
       IRBuilder<> LocalBuilder(CI);
       Type *matTy = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
                         ->getType()
                         ->getPointerElementType();
       unsigned col, row;
-      Type *EltTy = HLMatrixLower::GetMatrixInfo(matTy, col, row);
+      HLMatrixLower::GetMatrixInfo(matTy, col, row);
       std::vector<Value *> matElts(col * row);
-      for (unsigned r = 0; r < row; r++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(r);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-        for (unsigned c = 0; c < col; c++) {
-          SmallVector<Value *, 4> args = {OpArg, ID, rowIdx, columnConsts[c]};
-          if (vertexID)
-            args.emplace_back(vertexID);
 
-          Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-          unsigned matIdx = r * col + c;
-          matElts[matIdx] = input;
+      if (matOp == HLMatLoadStoreOpcode::ColMatLoad) {
+        for (unsigned c = 0; c < col; c++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(c);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned r = 0; r < row; r++) {
+            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[r] };
+            if (vertexID)
+              args.emplace_back(vertexID);
+
+            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+            unsigned matIdx = c * row + r;
+            matElts[matIdx] = input;
+          }
+        }
+      } else {
+        for (unsigned r = 0; r < row; r++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(r);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned c = 0; c < col; c++) {
+            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[c] };
+            if (vertexID)
+              args.emplace_back(vertexID);
+
+            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+            unsigned matIdx = r * col + c;
+            matElts[matIdx] = input;
+          }
         }
       }
+
       Value *newVec =
-          HLMatrixLower::BuildVector(EltTy, col * row, matElts, LocalBuilder);
+          HLMatrixLower::BuildVector(matElts[0]->getType(), col * row, matElts, LocalBuilder);
+      newVec = HLMatrixLower::VecMatrixMemToReg(newVec, matTy, LocalBuilder);
+
       CI->replaceAllUsesWith(newVec);
       CI->eraseFromParent();
     } break;
-    case HLMatLoadStoreOpcode::ColMatStore: {
-      IRBuilder<> LocalBuilder(CI);
-      Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx)
-                        ->getType()
-                        ->getPointerElementType();
-      unsigned col, row;
-      HLMatrixLower::GetMatrixInfo(matTy, col, row);
-
-      for (unsigned c = 0; c < col; c++) {
-        Constant *constColIdx = LocalBuilder.getInt32(c);
-        Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
-
-        for (unsigned r = 0; r < row; r++) {
-          unsigned matIdx = HLMatrixLower::GetColMajorIdx(r, c, row);
-          Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-          LocalBuilder.CreateCall(ldStFunc,
-                                  {OpArg, ID, colIdx, columnConsts[r], Elt});
-        }
-      }
-      CI->eraseFromParent();
-    } break;
+    case HLMatLoadStoreOpcode::ColMatStore:
     case HLMatLoadStoreOpcode::RowMatStore: {
       IRBuilder<> LocalBuilder(CI);
       Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
@@ -995,14 +969,30 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
       unsigned col, row;
       HLMatrixLower::GetMatrixInfo(matTy, col, row);
 
-      for (unsigned r = 0; r < row; r++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(r);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+      Val = HLMatrixLower::VecMatrixRegToMem(Val, matTy, LocalBuilder);
+
+      if (matOp == HLMatLoadStoreOpcode::ColMatStore) {
         for (unsigned c = 0; c < col; c++) {
-          unsigned matIdx = HLMatrixLower::GetRowMajorIdx(r, c, col);
-          Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-          LocalBuilder.CreateCall(ldStFunc,
-                                  {OpArg, ID, rowIdx, columnConsts[c], Elt});
+          Constant *constColIdx = LocalBuilder.getInt32(c);
+          Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
+
+          for (unsigned r = 0; r < row; r++) {
+            unsigned matIdx = HLMatrixLower::GetColMajorIdx(r, c, row);
+            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+            LocalBuilder.CreateCall(ldStFunc,
+              { OpArg, ID, colIdx, columnConsts[r], Elt });
+          }
+        }
+      } else {
+        for (unsigned r = 0; r < row; r++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(r);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned c = 0; c < col; c++) {
+            unsigned matIdx = HLMatrixLower::GetRowMajorIdx(r, c, col);
+            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+            LocalBuilder.CreateCall(ldStFunc,
+              { OpArg, ID, rowIdx, columnConsts[c], Elt });
+          }
         }
       }
       CI->eraseFromParent();
diff --git a/tools/clang/lib/AST/ASTContext.cpp b/tools/clang/lib/AST/ASTContext.cpp
index a695e9da5..af88bfa50 100644
--- a/tools/clang/lib/AST/ASTContext.cpp
+++ b/tools/clang/lib/AST/ASTContext.cpp
@@ -1568,7 +1568,6 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     // Vector align to its element.
     if (getLangOpts().HLSL) {
       Align = EltInfo.Align;
-      Width = Align * VT->getNumElements();
     }
     // HLSL Change Ends.
     // If the alignment is not a power of 2, round up to the next power of 2.
diff --git a/tools/clang/lib/Basic/Targets.cpp b/tools/clang/lib/Basic/Targets.cpp
index fb9af0323..71e08a4e7 100644
--- a/tools/clang/lib/Basic/Targets.cpp
+++ b/tools/clang/lib/Basic/Targets.cpp
@@ -6991,9 +6991,7 @@ public:
     LongWidth = LongAlign = 32;
     LongDoubleWidth = LongDoubleAlign = 64;
     LongDoubleFormat = &llvm::APFloat::IEEEdouble;
-    BoolWidth = 32;
-    // To avoid member for alignment.
-    BoolAlign = 8;
+    BoolWidth = BoolAlign = 32;
 
     // using the Microsoft ABI.
     TheCXXABI.set(TargetCXXABI::Microsoft);
diff --git a/tools/clang/lib/CodeGen/CGExpr.cpp b/tools/clang/lib/CodeGen/CGExpr.cpp
index f23aa37f0..422feab4a 100644
--- a/tools/clang/lib/CodeGen/CGExpr.cpp
+++ b/tools/clang/lib/CodeGen/CGExpr.cpp
@@ -1082,6 +1082,14 @@ static bool hasBooleanRepresentation(QualType Ty) {
   return false;
 }
 
+// HLSL Change Begin.
+static bool hasBooleanScalarOrVectorRepresentation(QualType Ty) {
+  if (hlsl::IsHLSLVecType(Ty))
+    return hasBooleanRepresentation(hlsl::GetElementTypeOrType(Ty));
+  return hasBooleanRepresentation(Ty);
+}
+// HLSL Change End.
+
 static bool getRangeForType(CodeGenFunction &CGF, QualType Ty,
                             llvm::APInt &Min, llvm::APInt &End,
                             bool StrictEnums) {
@@ -1233,30 +1241,31 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(llvm::Value *Addr, bool Volatile,
 }
 
 llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) {
-  // Bool has a different representation in memory than in registers.
-  if (hasBooleanRepresentation(Ty)) {
+  // HLSL Change Begin.
+  // Bool scalar and vectors have a different representation in memory than in registers.
+  if (hasBooleanScalarOrVectorRepresentation(Ty)) {
     // This should really always be an i1, but sometimes it's already
     // an i8, and it's awkward to track those cases down.
-    if (Value->getType()->isIntegerTy(1))
+    llvm::Type *ValTy = Value->getType();
+    llvm::Type *VecElemTy = ValTy->isVectorTy() ? ValTy->getVectorElementType() : ValTy;
+    if (VecElemTy->isIntegerTy(1))
       return Builder.CreateZExt(Value, ConvertTypeForMem(Ty), "frombool");
-    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
-           "wrong value rep of bool");
   }
+  // HLSL Change End.
 
   return Value;
 }
 
 llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) {
-  // Bool has a different representation in memory than in registers.
-  if (hasBooleanRepresentation(Ty)) {
-    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
-           "wrong value rep of bool");
-    // HLSL Change Begin.
+  // HLSL Change Begin.
+  // Bool scalar and vectors have a different representation in memory than in registers.
+  if (hasBooleanScalarOrVectorRepresentation(Ty)) {
+    llvm::Type *ValTy = Value->getType();
     // Use ne v, 0 to convert to i1 instead of trunc.
     return Builder.CreateICmpNE(
-        Value, llvm::ConstantInt::get(Value->getType(), 0), "tobool");
-    // HLSL Change End.
+        Value, llvm::ConstantVector::getNullValue(ValTy), "tobool");
   }
+  // HLSL Change End.
 
   return Value;
 }
@@ -1475,6 +1484,8 @@ RValue CodeGenFunction::EmitLoadOfExtVectorElementLValue(LValue LV) {
   Load->setAlignment(LV.getAlignment().getQuantity());
   llvm::Value *Vec = Load;
 
+  Vec = EmitFromMemory(Vec, LV.getType()); // HLSL Change
+
   const llvm::Constant *Elts = LV.getExtVectorElts();
 
   // If the result of the expression is a non-vector type, we must be extracting
@@ -1748,7 +1759,10 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src,
   const llvm::Constant *Elts = Dst.getExtVectorElts();
 
   llvm::Value *SrcVal = Src.getScalarVal();
+
   // HLSL Change Starts
+  SrcVal = EmitToMemory(SrcVal, Dst.getType());
+
   const VectorType *VTy = Dst.getType()->getAs<VectorType>();
   if (VTy == nullptr && getContext().getLangOpts().HLSL)
     VTy =
@@ -2918,11 +2932,12 @@ CodeGenFunction::EmitHLSLVectorElementExpr(const HLSLVectorElementExpr *E) {
     assert(hlsl::IsHLSLVecType(E->getBase()->getType()) &&
            "Result must be a vector");
     llvm::Value *Vec = EmitScalarExpr(E->getBase());
+    Vec = EmitToMemory(Vec, E->getBase()->getType());
 
     // Store the vector to memory (because LValue wants an address).
-    llvm::Value *VecMem = CreateMemTemp(E->getBase()->getType());
-    Builder.CreateStore(Vec, VecMem);
-    Base = MakeAddrLValue(VecMem, E->getBase()->getType());
+    llvm::Value *VecMemPtr = CreateMemTemp(E->getBase()->getType());
+    Builder.CreateStore(Vec, VecMemPtr);
+    Base = MakeAddrLValue(VecMemPtr, E->getBase()->getType());
   }
 
   QualType type =
diff --git a/tools/clang/lib/CodeGen/CGHLSLMS.cpp b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
index b71a9983d..66c0d79d0 100644
--- a/tools/clang/lib/CodeGen/CGHLSLMS.cpp
+++ b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
@@ -3838,6 +3838,15 @@ static Value *CastLdValue(Value *Ptr, llvm::Type *FromTy, llvm::Type *ToTy, IRBu
       // Change scalar into vec1.
       Value *Vec1 = UndefValue::get(ToTy);
       return Builder.CreateInsertElement(Vec1, V, (uint64_t)0);
+    } else if (vecSize == 1 && FromTy->isIntegerTy()
+      && ToTy->getVectorElementType()->isIntegerTy(1)) {
+      // load(bitcast i32* to <1 x i1>*)
+      // Rewrite to
+      // insertelement(icmp ne (load i32*), 0)
+      Value *IntV = Builder.CreateLoad(Ptr);
+      Value *BoolV = Builder.CreateICmpNE(IntV, ConstantInt::get(IntV->getType(), 0), "tobool");
+      Value *Vec1 = UndefValue::get(ToTy);
+      return Builder.CreateInsertElement(Vec1, BoolV, (uint64_t)0);
     } else if (FromTy->isVectorTy() && vecSize == 1) {
       Value *V = Builder.CreateLoad(Ptr);
       // VectorTrunc
@@ -5468,15 +5477,20 @@ static void AddMissingCastOpsInInitList(SmallVector<Value *, 4> &elts, SmallVect
 
 static void StoreInitListToDestPtr(Value *DestPtr,
                                    SmallVector<Value *, 4> &elts, unsigned &idx,
-                                   QualType Type, CodeGenTypes &Types, bool bDefaultRowMajor,
-                                   CGBuilderTy &Builder, llvm::Module &M) {
+                                   QualType Type, bool bDefaultRowMajor,
+                                   CodeGenFunction &CGF, llvm::Module &M) {
+  CodeGenTypes &Types = CGF.getTypes();
+  CGBuilderTy &Builder = CGF.Builder;
+
   llvm::Type *Ty = DestPtr->getType()->getPointerElementType();
   llvm::Type *i32Ty = llvm::Type::getInt32Ty(Ty->getContext());
 
   if (Ty->isVectorTy()) {
-    Value *Result = UndefValue::get(Ty);
-    for (unsigned i = 0; i < Ty->getVectorNumElements(); i++)
+    llvm::Type *RegTy = CGF.ConvertType(Type);
+    Value *Result = UndefValue::get(RegTy);
+    for (unsigned i = 0; i < RegTy->getVectorNumElements(); i++)
       Result = Builder.CreateInsertElement(Result, elts[idx + i], i);
+    Result = CGF.EmitToMemory(Result, Type);
     Builder.CreateStore(Result, DestPtr);
     idx += Ty->getVectorNumElements();
   } else if (HLMatrixLower::IsMatrixType(Ty)) {
@@ -5541,8 +5555,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
             unsigned i = RL.getNonVirtualBaseLLVMFieldNo(BaseDecl);
             Constant *gepIdx = ConstantInt::get(i32Ty, i);
             Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-            StoreInitListToDestPtr(GEP, elts, idx, parentTy, Types,
-                                   bDefaultRowMajor, Builder, M);
+            StoreInitListToDestPtr(GEP, elts, idx, parentTy,
+                                   bDefaultRowMajor, CGF, M);
           }
         }
       }
@@ -5550,8 +5564,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
         unsigned i = RL.getLLVMFieldNo(field);
         Constant *gepIdx = ConstantInt::get(i32Ty, i);
         Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-        StoreInitListToDestPtr(GEP, elts, idx, field->getType(), Types,
-                               bDefaultRowMajor, Builder, M);
+        StoreInitListToDestPtr(GEP, elts, idx, field->getType(),
+                               bDefaultRowMajor, CGF, M);
       }
     }
   } else if (Ty->isArrayTy()) {
@@ -5560,8 +5574,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
     for (unsigned i = 0; i < Ty->getArrayNumElements(); i++) {
       Constant *gepIdx = ConstantInt::get(i32Ty, i);
       Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-      StoreInitListToDestPtr(GEP, elts, idx, EltType, Types, bDefaultRowMajor,
-                             Builder, M);
+      StoreInitListToDestPtr(GEP, elts, idx, EltType, bDefaultRowMajor,
+                             CGF, M);
     }
   } else {
     DXASSERT(Ty->isSingleValueType(), "invalid type");
@@ -5741,8 +5755,8 @@ Value *CGMSHLSLRuntime::EmitHLSLInitListExpr(CodeGenFunction &CGF, InitListExpr
     ParamList.append(EltValList.begin(), EltValList.end());
     idx = 0;
     bool bDefaultRowMajor = m_pHLModule->GetHLOptions().bDefaultRowMajor;
-    StoreInitListToDestPtr(DestPtr, EltValList, idx, ResultTy, CGF.getTypes(),
-                           bDefaultRowMajor, CGF.Builder, TheModule);
+    StoreInitListToDestPtr(DestPtr, EltValList, idx, ResultTy,
+                           bDefaultRowMajor, CGF, TheModule);
     return nullptr;
   }
 
@@ -7077,15 +7091,10 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionInit(
     BasicBlock *InsertBlock = CGF.Builder.GetInsertBlock();
     Function *F = InsertBlock->getParent();
 
-    if (ParamTy->isBooleanType()) {
-      // Create i32 for bool.
-      ParamTy = CGM.getContext().IntTy;
-    }
     // Make sure the alloca is in entry block to stop inline create stacksave.
     IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(F));
-    tmpArgAddr = AllocaBuilder.CreateAlloca(CGF.ConvertType(ParamTy));
+    tmpArgAddr = AllocaBuilder.CreateAlloca(CGF.ConvertTypeForMem(ParamTy));
 
-      
     // add it to local decl map
     TmpArgMap(tmpArg, tmpArgAddr);
 
@@ -7164,6 +7173,8 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionCopyBack(
         else
           outVal = EmitHLSLMatrixLoad(CGF, tmpArgAddr, ParamTy);
 
+        outVal = CGF.EmitFromMemory(outVal, ParamTy);
+
         llvm::Type *ToTy = CGF.ConvertType(ArgTy);
         llvm::Type *FromTy = outVal->getType();
         Value *castVal = outVal;
diff --git a/tools/clang/lib/CodeGen/CodeGenTypes.cpp b/tools/clang/lib/CodeGen/CodeGenTypes.cpp
index 425c35ff2..5641cfc8a 100644
--- a/tools/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/tools/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -109,15 +109,26 @@ void CodeGenTypes::addRecordTypeName(const RecordDecl *RD,
 /// a type.  For example, the scalar representation for _Bool is i1, but the
 /// memory representation is usually i8 or i32, depending on the target.
 llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) {
+  // HLSL Change Starts
+  if (hlsl::IsHLSLVecType(T)) {
+    // Vectors of bools in memory should become vectors of
+    // the memory representation of the elements.
+    // Clang doesn't do this for plain VectorTypes,
+    // which is fine otherwise a bool1x1 matrix would become
+    // [n x <m x i32>] since array elements always have memory representation.
+    QualType ElemT = hlsl::GetElementTypeOrType(T);
+    return llvm::VectorType::get(ConvertTypeForMem(ElemT), hlsl::GetHLSLVecSize(T));
+  }
+
   llvm::Type *R = ConvertType(T);
 
-  // If this is a non-bool type, don't map it.
-  if (!R->isIntegerTy(1))
-    return R;
+  if (R->isIntegerTy(1)) {
+    // Bools have a different representation in memory
+    return llvm::IntegerType::get(getLLVMContext(), (unsigned)Context.getTypeSize(T));
+  }
 
-  // Otherwise, return an integer of the target-specified size.
-  return llvm::IntegerType::get(getLLVMContext(),
-                                (unsigned)Context.getTypeSize(T));
+  return R;
+  // HLSL Change Ends
 }
 
 
diff --git a/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl b/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
index 231a55cb8..51432e370 100644
--- a/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
+++ b/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
@@ -5,7 +5,7 @@
 // CHECK: i32 5)
 // CHECK: extractvalue
 // CHECK: , 2
-// CHECK: icmp eq
+// CHECK: icmp ne
 // CHECK 0
 
 // For (x4 < 3)[1]
@@ -47,7 +47,7 @@
 // CHECK: fcmp fast oeq
 // CHECK: fcmp fast oeq
 // CHECK: fcmp fast oeq
-// CHECK: alloca [16 x i1]
+// CHECK: alloca [16 x i32]
 
 
 float4x4 xt;
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl
new file mode 100644
index 000000000..e566a1f0e
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl
@@ -0,0 +1,110 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// Ensure that bools are converted from/to their mem representation when loaded/stored in buffers
+
+// Constant buffer loads
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+// Structured buffer loads
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+// Structured buffer stores
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+  
+
+struct AllTheBools
+{
+    bool2x2 m;
+    bool2 v;
+    bool s;
+    bool2x2 ma[2];
+    bool2 va[2];
+    bool sa[2];
+};
+
+ConstantBuffer<AllTheBools> cb;
+StructuredBuffer<AllTheBools> sb;
+RWStructuredBuffer<AllTheBools> rwsb;
+
+float main(int i : I) : SV_Target
+{
+    float result = 0;
+
+    // Constant buffer loads
+    if (cb.m._22 && cb.v.y && cb.s
+        && cb.ma[1]._22 && cb.va[1].y && cb.sa[1])
+    {
+        result++;
+    }
+    
+    // Structured buffer loads
+    if (sb[0].m._22 && sb[0].v.y && sb[0].s
+        && sb[0].ma[1]._22 && sb[0].va[1].y && sb[0].sa[1])
+    {
+        result++;
+    }
+
+    // Structured buffer stores
+    if (result >= 1.0f)
+    {
+        rwsb[0].m._22 = i == 42;
+        rwsb[0].v.y = i == 42;
+        rwsb[0].s = i == 42;
+        rwsb[0].ma[1]._22 = i == 42;
+        rwsb[0].va[1].y = i == 42;
+        rwsb[0].sa[1] = i == 42;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl
new file mode 100644
index 000000000..20d7ce08f
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl
@@ -0,0 +1,52 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// Ensure that bools are converted from/to their memory representation when loaded/stored
+
+// Local variables should never be i1s
+// CHECK-not: alloca {{.*}}i1
+
+// Test stores
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+
+// Test loads
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+float main(int i : I) : SV_Target
+{
+    bool s = i == 42;
+    bool1 v = i == 42;
+    bool1x1 m = i == 42;
+    bool sa[1] = { i == 42 };
+    bool1 va[1] = { i == 42 };
+    bool1x1 ma[1] = { i == 42 };
+
+    return s && v.x && m._11 && sa[0] && va[0].x && ma[0]._11 ? 1.0f : 2.0f;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
new file mode 100644
index 000000000..9499f3ddc
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// This is mostly a regression test for a bug where a bitcast
+// from i32* to i1* was emitted.
+
+// CHECK: alloca i32
+// CHECK: alloca [2 x i32]
+// CHECK-NOT: bitcast
+
+float main() : SV_Target
+{
+    bool b = true;
+    bool2 b2 = b.xx;
+    return 0;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
new file mode 100644
index 000000000..aacf9fd85
--- /dev/null
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
@@ -0,0 +1,54 @@
+// RUN: %dxc -E main -T vs_6_0 -O0 %s
+
+// Regression test for compiler crashes in complex bool cases
+
+struct AllTheBools
+{
+    bool b : B;
+    bool ba2[2] :BA2;
+    bool1 b1 : B1;
+    bool3 b3 : B3;
+    bool3 b3a2[2] : B3A2;
+    bool1x1 b1x1 : B1X1;
+    bool2x3 b2x3 : B2X3;
+    row_major bool2x3 rmb2x3 : RMB2X3;
+    bool2x3 b2x3a2[2] : B2X3A2;
+};
+
+ConstantBuffer<AllTheBools> cb;
+StructuredBuffer<AllTheBools> sb;
+
+void not(in out bool value) { value = !value; }
+
+void not(in out bool2 value)
+{
+  value = !value;
+  not(value.x);
+  not(value.y);
+}
+
+void not(in out bool3 value)
+{
+  not(value.xz);
+  not(value.y);
+}
+
+AllTheBools main(AllTheBools input, float f : F)
+{
+    AllTheBools output;
+    output.b = input.b ? cb.b : sb[0].b;
+    output.ba2[1] = input.b;
+    output.ba2[0] = input.ba2[1];
+    output.b1 = input.b3.y;
+    output.b3 = input.b.xxx;
+    output.b3a2 = sb[0].b3a2;
+    if (sb[0].b) return cb;
+
+    output.b1x1 = cb.b2x3._22;
+    output.b2x3 = bool2x3(sb[0].b3, bool3(f > 2, input.b, false));
+    output.rmb2x3 = input.b2x3;
+    not(output.rmb2x3[0]);
+    output.b2x3a2[1] = cb.b2x3;
+    output.b2x3a2[0] = input.b2x3;
+    return output;
+}
\ No newline at end of file
diff --git a/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl b/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
index a0440c733..a3392a0b5 100644
--- a/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
+++ b/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
@@ -7,7 +7,7 @@
 // CHECK: {{.*g_v.*}} = external constant <4 x float>, align 4
 // CHECK: {{.*g_m1.*}} = external constant %class.matrix.int.2.2, align 4
 // CHECK: {{.*g_m2.*}} = external constant %class.matrix.int.2.2, align 4
-// CHECK: {{.*g_b.*}} = external constant i32, align 1
+// CHECK: {{.*g_b.*}} = external constant i32, align 4
 // CHECK: {{.*g_a.*}} = external constant [5 x i32], align 4
 // CHECK: {{.*g_a2d.*}} = external constant [3 x [2 x i32]], align 4
 // CHECK-NOT: {{(.*g_s1.*)(.*static.copy.*)}} = internal global float 0.000000e+00