Change in-memory representation of bool vectors to use i32's (#1742)

This change fixes an inconsistency between scalar and vector bools, where the formers were stored as i32s in memory (as opposed to i1s in registers) while the latter remained <n x i1>s. This lead to a number of crashes in more complex bool scenarios where scalar and vector element types were expected to match. The solution is to extend Clang's strategy for scalar bools to vector of bools. Clang relies on three key functions to deal with differences between register and memory type representations which are special-cased for bools and now also for bool vectors. Several HLSL code additions did not properly leverage those functions and this was addressed wherever possible, in some cases removing the need for special cases for bools. To deal with matrices, a similar concept of register/memory representation was introduced in the matrix lowering code and the lowering passes were updated accordingly.
2018-12-03 13:28:22 -08:00 · 2018-12-03 13:28:22 -08:00 · 8226092fd7
--- a/include/dxc/HLSL/HLMatrixLowerHelper.h
+++ b/include/dxc/HLSL/HLMatrixLowerHelper.h
@ -31,17 +31,27 @@ bool IsMatrixType(llvm::Type *Ty);
 DxilFieldAnnotation *FindAnnotationFromMatUser(llvm::Value *Mat,
                                               DxilTypeSystem &typeSys);
 // Translate matrix type to vector type.
-llvm::Type *LowerMatrixType(llvm::Type *Ty);
+llvm::Type *LowerMatrixType(llvm::Type *Ty, bool forMem = false);
 // TODO: use type annotation.
 llvm::Type *GetMatrixInfo(llvm::Type *Ty, unsigned &col, unsigned &row);
 // TODO: use type annotation.
 bool IsMatrixArrayPointer(llvm::Type *Ty);
 // Translate matrix array pointer type to vector array pointer type.
-llvm::Type *LowerMatrixArrayPointer(llvm::Type *Ty);
+llvm::Type *LowerMatrixArrayPointer(llvm::Type *Ty, bool forMem = false);

 llvm::Value *BuildVector(llvm::Type *EltTy, unsigned size,
                         llvm::ArrayRef<llvm::Value *> elts,
                         llvm::IRBuilder<> &Builder);
+
+llvm::Value *VecMatrixMemToReg(llvm::Value *VecVal, llvm::Type *MatType,
+                               llvm::IRBuilder<> &Builder);
+llvm::Value *VecMatrixRegToMem(llvm::Value* VecVal, llvm::Type *MatType,
+                               llvm::IRBuilder<> &Builder);
+llvm::Instruction *CreateVecMatrixLoad(llvm::Value *VecPtr,
+                                       llvm::Type *MatType, llvm::IRBuilder<> &Builder);
+llvm::Instruction *CreateVecMatrixStore(llvm::Value* VecVal, llvm::Value *VecPtr,
+                                        llvm::Type *MatType, llvm::IRBuilder<> &Builder);
+
 // For case like mat[i][j].
 // IdxList is [i][0], [i][1], [i][2],[i][3].
 // Idx is j.
--- a/lib/HLSL/HLMatrixLowerPass.cpp
+++ b/lib/HLSL/HLMatrixLowerPass.cpp
@ -69,7 +69,7 @@ DxilFieldAnnotation *FindAnnotationFromMatUser(Value *Mat,
 }

 // Translate matrix type to vector type.
-Type *LowerMatrixType(Type *Ty) {
+Type *LowerMatrixType(Type *Ty, bool forMem) {
  // Only translate matrix type and function type which use matrix type.
  // Not translate struct has matrix or matrix pointer.
  // Struct should be flattened before.
@ -84,6 +84,8 @@ Type *LowerMatrixType(Type *Ty) {
  } else if (IsMatrixType(Ty)) {
    unsigned row, col;
    Type *EltTy = GetMatrixInfo(Ty, col, row);
+    if (forMem && EltTy->isIntegerTy(1))
+      EltTy = Type::getInt32Ty(Ty->getContext());
    return VectorType::get(EltTy, row * col);
  } else {
    return Ty;
@ -122,7 +124,7 @@ bool IsMatrixArrayPointer(llvm::Type *Ty) {
    Ty = Ty->getArrayElementType();
  return IsMatrixType(Ty);
 }
-Type *LowerMatrixArrayPointer(Type *Ty) {
+Type *LowerMatrixArrayPointer(Type *Ty, bool forMem) {
  unsigned addrSpace = Ty->getPointerAddressSpace();
  Ty = Ty->getPointerElementType();
  std::vector<unsigned> arraySizeList;
@ -130,7 +132,7 @@ Type *LowerMatrixArrayPointer(Type *Ty) {
    arraySizeList.push_back(Ty->getArrayNumElements());
    Ty = Ty->getArrayElementType();
  }
-  Ty = LowerMatrixType(Ty);
+  Ty = LowerMatrixType(Ty, forMem);

  for (auto arraySize = arraySizeList.rbegin();
       arraySize != arraySizeList.rend(); arraySize++)
@ -155,13 +157,69 @@ Type *LowerMatrixArrayPointerToOneDimArray(Type *Ty) {
  return PointerType::get(Ty, addrSpace);
 }
 Value *BuildVector(Type *EltTy, unsigned size, ArrayRef<llvm::Value *> elts,
-                   IRBuilder<> &Builder) {
+  IRBuilder<> &Builder) {
  Value *Vec = UndefValue::get(VectorType::get(EltTy, size));
  for (unsigned i = 0; i < size; i++)
    Vec = Builder.CreateInsertElement(Vec, elts[i], i);
  return Vec;
 }

+llvm::Value *VecMatrixMemToReg(llvm::Value *VecVal, llvm::Type *MatType,
+  llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatRegTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/false);
+  if (VecVal->getType() == VecMatRegTy) {
+    return VecVal;
+  }
+
+  DXASSERT(VecMatRegTy->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix mem to reg type mismatch should only happen for bools.");
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  return Builder.CreateICmpNE(VecVal, Constant::getNullValue(VecMatMemTy));
+}
+
+llvm::Value *VecMatrixRegToMem(llvm::Value* VecVal, llvm::Type *MatType,
+  llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  if (VecVal->getType() == VecMatMemTy) {
+    return VecVal;
+  }
+
+  DXASSERT(VecVal->getType()->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix reg to mem type mismatch should only happen for bools.");
+  return Builder.CreateZExt(VecVal, VecMatMemTy);
+}
+
+llvm::Instruction *CreateVecMatrixLoad(
+  llvm::Value *VecPtr, llvm::Type *MatType, llvm::IRBuilder<> &Builder)
+{
+  llvm::Instruction *VecVal = Builder.CreateLoad(VecPtr);
+  return cast<llvm::Instruction>(VecMatrixMemToReg(VecVal, MatType, Builder));
+}
+
+llvm::Instruction *CreateVecMatrixStore(llvm::Value* VecVal, llvm::Value *VecPtr,
+  llvm::Type *MatType, llvm::IRBuilder<> &Builder)
+{
+  llvm::Type *VecMatMemTy = HLMatrixLower::LowerMatrixType(MatType, /*forMem*/true);
+  if (VecVal->getType() == VecMatMemTy) {
+    return Builder.CreateStore(VecVal, VecPtr);
+  }
+
+  // We need to convert to the memory representation, and we want to return
+  // the conversion instruction rather than the store since that's what
+  // accepts the register-typed i1 values.
+
+  // Do not use VecMatrixRegToMem as it may constant fold the conversion
+  // instruction, which is what we want to return.
+  DXASSERT(VecVal->getType()->getVectorElementType()->isIntegerTy(1),
+    "Vector matrix reg to mem type mismatch should only happen for bools.");
+
+  llvm::Instruction *ConvInst = Builder.Insert(new ZExtInst(VecVal, VecMatMemTy));
+  Builder.CreateStore(ConvInst, VecPtr);
+  return ConvInst;
+}
+
 Value *LowerGEPOnMatIndexListToIndex(
    llvm::GetElementPtrInst *GEP, ArrayRef<Value *> IdxList) {
  IRBuilder<> Builder(GEP);
@ -508,7 +566,7 @@ Instruction *HLMatrixLowerPass::MatLdStToVec(CallInst *CI) {
    if (isa<AllocaInst>(matPtr) || GetIfMatrixGEPOfUDTAlloca(matPtr) ||
        GetIfMatrixGEPOfUDTArg(matPtr, *m_pHLModule)) {
      Value *vecPtr = matToVecMap[cast<Instruction>(matPtr)];
-      result = Builder.CreateLoad(vecPtr);
+      result = CreateVecMatrixLoad(vecPtr, matPtr->getType()->getPointerElementType(), Builder);
    } else
      result = MatIntrinsicToVec(CI);
  } break;
@ -519,9 +577,8 @@ Instruction *HLMatrixLowerPass::MatLdStToVec(CallInst *CI) {
        GetIfMatrixGEPOfUDTArg(matPtr, *m_pHLModule)) {
      Value *vecPtr = matToVecMap[cast<Instruction>(matPtr)];
      Value *matVal = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      Value *vecVal =
-          UndefValue::get(HLMatrixLower::LowerMatrixType(matVal->getType()));
-      result = Builder.CreateStore(vecVal, vecPtr);
+      Value *vecVal = UndefValue::get(HLMatrixLower::LowerMatrixType(matVal->getType()));
+      result = CreateVecMatrixStore(vecVal, vecPtr, matVal->getType(), Builder);
    } else
      result = MatIntrinsicToVec(CI);
  } break;
@ -905,11 +962,11 @@ void HLMatrixLowerPass::lowerToVec(Instruction *matInst) {
    
    IRBuilder<> AllocaBuilder(AI);
    if (Ty->isArrayTy()) {
-      Type *vecTy = HLMatrixLower::LowerMatrixArrayPointer(AI->getType());
+      Type *vecTy = HLMatrixLower::LowerMatrixArrayPointer(AI->getType(), /*forMem*/ true);
      vecTy = vecTy->getPointerElementType();
      vecVal = AllocaBuilder.CreateAlloca(vecTy, nullptr, AI->getName());
    } else {
-      Type *vecTy = HLMatrixLower::LowerMatrixType(matTy);
+      Type *vecTy = HLMatrixLower::LowerMatrixType(matTy, /*forMem*/ true);
      vecVal = AllocaBuilder.CreateAlloca(vecTy, nullptr, AI->getName());
    }
    // Update debug info.
@ -2059,7 +2116,8 @@ void HLMatrixLowerPass::TranslateMatArrayGEP(Value *matInst,
          // Skip the vector version.
          if (useCall->getType()->isVectorTy())
            continue;
-          Value *newLd = Builder.CreateLoad(newGEP);
+          Type *matTy = useCall->getType();
+          Value *newLd = CreateVecMatrixLoad(newGEP, matTy, Builder);
          DXASSERT(matToVecMap.count(useCall), "must have vec version");
          Value *oldLd = matToVecMap[useCall];
          // Delete the oldLd.
@ -2082,7 +2140,7 @@ void HLMatrixLowerPass::TranslateMatArrayGEP(Value *matInst,

          DXASSERT(matToVecMap.count(matInst), "must have vec version");
          Value *vecVal = matToVecMap[matInst];
-          Builder.CreateStore(vecVal, vecPtr);
+          CreateVecMatrixStore(vecVal, vecPtr, matVal->getType(), Builder);
        } break;
        }
      } break;
@ -2174,9 +2232,17 @@ void HLMatrixLowerPass::replaceMatWithVec(Value *matVal,
          // Load Already translated in lowerToVec.
          // Store val operand will be set by the val use.
          // Do nothing here.
-        } else if (StoreInst *stInst = dyn_cast<StoreInst>(vecUser))
+        } else if (StoreInst *stInst = dyn_cast<StoreInst>(vecUser)) {
+          DXASSERT(vecVal->getType() == stInst->getValueOperand()->getType(),
+            "Mismatched vector matrix store value types.");
          stInst->setOperand(0, vecVal);
-        else
+        } else if (ZExtInst *zextInst = dyn_cast<ZExtInst>(vecUser)) {
+          // This happens when storing bool matrices,
+          // which must first undergo conversion from i1's to i32's.
+          DXASSERT(vecVal->getType() == zextInst->getOperand(0)->getType(),
+            "Mismatched vector matrix store value types.");
+          zextInst->setOperand(0, vecVal);
+        } else
          TrivialMatReplace(matVal, vecVal, useCall);

      } break;
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@ -4988,7 +4988,8 @@ Value *TranslateConstBufMatLd(Type *matType, Value *handle, Value *offset,
                              bool colMajor, OP *OP, const DataLayout &DL,
                              IRBuilder<> &Builder) {
  unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
  unsigned matSize = col * row;
  std::vector<Value *> elts(matSize);
  Value *EltByteSize = ConstantInt::get(
@ -5001,7 +5002,9 @@ Value *TranslateConstBufMatLd(Type *matType, Value *handle, Value *offset,
    baseOffset = Builder.CreateAdd(baseOffset, EltByteSize);
  }

-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value* Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }

 void TranslateCBGep(GetElementPtrInst *GEP, Value *handle, Value *baseOffset,
@ -5417,10 +5420,11 @@ Value *GenerateCBLoadLegacy(Value *handle, Value *legacyIdx,

 Value *TranslateConstBufMatLdLegacy(Type *matType, Value *handle,
                                    Value *legacyIdx, bool colMajor, OP *OP,
-                                    const DataLayout &DL,
+                                    bool memElemRepr, const DataLayout &DL,
                                    IRBuilder<> &Builder) {
  unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/memElemRepr)->getVectorElementType();

  unsigned matSize = col * row;
  std::vector<Value *> elts(matSize);
@ -5506,8 +5510,9 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
      Type *matType = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
                          ->getType()
                          ->getPointerElementType();
+      // This will replace a call, so we should use the register representation of elements
      Value *newLd = TranslateConstBufMatLdLegacy(
-          matType, handle, legacyIdx, colMajor, hlslOP, DL, Builder);
+          matType, handle, legacyIdx, colMajor, hlslOP, /*memElemRepr*/false, DL, Builder);
      CI->replaceAllUsesWith(newLd);
      CI->eraseFromParent();
    } else if (group == HLOpcodeGroup::HLSubscript) {
@ -5534,8 +5539,9 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,

      Value *ldData = UndefValue::get(resultType);
      if (!dynamicIndexing) {
+        // This will replace a load or GEP, so we should use the memory representation of elements
        Value *matLd = TranslateConstBufMatLdLegacy(
-            matType, handle, legacyIdx, colMajor, hlslOP, DL, Builder);
+            matType, handle, legacyIdx, colMajor, hlslOP, /*memElemRepr*/true, DL, Builder);
        // The matLd is keep original layout, just use the idx calc in
        // EmitHLSLMatrixElement and EmitHLSLMatrixSubscript.
        switch (subOp) {
@ -6022,7 +6028,8 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
                               Value *bufIdx, Value *baseOffset,
                               bool colMajor, const DataLayout &DL) {
  unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
  unsigned  EltSize = DL.getTypeAllocSize(EltTy);
  Constant* alignment = OP->GetI32Const(EltSize);

@ -6054,14 +6061,20 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
  }

-  return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Value *Vec = HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
+  Vec = HLMatrixLower::VecMatrixMemToReg(Vec, matType, Builder);
+  return Vec;
 }

 void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                             hlsl::OP *OP, Value *bufIdx, Value *baseOffset,
                             Value *val, bool colMajor, const DataLayout &DL) {
  unsigned col, row;
-  Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
+  HLMatrixLower::GetMatrixInfo(matType, col, row);
+  Type *EltTy = HLMatrixLower::LowerMatrixType(matType, /*forMem*/true)->getVectorElementType();
+
+  val = HLMatrixLower::VecMatrixRegToMem(val, matType, Builder);
+
  unsigned EltSize = DL.getTypeAllocSize(EltTy);
  Constant *Alignment = OP->GetI32Const(EltSize);
  Value *offset = baseOffset;
--- a/lib/HLSL/HLSignatureLower.cpp
+++ b/lib/HLSL/HLSignatureLower.cpp
@ -912,80 +912,54 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
    DXASSERT_NOMSG(group == HLOpcodeGroup::HLMatLoadStore);
    HLMatLoadStoreOpcode matOp = static_cast<HLMatLoadStoreOpcode>(opcode);
    switch (matOp) {
-    case HLMatLoadStoreOpcode::ColMatLoad: {
-      IRBuilder<> LocalBuilder(CI);
-      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
-                        ->getType()
-                        ->getPointerElementType();
-      unsigned col, row;
-      Type *EltTy = HLMatrixLower::GetMatrixInfo(matTy, col, row);
-      std::vector<Value *> matElts(col * row);
-      for (unsigned c = 0; c < col; c++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(c);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-        for (unsigned r = 0; r < row; r++) {
-          SmallVector<Value *, 4> args = {OpArg, ID, rowIdx, columnConsts[r]};
-          if (vertexID)
-            args.emplace_back(vertexID);
-
-          Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-          unsigned matIdx = c * row + r;
-          matElts[matIdx] = input;
-        }
-      }
-      Value *newVec =
-          HLMatrixLower::BuildVector(EltTy, col * row, matElts, LocalBuilder);
-      CI->replaceAllUsesWith(newVec);
-      CI->eraseFromParent();
-    } break;
+    case HLMatLoadStoreOpcode::ColMatLoad:
    case HLMatLoadStoreOpcode::RowMatLoad: {
      IRBuilder<> LocalBuilder(CI);
      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx)
                        ->getType()
                        ->getPointerElementType();
      unsigned col, row;
-      Type *EltTy = HLMatrixLower::GetMatrixInfo(matTy, col, row);
+      HLMatrixLower::GetMatrixInfo(matTy, col, row);
      std::vector<Value *> matElts(col * row);
-      for (unsigned r = 0; r < row; r++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(r);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
-        for (unsigned c = 0; c < col; c++) {
-          SmallVector<Value *, 4> args = {OpArg, ID, rowIdx, columnConsts[c]};
-          if (vertexID)
-            args.emplace_back(vertexID);

-          Value *input = LocalBuilder.CreateCall(ldStFunc, args);
-          unsigned matIdx = r * col + c;
-          matElts[matIdx] = input;
+      if (matOp == HLMatLoadStoreOpcode::ColMatLoad) {
+        for (unsigned c = 0; c < col; c++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(c);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned r = 0; r < row; r++) {
+            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[r] };
+            if (vertexID)
+              args.emplace_back(vertexID);
+
+            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+            unsigned matIdx = c * row + r;
+            matElts[matIdx] = input;
+          }
+        }
+      } else {
+        for (unsigned r = 0; r < row; r++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(r);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned c = 0; c < col; c++) {
+            SmallVector<Value *, 4> args = { OpArg, ID, rowIdx, columnConsts[c] };
+            if (vertexID)
+              args.emplace_back(vertexID);
+
+            Value *input = LocalBuilder.CreateCall(ldStFunc, args);
+            unsigned matIdx = r * col + c;
+            matElts[matIdx] = input;
+          }
        }
      }
+
      Value *newVec =
-          HLMatrixLower::BuildVector(EltTy, col * row, matElts, LocalBuilder);
+          HLMatrixLower::BuildVector(matElts[0]->getType(), col * row, matElts, LocalBuilder);
+      newVec = HLMatrixLower::VecMatrixMemToReg(newVec, matTy, LocalBuilder);
+
      CI->replaceAllUsesWith(newVec);
      CI->eraseFromParent();
    } break;
-    case HLMatLoadStoreOpcode::ColMatStore: {
-      IRBuilder<> LocalBuilder(CI);
-      Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
-      Type *matTy = CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx)
-                        ->getType()
-                        ->getPointerElementType();
-      unsigned col, row;
-      HLMatrixLower::GetMatrixInfo(matTy, col, row);
-
-      for (unsigned c = 0; c < col; c++) {
-        Constant *constColIdx = LocalBuilder.getInt32(c);
-        Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
-
-        for (unsigned r = 0; r < row; r++) {
-          unsigned matIdx = HLMatrixLower::GetColMajorIdx(r, c, row);
-          Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-          LocalBuilder.CreateCall(ldStFunc,
-                                  {OpArg, ID, colIdx, columnConsts[r], Elt});
-        }
-      }
-      CI->eraseFromParent();
-    } break;
+    case HLMatLoadStoreOpcode::ColMatStore:
    case HLMatLoadStoreOpcode::RowMatStore: {
      IRBuilder<> LocalBuilder(CI);
      Value *Val = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
@ -995,14 +969,30 @@ void GenerateInputOutputUserCall(InputOutputAccessInfo &info, Value *undefVertex
      unsigned col, row;
      HLMatrixLower::GetMatrixInfo(matTy, col, row);

-      for (unsigned r = 0; r < row; r++) {
-        Constant *constRowIdx = LocalBuilder.getInt32(r);
-        Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+      Val = HLMatrixLower::VecMatrixRegToMem(Val, matTy, LocalBuilder);
+
+      if (matOp == HLMatLoadStoreOpcode::ColMatStore) {
        for (unsigned c = 0; c < col; c++) {
-          unsigned matIdx = HLMatrixLower::GetRowMajorIdx(r, c, col);
-          Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
-          LocalBuilder.CreateCall(ldStFunc,
-                                  {OpArg, ID, rowIdx, columnConsts[c], Elt});
+          Constant *constColIdx = LocalBuilder.getInt32(c);
+          Value *colIdx = LocalBuilder.CreateAdd(idxVal, constColIdx);
+
+          for (unsigned r = 0; r < row; r++) {
+            unsigned matIdx = HLMatrixLower::GetColMajorIdx(r, c, row);
+            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+            LocalBuilder.CreateCall(ldStFunc,
+              { OpArg, ID, colIdx, columnConsts[r], Elt });
+          }
+        }
+      } else {
+        for (unsigned r = 0; r < row; r++) {
+          Constant *constRowIdx = LocalBuilder.getInt32(r);
+          Value *rowIdx = LocalBuilder.CreateAdd(idxVal, constRowIdx);
+          for (unsigned c = 0; c < col; c++) {
+            unsigned matIdx = HLMatrixLower::GetRowMajorIdx(r, c, col);
+            Value *Elt = LocalBuilder.CreateExtractElement(Val, matIdx);
+            LocalBuilder.CreateCall(ldStFunc,
+              { OpArg, ID, rowIdx, columnConsts[c], Elt });
+          }
        }
      }
      CI->eraseFromParent();
--- a/tools/clang/lib/AST/ASTContext.cpp
+++ b/tools/clang/lib/AST/ASTContext.cpp
@ -1568,7 +1568,6 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
    // Vector align to its element.
    if (getLangOpts().HLSL) {
      Align = EltInfo.Align;
-      Width = Align * VT->getNumElements();
    }
    // HLSL Change Ends.
    // If the alignment is not a power of 2, round up to the next power of 2.
--- a/tools/clang/lib/Basic/Targets.cpp
+++ b/tools/clang/lib/Basic/Targets.cpp
@ -6991,9 +6991,7 @@ public:
    LongWidth = LongAlign = 32;
    LongDoubleWidth = LongDoubleAlign = 64;
    LongDoubleFormat = &llvm::APFloat::IEEEdouble;
-    BoolWidth = 32;
-    // To avoid member for alignment.
-    BoolAlign = 8;
+    BoolWidth = BoolAlign = 32;

    // using the Microsoft ABI.
    TheCXXABI.set(TargetCXXABI::Microsoft);
--- a/tools/clang/lib/CodeGen/CGExpr.cpp
+++ b/tools/clang/lib/CodeGen/CGExpr.cpp
@ -1082,6 +1082,14 @@ static bool hasBooleanRepresentation(QualType Ty) {
  return false;
 }

+// HLSL Change Begin.
+static bool hasBooleanScalarOrVectorRepresentation(QualType Ty) {
+  if (hlsl::IsHLSLVecType(Ty))
+    return hasBooleanRepresentation(hlsl::GetElementTypeOrType(Ty));
+  return hasBooleanRepresentation(Ty);
+}
+// HLSL Change End.
+
 static bool getRangeForType(CodeGenFunction &CGF, QualType Ty,
                            llvm::APInt &Min, llvm::APInt &End,
                            bool StrictEnums) {
@ -1233,30 +1241,31 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(llvm::Value *Addr, bool Volatile,
 }

 llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) {
-  // Bool has a different representation in memory than in registers.
-  if (hasBooleanRepresentation(Ty)) {
+  // HLSL Change Begin.
+  // Bool scalar and vectors have a different representation in memory than in registers.
+  if (hasBooleanScalarOrVectorRepresentation(Ty)) {
    // This should really always be an i1, but sometimes it's already
    // an i8, and it's awkward to track those cases down.
-    if (Value->getType()->isIntegerTy(1))
+    llvm::Type *ValTy = Value->getType();
+    llvm::Type *VecElemTy = ValTy->isVectorTy() ? ValTy->getVectorElementType() : ValTy;
+    if (VecElemTy->isIntegerTy(1))
      return Builder.CreateZExt(Value, ConvertTypeForMem(Ty), "frombool");
-    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
-           "wrong value rep of bool");
  }
+  // HLSL Change End.

  return Value;
 }

 llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) {
-  // Bool has a different representation in memory than in registers.
-  if (hasBooleanRepresentation(Ty)) {
-    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
-           "wrong value rep of bool");
-    // HLSL Change Begin.
+  // HLSL Change Begin.
+  // Bool scalar and vectors have a different representation in memory than in registers.
+  if (hasBooleanScalarOrVectorRepresentation(Ty)) {
+    llvm::Type *ValTy = Value->getType();
    // Use ne v, 0 to convert to i1 instead of trunc.
    return Builder.CreateICmpNE(
-        Value, llvm::ConstantInt::get(Value->getType(), 0), "tobool");
-    // HLSL Change End.
+        Value, llvm::ConstantVector::getNullValue(ValTy), "tobool");
  }
+  // HLSL Change End.

  return Value;
 }
@ -1475,6 +1484,8 @@ RValue CodeGenFunction::EmitLoadOfExtVectorElementLValue(LValue LV) {
  Load->setAlignment(LV.getAlignment().getQuantity());
  llvm::Value *Vec = Load;

+  Vec = EmitFromMemory(Vec, LV.getType()); // HLSL Change
+
  const llvm::Constant *Elts = LV.getExtVectorElts();

  // If the result of the expression is a non-vector type, we must be extracting
@ -1748,7 +1759,10 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src,
  const llvm::Constant *Elts = Dst.getExtVectorElts();

  llvm::Value *SrcVal = Src.getScalarVal();
+
  // HLSL Change Starts
+  SrcVal = EmitToMemory(SrcVal, Dst.getType());
+
  const VectorType *VTy = Dst.getType()->getAs<VectorType>();
  if (VTy == nullptr && getContext().getLangOpts().HLSL)
    VTy =
@ -2918,11 +2932,12 @@ CodeGenFunction::EmitHLSLVectorElementExpr(const HLSLVectorElementExpr *E) {
    assert(hlsl::IsHLSLVecType(E->getBase()->getType()) &&
           "Result must be a vector");
    llvm::Value *Vec = EmitScalarExpr(E->getBase());
+    Vec = EmitToMemory(Vec, E->getBase()->getType());

    // Store the vector to memory (because LValue wants an address).
-    llvm::Value *VecMem = CreateMemTemp(E->getBase()->getType());
-    Builder.CreateStore(Vec, VecMem);
-    Base = MakeAddrLValue(VecMem, E->getBase()->getType());
+    llvm::Value *VecMemPtr = CreateMemTemp(E->getBase()->getType());
+    Builder.CreateStore(Vec, VecMemPtr);
+    Base = MakeAddrLValue(VecMemPtr, E->getBase()->getType());
  }

  QualType type =
--- a/tools/clang/lib/CodeGen/CGHLSLMS.cpp
+++ b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
@ -3838,6 +3838,15 @@ static Value *CastLdValue(Value *Ptr, llvm::Type *FromTy, llvm::Type *ToTy, IRBu
      // Change scalar into vec1.
      Value *Vec1 = UndefValue::get(ToTy);
      return Builder.CreateInsertElement(Vec1, V, (uint64_t)0);
+    } else if (vecSize == 1 && FromTy->isIntegerTy()
+      && ToTy->getVectorElementType()->isIntegerTy(1)) {
+      // load(bitcast i32* to <1 x i1>*)
+      // Rewrite to
+      // insertelement(icmp ne (load i32*), 0)
+      Value *IntV = Builder.CreateLoad(Ptr);
+      Value *BoolV = Builder.CreateICmpNE(IntV, ConstantInt::get(IntV->getType(), 0), "tobool");
+      Value *Vec1 = UndefValue::get(ToTy);
+      return Builder.CreateInsertElement(Vec1, BoolV, (uint64_t)0);
    } else if (FromTy->isVectorTy() && vecSize == 1) {
      Value *V = Builder.CreateLoad(Ptr);
      // VectorTrunc
@ -5468,15 +5477,20 @@ static void AddMissingCastOpsInInitList(SmallVector<Value *, 4> &elts, SmallVect

 static void StoreInitListToDestPtr(Value *DestPtr,
                                   SmallVector<Value *, 4> &elts, unsigned &idx,
-                                   QualType Type, CodeGenTypes &Types, bool bDefaultRowMajor,
-                                   CGBuilderTy &Builder, llvm::Module &M) {
+                                   QualType Type, bool bDefaultRowMajor,
+                                   CodeGenFunction &CGF, llvm::Module &M) {
+  CodeGenTypes &Types = CGF.getTypes();
+  CGBuilderTy &Builder = CGF.Builder;
+
  llvm::Type *Ty = DestPtr->getType()->getPointerElementType();
  llvm::Type *i32Ty = llvm::Type::getInt32Ty(Ty->getContext());

  if (Ty->isVectorTy()) {
-    Value *Result = UndefValue::get(Ty);
-    for (unsigned i = 0; i < Ty->getVectorNumElements(); i++)
+    llvm::Type *RegTy = CGF.ConvertType(Type);
+    Value *Result = UndefValue::get(RegTy);
+    for (unsigned i = 0; i < RegTy->getVectorNumElements(); i++)
      Result = Builder.CreateInsertElement(Result, elts[idx + i], i);
+    Result = CGF.EmitToMemory(Result, Type);
    Builder.CreateStore(Result, DestPtr);
    idx += Ty->getVectorNumElements();
  } else if (HLMatrixLower::IsMatrixType(Ty)) {
@ -5541,8 +5555,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
            unsigned i = RL.getNonVirtualBaseLLVMFieldNo(BaseDecl);
            Constant *gepIdx = ConstantInt::get(i32Ty, i);
            Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-            StoreInitListToDestPtr(GEP, elts, idx, parentTy, Types,
-                                   bDefaultRowMajor, Builder, M);
+            StoreInitListToDestPtr(GEP, elts, idx, parentTy,
+                                   bDefaultRowMajor, CGF, M);
          }
        }
      }
@ -5550,8 +5564,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
        unsigned i = RL.getLLVMFieldNo(field);
        Constant *gepIdx = ConstantInt::get(i32Ty, i);
        Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-        StoreInitListToDestPtr(GEP, elts, idx, field->getType(), Types,
-                               bDefaultRowMajor, Builder, M);
+        StoreInitListToDestPtr(GEP, elts, idx, field->getType(),
+                               bDefaultRowMajor, CGF, M);
      }
    }
  } else if (Ty->isArrayTy()) {
@ -5560,8 +5574,8 @@ static void StoreInitListToDestPtr(Value *DestPtr,
    for (unsigned i = 0; i < Ty->getArrayNumElements(); i++) {
      Constant *gepIdx = ConstantInt::get(i32Ty, i);
      Value *GEP = Builder.CreateInBoundsGEP(DestPtr, {zero, gepIdx});
-      StoreInitListToDestPtr(GEP, elts, idx, EltType, Types, bDefaultRowMajor,
-                             Builder, M);
+      StoreInitListToDestPtr(GEP, elts, idx, EltType, bDefaultRowMajor,
+                             CGF, M);
    }
  } else {
    DXASSERT(Ty->isSingleValueType(), "invalid type");
@ -5741,8 +5755,8 @@ Value *CGMSHLSLRuntime::EmitHLSLInitListExpr(CodeGenFunction &CGF, InitListExpr
    ParamList.append(EltValList.begin(), EltValList.end());
    idx = 0;
    bool bDefaultRowMajor = m_pHLModule->GetHLOptions().bDefaultRowMajor;
-    StoreInitListToDestPtr(DestPtr, EltValList, idx, ResultTy, CGF.getTypes(),
-                           bDefaultRowMajor, CGF.Builder, TheModule);
+    StoreInitListToDestPtr(DestPtr, EltValList, idx, ResultTy,
+                           bDefaultRowMajor, CGF, TheModule);
    return nullptr;
  }

@ -7077,15 +7091,10 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionInit(
    BasicBlock *InsertBlock = CGF.Builder.GetInsertBlock();
    Function *F = InsertBlock->getParent();

-    if (ParamTy->isBooleanType()) {
-      // Create i32 for bool.
-      ParamTy = CGM.getContext().IntTy;
-    }
    // Make sure the alloca is in entry block to stop inline create stacksave.
    IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(F));
-    tmpArgAddr = AllocaBuilder.CreateAlloca(CGF.ConvertType(ParamTy));
+    tmpArgAddr = AllocaBuilder.CreateAlloca(CGF.ConvertTypeForMem(ParamTy));

-      
    // add it to local decl map
    TmpArgMap(tmpArg, tmpArgAddr);

@ -7164,6 +7173,8 @@ void CGMSHLSLRuntime::EmitHLSLOutParamConversionCopyBack(
        else
          outVal = EmitHLSLMatrixLoad(CGF, tmpArgAddr, ParamTy);

+        outVal = CGF.EmitFromMemory(outVal, ParamTy);
+
        llvm::Type *ToTy = CGF.ConvertType(ArgTy);
        llvm::Type *FromTy = outVal->getType();
        Value *castVal = outVal;
--- a/tools/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/tools/clang/lib/CodeGen/CodeGenTypes.cpp
@ -109,15 +109,26 @@ void CodeGenTypes::addRecordTypeName(const RecordDecl *RD,
 /// a type.  For example, the scalar representation for _Bool is i1, but the
 /// memory representation is usually i8 or i32, depending on the target.
 llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) {
+  // HLSL Change Starts
+  if (hlsl::IsHLSLVecType(T)) {
+    // Vectors of bools in memory should become vectors of
+    // the memory representation of the elements.
+    // Clang doesn't do this for plain VectorTypes,
+    // which is fine otherwise a bool1x1 matrix would become
+    // [n x <m x i32>] since array elements always have memory representation.
+    QualType ElemT = hlsl::GetElementTypeOrType(T);
+    return llvm::VectorType::get(ConvertTypeForMem(ElemT), hlsl::GetHLSLVecSize(T));
+  }
+
  llvm::Type *R = ConvertType(T);

-  // If this is a non-bool type, don't map it.
-  if (!R->isIntegerTy(1))
-    return R;
+  if (R->isIntegerTy(1)) {
+    // Bools have a different representation in memory
+    return llvm::IntegerType::get(getLLVMContext(), (unsigned)Context.getTypeSize(T));
+  }

-  // Otherwise, return an integer of the target-specified size.
-  return llvm::IntegerType::get(getLLVMContext(),
-                                (unsigned)Context.getTypeSize(T));
+  return R;
+  // HLSL Change Ends
 }


--- a/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
+++ b/tools/clang/test/CodeGenHLSL/RValSubscript.hlsl
@ -5,7 +5,7 @@
 // CHECK: i32 5)
 // CHECK: extractvalue
 // CHECK: , 2
-// CHECK: icmp eq
+// CHECK: icmp ne
 // CHECK 0

 // For (x4 < 3)[1]
@ -47,7 +47,7 @@
 // CHECK: fcmp fast oeq
 // CHECK: fcmp fast oeq
 // CHECK: fcmp fast oeq
-// CHECK: alloca [16 x i1]
+// CHECK: alloca [16 x i32]


 float4x4 xt;
--- a/tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_loadbuf_storebuf_memrepr.hlsl
@ -0,0 +1,110 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// Ensure that bools are converted from/to their mem representation when loaded/stored in buffers
+
+// Constant buffer loads
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+// CHECK: extractvalue %dx.types.CBufRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+// Structured buffer loads
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32
+// CHECK: extractvalue %dx.types.ResRet.i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+// Structured buffer stores
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: call void @dx.op.bufferStore.i32
+  
+
+struct AllTheBools
+{
+    bool2x2 m;
+    bool2 v;
+    bool s;
+    bool2x2 ma[2];
+    bool2 va[2];
+    bool sa[2];
+};
+
+ConstantBuffer<AllTheBools> cb;
+StructuredBuffer<AllTheBools> sb;
+RWStructuredBuffer<AllTheBools> rwsb;
+
+float main(int i : I) : SV_Target
+{
+    float result = 0;
+
+    // Constant buffer loads
+    if (cb.m._22 && cb.v.y && cb.s
+        && cb.ma[1]._22 && cb.va[1].y && cb.sa[1])
+    {
+        result++;
+    }
+    
+    // Structured buffer loads
+    if (sb[0].m._22 && sb[0].v.y && sb[0].s
+        && sb[0].ma[1]._22 && sb[0].va[1].y && sb[0].sa[1])
+    {
+        result++;
+    }
+
+    // Structured buffer stores
+    if (result >= 1.0f)
+    {
+        rwsb[0].m._22 = i == 42;
+        rwsb[0].v.y = i == 42;
+        rwsb[0].s = i == 42;
+        rwsb[0].ma[1]._22 = i == 42;
+        rwsb[0].va[1].y = i == 42;
+        rwsb[0].sa[1] = i == 42;
+    }
+
+    return 0;
+}
--- a/tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_memrepr.hlsl
@ -0,0 +1,52 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// Ensure that bools are converted from/to their memory representation when loaded/stored
+
+// Local variables should never be i1s
+// CHECK-not: alloca {{.*}}i1
+
+// Test stores
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+// CHECK: icmp eq i32 {{.*}}, 42
+// CHECK: zext i1 {{.*}} to i32
+// CHECK: store i32
+
+// Test loads
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: load i32
+// CHECK: icmp ne i32 {{.*}}, 0
+
+float main(int i : I) : SV_Target
+{
+    bool s = i == 42;
+    bool1 v = i == 42;
+    bool1x1 m = i == 42;
+    bool sa[1] = { i == 42 };
+    bool1 va[1] = { i == 42 };
+    bool1x1 ma[1] = { i == 42 };
+
+    return s && v.x && m._11 && sa[0] && va[0].x && ma[0]._11 ? 1.0f : 2.0f;
+}
--- a/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_scalar_swizzle.hlsl
@ -0,0 +1,15 @@
+// RUN: %dxc -E main -T ps_6_0 -O0 %s | FileCheck %s
+
+// This is mostly a regression test for a bug where a bitcast
+// from i32* to i1* was emitted.
+
+// CHECK: alloca i32
+// CHECK: alloca [2 x i32]
+// CHECK-NOT: bitcast
+
+float main() : SV_Target
+{
+    bool b = true;
+    bool2 b2 = b.xx;
+    return 0;
+}
--- a/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
+++ b/tools/clang/test/CodeGenHLSL/quick-test/bool_stress.hlsl
@ -0,0 +1,54 @@
+// RUN: %dxc -E main -T vs_6_0 -O0 %s
+
+// Regression test for compiler crashes in complex bool cases
+
+struct AllTheBools
+{
+    bool b : B;
+    bool ba2[2] :BA2;
+    bool1 b1 : B1;
+    bool3 b3 : B3;
+    bool3 b3a2[2] : B3A2;
+    bool1x1 b1x1 : B1X1;
+    bool2x3 b2x3 : B2X3;
+    row_major bool2x3 rmb2x3 : RMB2X3;
+    bool2x3 b2x3a2[2] : B2X3A2;
+};
+
+ConstantBuffer<AllTheBools> cb;
+StructuredBuffer<AllTheBools> sb;
+
+void not(in out bool value) { value = !value; }
+
+void not(in out bool2 value)
+{
+  value = !value;
+  not(value.x);
+  not(value.y);
+}
+
+void not(in out bool3 value)
+{
+  not(value.xz);
+  not(value.y);
+}
+
+AllTheBools main(AllTheBools input, float f : F)
+{
+    AllTheBools output;
+    output.b = input.b ? cb.b : sb[0].b;
+    output.ba2[1] = input.b;
+    output.ba2[0] = input.ba2[1];
+    output.b1 = input.b3.y;
+    output.b3 = input.b.xxx;
+    output.b3a2 = sb[0].b3a2;
+    if (sb[0].b) return cb;
+
+    output.b1x1 = cb.b2x3._22;
+    output.b2x3 = bool2x3(sb[0].b3, bool3(f > 2, input.b, false));
+    output.rmb2x3 = input.b2x3;
+    not(output.rmb2x3[0]);
+    output.b2x3a2[1] = cb.b2x3;
+    output.b2x3a2[0] = input.b2x3;
+    return output;
+}
--- a/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
+++ b/tools/clang/test/CodeGenHLSL/quick-test/global-var-write-test04.hlsl
@ -7,7 +7,7 @@
 // CHECK: {{.*g_v.*}} = external constant <4 x float>, align 4
 // CHECK: {{.*g_m1.*}} = external constant %class.matrix.int.2.2, align 4
 // CHECK: {{.*g_m2.*}} = external constant %class.matrix.int.2.2, align 4
-// CHECK: {{.*g_b.*}} = external constant i32, align 1
+// CHECK: {{.*g_b.*}} = external constant i32, align 4
 // CHECK: {{.*g_a.*}} = external constant [5 x i32], align 4
 // CHECK: {{.*g_a2d.*}} = external constant [3 x [2 x i32]], align 4
 // CHECK-NOT: {{(.*g_s1.*)(.*static.copy.*)}} = internal global float 0.000000e+00