PIX: Optimize debug instrumentation for fewest emitted instructions (#6281)

This change switches from instrumentation per instruction to instrumentation per basic block. Furthermore, not every instruction in a basic block needs to write debug data to the output UAV: it's enough to know that the basic block was entered, as long as the calling application can figure out which instructions were in that block. To support that knowledge, the pass now emits a text "precis" of each basic block. Also, the previous branchless UAV bounds enforcement was replaced with a similar scheme that emits fewer instructions at the cost of a larger UAV. This tradeoff is WELL worth it. Additionally, the debug pass used to add extra blocks in order to solidify the arguments to phi instructions. This work was unnecessary, and added a lot of complexity to the resulting instrumented shader. The debugger application is only interested in the value of the phi itself and the actual value produced via the actual preceding edge. Full details are in the comments in the code. This change reduces driver-side compilation overhead from "overnight" to 2 minutes on a 160k-instruction shader.
2024-02-13 08:08:22 -08:00 · 2024-02-13 08:08:22 -08:00 · 446da195b4
--- a/include/dxc/DxilPIXPasses/DxilPIXVirtualRegisters.h
+++ b/include/dxc/DxilPIXPasses/DxilPIXVirtualRegisters.h
@ -28,7 +28,7 @@ static constexpr uint32_t ID = 3;

 void AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI,
           std::uint32_t InstNum);
-bool FromInst(llvm::Instruction *pI, std::uint32_t *pInstNum);
+bool FromInst(llvm::Instruction const *pI, std::uint32_t *pInstNum);
 } // namespace PixDxilInstNum

 namespace PixDxilReg {
@ -36,7 +36,7 @@ static constexpr char MDName[] = "pix-dxil-reg";
 static constexpr uint32_t ID = 0;

 void AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI, std::uint32_t RegNum);
-bool FromInst(llvm::Instruction *pI, std::uint32_t *pRegNum);
+bool FromInst(llvm::Instruction const *pI, std::uint32_t *pRegNum);
 } // namespace PixDxilReg

 namespace PixAllocaReg {
@ -45,7 +45,7 @@ static constexpr uint32_t ID = 1;

 void AddMD(llvm::LLVMContext &Ctx, llvm::AllocaInst *pAlloca,
           std::uint32_t RegNum, std::uint32_t Count);
-bool FromInst(llvm::AllocaInst *pAlloca, std::uint32_t *pRegBase,
+bool FromInst(llvm::AllocaInst const *pAlloca, std::uint32_t *pRegBase,
              std::uint32_t *pRegSize);
 } // namespace PixAllocaReg

--- a/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp
+++ b/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp
@ -989,15 +989,6 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M,
  }
 }

-class ScopedInstruction {
-  llvm::Instruction *m_Instruction;
-
-public:
-  ScopedInstruction(llvm::Instruction *I) : m_Instruction(I) {}
-  ~ScopedInstruction() { delete m_Instruction; }
-  llvm::Instruction *Get() const { return m_Instruction; }
-};
-
 struct GlobalVariableAndStorage {
  llvm::DIGlobalVariable *DIGV;
  OffsetInBits Offset;
--- a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
+++ b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
--- a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
+++ b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
@ -33,7 +33,7 @@ void pix_dxil::PixDxilInstNum::AddMD(llvm::LLVMContext &Ctx,
                         llvm::ConstantAsMetadata::get(B.getInt32(InstNum))}));
 }

-bool pix_dxil::PixDxilInstNum::FromInst(llvm::Instruction *pI,
+bool pix_dxil::PixDxilInstNum::FromInst(llvm::Instruction const *pI,
                                        std::uint32_t *pInstNum) {
  *pInstNum = 0;

@ -73,7 +73,7 @@ void pix_dxil::PixDxilReg::AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI,
                         llvm::ConstantAsMetadata::get(B.getInt32(RegNum))}));
 }

-bool pix_dxil::PixDxilReg::FromInst(llvm::Instruction *pI,
+bool pix_dxil::PixDxilReg::FromInst(llvm::Instruction const *pI,
                                    std::uint32_t *pRegNum) {
  *pRegNum = 0;

@ -141,7 +141,7 @@ void pix_dxil::PixAllocaReg::AddMD(llvm::LLVMContext &Ctx,
                         llvm::ConstantAsMetadata::get(B.getInt32(Count))}));
 }

-bool pix_dxil::PixAllocaReg::FromInst(llvm::AllocaInst *pAlloca,
+bool pix_dxil::PixAllocaReg::FromInst(llvm::AllocaInst const *pAlloca,
                                      std::uint32_t *pRegBase,
                                      std::uint32_t *pRegSize) {
  *pRegBase = 0;
--- a/lib/DxilPIXPasses/PixPassHelpers.cpp
+++ b/lib/DxilPIXPasses/PixPassHelpers.cpp
@ -37,9 +37,10 @@ using namespace llvm;
 using namespace hlsl;

 namespace PIXPassHelpers {
-bool IsAllocateRayQueryInstruction(llvm::Value *Val) {
+bool IsAllocateRayQueryInstruction(llvm::Value const *Val) {
  if (Val != nullptr) {
-    if (llvm::Instruction *Inst = llvm::dyn_cast<llvm::Instruction>(Val)) {
+    if (llvm::Instruction const *Inst =
+            llvm::dyn_cast<llvm::Instruction>(Val)) {
      return hlsl::OP::IsDxilOpFuncCallInst(Inst,
                                            hlsl::OP::OpCode::AllocateRayQuery);
    }
--- a/lib/DxilPIXPasses/PixPassHelpers.h
+++ b/lib/DxilPIXPasses/PixPassHelpers.h
@ -20,7 +20,17 @@
 #endif

 namespace PIXPassHelpers {
-bool IsAllocateRayQueryInstruction(llvm::Value *Val);
+
+// Owns the llvm::Instruction passed to its constructor and deletes it when
+// this object is destroyed.
+class ScopedInstruction {
+  llvm::Instruction *m_Instruction;
+
+public:
+  // Takes ownership of I.
+  ScopedInstruction(llvm::Instruction *I) : m_Instruction(I) {}
+  ~ScopedInstruction() { delete m_Instruction; }
+
+  // Not copyable: a copy would hold the same pointer, and both destructors
+  // would delete it.
+  ScopedInstruction(ScopedInstruction const &) = delete;
+  ScopedInstruction &operator=(ScopedInstruction const &) = delete;
+
+  // Returns the owned instruction; ownership stays with this object.
+  llvm::Instruction *Get() const { return m_Instruction; }
+};
+
+bool IsAllocateRayQueryInstruction(llvm::Value const *Val);
 llvm::CallInst *CreateUAV(hlsl::DxilModule &DM, llvm::IRBuilder<> &Builder,
                          unsigned int registerId, const char *name);
 llvm::CallInst *CreateHandleForResource(hlsl::DxilModule &DM,
--- a/tools/clang/test/HLSLFileCheck/pix/DebugBasic.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugBasic.hlsl
@ -1,4 +1,4 @@
-// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation | %FileCheck %s
+// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,UAVSize=128 | %FileCheck %s

 // Check that the basic starting header is present:

@ -10,17 +10,18 @@
 // CHECK: %CompareToX = icmp eq i32 %XIndex, 0
 // CHECK: %CompareToY = icmp eq i32 %YIndex, 0
 // CHECK: %ComparePos = and i1 %CompareToX, %CompareToY
-// CHECK: %OffsetMultiplicand = zext i1 %ComparePos to i32
-// CHECK: %ComplementOfMultiplicand = sub i32 1, %OffsetMultiplicand
-// CHECK: %OffsetAddend = mul i32 983040, %ComplementOfMultiplicand
-// CHECK: %IncrementForThisInvocation = mul i32 8, %OffsetMultiplicand

-// Check the first instruction was instrumented:
-// CHECK: %UAVIncResult = call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0
-// CHECK: %MaskedForUAVLimit = and i32 %UAVIncResult, 983039
-// CHECK: %MultipliedForInterest = mul i32 %MaskedForUAVLimit, %OffsetMultiplicand
-// CHECK: %AddedForInterest = add i32 %MultipliedForInterest, %OffsetAddend
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 %AddedForInterest
+
+// Check for branches-for-interest and AND value and counter location for a UAV size of 128
+// CHECK: br i1 %ComparePos, label %PIXInterestingBlock, label %PIXNonInterestingBlock
+// CHECK: %PIXOffsetOr = phi i32 [ 0, %PIXInterestingBlock ], [ 64, %PIXNonInterestingBlock ]
+// CHECK: %PIXCounterLocation = phi i32 [ 63, %PIXInterestingBlock ], [ 127, %PIXNonInterestingBlock ]
+
+// Check the first block header was emitted: (increment, AND + OR)
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0
+// CHECK: and i32 
+// CHECK: or i32
+


 [RootSignature("")]
--- a/tools/clang/test/HLSLFileCheck/pix/DebugFlowControl.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugFlowControl.hlsl
@ -2,15 +2,31 @@

 // Check that flow control constructs don't break the instrumentation.

-// CHECK:  %UAVIncResult2 = call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0
+// CHECK:  call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0

-// CHECK:  %MaskedForUAVLimit3 = and i32 %UAVIncResult2, 983039
+// There should be several blocks that have instrumentation:

-// CHECK:  %MultipliedForInterest4 = mul i32 %MaskedForUAVLimit3, %OffsetMultiplicand
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle

-// CHECK:  %AddedForInterest5 = add i32 %MultipliedForInterest4, %OffsetAddend
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle

-// CHECK:  call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 %AddedForInterest5


 struct VS_OUTPUT_ENV {
--- a/tools/clang/test/HLSLFileCheck/pix/DebugInstrumentRet.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugInstrumentRet.hlsl
@ -1,15 +0,0 @@
-// RUN: %dxc -T ps_6_3 %s | %opt -S -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation,parameter0=10,parameter1=20,parameter2=30 | %FileCheck %s
-
-
-
-
-// The ret's instruction number should be 4 (the last integer on this line):
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 {{.*}}, i32 undef, i32 4
-// But we'll check that instruction number:
-// CHECK: ret void, !pix-dxil-inst-num [[RetInstNum:![0-9]+]]
-// CHECK-DAG: [[RetInstNum]] = !{i32 3, i32 4}
-
-
-float4 main() : SV_Target {
-  return float4(0, 0, 0, 0);
-}
--- a/tools/clang/test/HLSLFileCheck/pix/DebugLimitedInstructionOverrides.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugLimitedInstructionOverrides.hlsl
@ -1,19 +1,19 @@
 // The PIX debug instrumentation pass takes optional arguments that limit the range of instruction numbers that will be instrumented.
 // (This is to cope with extremely large shaders, the instrumentation of which will break, either by out-of-memory or by TDRing when run.)

-// RUN: %dxc -EFlowControlPS -Tps_6_0 %s | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation,FirstInstruction=6,LastInstruction=9 | %FileCheck %s
+// RUN: %dxc -EFlowControlPS -Tps_6_0 %s | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation,FirstInstruction=4,LastInstruction=20 | %FileCheck %s

-// The only instrumented instructions should have instruction numbers in the range [6,9):
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, {{.*}}, i32 undef, i32 6
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, {{.*}}, i32 undef, i32 7
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, {{.*}}, i32 undef, i32 8
+// The only instrumented blocks should have instruction numbers in the range [4,20):

-// Two more stores to finish off the instrumentation for instruction #8:
-// CHECK: call void @dx.op.bufferStore.f32
-// CHECK: call void @dx.op.bufferStore.i32
+// Skip over the preamble
+// CHECK: switch i32
+// 
+// Now there should be exactly two more instrumented blocks (two increments of the counter UAV entry)
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle

-// Then no more instrumentation at all:
-// CHECK-NOT: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle
+// Then no more instrumentation at all (i.e. no more increments of the counter UAV entry):
+// CHECK-NOT: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle

 struct VS_OUTPUT_ENV {
  float4 Pos : SV_Position;
--- a/tools/clang/test/HLSLFileCheck/pix/DebugPhisFor.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugPhisFor.hlsl
@ -1,59 +0,0 @@
-// RUN: %dxc -EForLoopPS -Tps_6_0 %s -Od | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation | %FileCheck %s
-
-// Ensure that the pass added at the begining of the for body:
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-
-// Followed by lots of new pix debug blocks:
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-
-struct VS_OUTPUT_ENV {
-  float4 Pos : SV_Position;
-  float2 Tex : TEXCOORD0;
-};
-
-uint i32;
-
-float4 ForLoopPS(VS_OUTPUT_ENV input) : SV_Target {
-  float4 ret = float4(0, 0, 0, 0);
-  for (uint i = 0; i < abs(input.Tex.x * 200); ++i) {
-    ret.x += (float)i32;
-    if (i + i32 == 0) {
-      break;
-    }
-    ret.y += (float)i32;
-    if (i + i32 == 1) {
-      continue;
-    }
-    ret.z += (float)i32;
-    if (i + i32 == 2) {
-      break;
-    }
-    ret.w += (float)i32;
-    if (i + i32 == 3) {
-      continue;
-    }
-  }
-  return ret;
-}
--- a/tools/clang/test/HLSLFileCheck/pix/DebugPhisIfElse.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugPhisIfElse.hlsl
@ -1,27 +0,0 @@
-// RUN: %dxc -EFlowControlPS -Tps_6_0 %s -Od | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation | %FileCheck %s
-
-// Ensure that the pass added a block at the end of this if/else:
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-
-// Check that block 0 emits some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// Check that block 1 emits some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-
-float4 FlowControlPS(in uint value : value ) : SV_Target
-{
-  float4 ret = float4(0, 0, 0, 0);
-  if (value > 1) {
-    ret = float4(0, 0, 0, 2);
-  } else {
-    ret = float4(0, 0, 0, 1);
-  }
-  return ret;
-}
--- a/tools/clang/test/HLSLFileCheck/pix/DebugPhisSwicth.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugPhisSwicth.hlsl
@ -1,40 +0,0 @@
-// RUN: %dxc -EFlowControlPS -Tps_6_0 %s -Od | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation | %FileCheck %s
-
-// Check for a branch to a new block for each case:
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-
-// Check that three PIXDebug blocks emit some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// Check that three PIXDebug blocks emit some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// Check that three PIXDebug blocks emit some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-
-float4 FlowControlPS(in uint value : value ) : SV_Target
-{
-  float4 ret = float4(0, 0, 0, 0);
-  switch (value)
-  {
-  case 0:
-    ret = float4(1, 0, 0, 0);
-    break;
-  case 1:
-    ret = float4(2, 0, 0, 0);
-    break;
-  default:
-    ret = float4(3, 0, 0, 0);
-    break;
-  }
-  return ret;
-}
--- a/tools/clang/test/HLSLFileCheck/pix/DebugUAVSize.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugUAVSize.hlsl
@ -1,11 +1,10 @@
-// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,UAVSize=100000 | %FileCheck %s
+// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,UAVSize=1024 | %FileCheck %s

-// Check that the UAV size is reflected in the instrumentation. (Should be passed-in size - 64k)
-// (The offset here is the "dumping ground" for non-interesting invocations)
-// 100,000 - 65.536 = 34,464
-
-// CHECK: %OffsetAddend = mul i32 34464, %ComplementOfMultiplicand
+// Check that the UAV size is reflected in the instrumentation.
+// The AND should be (1024/4-1), and the OR should be 1024/2:

+// CHECK: %PIXOffsetOr = phi i32 [ 0, %PIXInterestingBlock ], [ 512, %PIXNonInterestingBlock ]
+// CHECK: and i32 {{.*}}, 255

 [RootSignature("")]
 float4 main() : SV_Target {
--- a/tools/clang/test/HLSLFileCheck/pix/DebugVSParameters.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugVSParameters.hlsl
@ -8,7 +8,6 @@
 // CHECK: %CompareToVertId = icmp eq i32 %VertId, 1
 // CHECK: %CompareToInstanceId = icmp eq i32 %InstanceId, 2
 // CHECK: %CompareBoth = and i1 %CompareToVertId, %CompareToInstanceId
-// CHECK: %OffsetMultiplicand = zext i1 %CompareBoth to i32


 [RootSignature("")]
--- a/tools/clang/unittests/HLSL/PixTest.cpp
+++ b/tools/clang/unittests/HLSL/PixTest.cpp
@ -131,6 +131,9 @@ public:

  TEST_METHOD(DxilPIXDXRInvocationsLog_SanityTest)

+  TEST_METHOD(DebugInstrumentation_TextOutput)
+  TEST_METHOD(DebugInstrumentation_BlockReport)
+
  dxc::DxcDllSupport m_dllSupport;
  VersionSupportInfo m_ver;

@ -188,6 +191,32 @@ public:
        std::move(pOptimizedModule), {}, Tokenize(outputText.c_str(), "\n")};
  }

+  // Runs the optimizer over `dxil` with the options -opt-mod-passes,
+  // -dxil-dbg-value-to-dbg-declare, -dxil-annotate-with-virtual-regs and
+  // -hlsl-dxil-debug-instrumentation,UAVSize=<UAVSize>. Returns the resulting
+  // module blob plus the optimizer's text output tokenized on '\n'.
+  PassOutput RunDebugPass(IDxcBlob *dxil, int UAVSize = 1024 * 1024) {
+    CComPtr<IDxcOptimizer> pOptimizer;
+    VERIFY_SUCCEEDED(
+        m_dllSupport.CreateInstance(CLSID_DxcOptimizer, &pOptimizer));
+
+    // Must stay alive as long as passArgs, which only stores a pointer to it.
+    std::wstring instrumentationArg =
+        L"-hlsl-dxil-debug-instrumentation,UAVSize=" + std::to_wstring(UAVSize);
+    std::vector<LPCWSTR> passArgs = {
+        L"-opt-mod-passes", L"-dxil-dbg-value-to-dbg-declare",
+        L"-dxil-annotate-with-virtual-regs", instrumentationArg.c_str()};
+
+    CComPtr<IDxcBlob> pInstrumented;
+    CComPtr<IDxcBlobEncoding> pPassText;
+    VERIFY_SUCCEEDED(pOptimizer->RunOptimizer(
+        dxil, passArgs.data(), passArgs.size(), &pInstrumented, &pPassText));
+
+    // Leave the text empty when the optimizer produced no output.
+    std::string passText;
+    if (pPassText->GetBufferSize() != 0)
+      passText = reinterpret_cast<const char *>(pPassText->GetBufferPointer());
+
+    return {std::move(pInstrumented), {}, Tokenize(passText.c_str(), "\n")};
+  }
+
  CComPtr<IDxcBlob> FindModule(hlsl::DxilFourCC fourCC, IDxcBlob *pSource) {
    const UINT32 BC_C0DE = ((INT32)(INT8)'B' | (INT32)(INT8)'C' << 8 |
                            (INT32)0xDEC0 << 16); // BC0xc0de in big endian
@ -2570,3 +2599,96 @@ void MyMiss(inout MyPayload payload)
  auto compiledLib = Compile(m_dllSupport, source, L"lib_6_6", {});
  RunDxilPIXDXRInvocationsLog(compiledLib);
 }
+
+// Runs the debug pass over a trivial pixel shader with a UAV size of 8 and
+// checks that the pass's text output contains the expected StaticOverflow,
+// InterestingCounterOffset and OverflowThreshold entries.
+TEST_F(PixTest, DebugInstrumentation_TextOutput) {
+
+  const char *source = R"x(
+float4 main() : SV_Target {
+    return float4(0,0,0,0);
+})x";
+
+  auto compiled = Compile(m_dllSupport, source, L"ps_6_0", {});
+  auto output = RunDebugPass(compiled, 8 /*ludicrously low UAV size limit*/);
+  bool foundStaticOverflow = false;
+  bool foundCounterOffset = false;
+  bool foundThreshold = false;
+  for (auto const &line : output.lines) {
+    if (line.find("StaticOverflow:12") != std::string::npos)
+      foundStaticOverflow = true;
+    if (line.find("InterestingCounterOffset:3") != std::string::npos)
+      foundCounterOffset = true;
+    if (line.find("OverflowThreshold:1") != std::string::npos)
+      foundThreshold = true;
+  }
+  VERIFY_IS_TRUE(foundStaticOverflow);
+  // These two flags were previously computed but never checked, so a
+  // regression in either entry would have gone unnoticed.
+  VERIFY_IS_TRUE(foundCounterOffset);
+  VERIFY_IS_TRUE(foundThreshold);
+}
+
+// Compiles a shader with -Od, runs the debug pass, and checks that the
+// "Block#" lines of the pass's text output contain each expected entry.
+TEST_F(PixTest, DebugInstrumentation_BlockReport) {
+
+  const char *source = R"x(
+RWStructuredBuffer<int> UAV: register(u0);
+float4 main() : SV_Target {
+    // basic int variable
+    int v = UAV[0];
+    if(v == 0)
+        UAV[1] = v;
+    else
+        UAV[2] = v;
+    // float with indexed alloca
+    float f[2];
+    f[0] = UAV[4];
+    f[1] = UAV[5];
+    if(v == 2)
+        f[0] = v;
+    else
+        f[1] = v;
+    float farray2[2];
+    farray2[0] = UAV[4];
+    farray2[1] = UAV[5];
+    if(v == 4)
+        farray2[0] = v;
+    else
+        farray2[1] = v;
+    double d = UAV[8];
+    int64_t i64 = UAV[9];
+    return float4(d,i64,0,0);
+})x";
+
+  auto compiled = Compile(m_dllSupport, source, L"ps_6_0", {L"-Od"});
+  auto output = RunDebugPass(compiled);
+
+  // One flag per entry expected somewhere on a "Block#" line.
+  bool sawBlockLine = false;
+  bool sawRet = false;
+  bool sawUnnumberedVoid = false;
+  bool sawInt32Assignment = false;
+  bool sawFloatAssignment = false;
+  bool sawDoubleAssignment = false;
+  bool sawInt64Assignment = false;
+  bool sawInt32AllocaStore = false;
+
+  for (auto const &reportLine : output.lines) {
+    auto const has = [&reportLine](char const *needle) {
+      return reportLine.find(needle) != std::string::npos;
+    };
+
+    // Only block-report lines are of interest.
+    if (!has("Block#"))
+      continue;
+
+    sawBlockLine = true;
+    sawRet |= has("r,0,r;");
+    sawUnnumberedVoid |= has("v,0,v;");
+    sawInt32Assignment |= has("3,3,a;");
+    sawDoubleAssignment |= has("d,13,a;");
+    sawFloatAssignment |= has("f,19,a;");
+    sawInt64Assignment |= has("6,16,a;");
+    sawInt32AllocaStore |= has("3,3,s,2+0;");
+  }
+
+  VERIFY_IS_TRUE(sawBlockLine);
+  VERIFY_IS_TRUE(sawRet);
+  VERIFY_IS_TRUE(sawUnnumberedVoid);
+  VERIFY_IS_TRUE(sawInt32Assignment);
+  VERIFY_IS_TRUE(sawInt64Assignment);
+  VERIFY_IS_TRUE(sawFloatAssignment);
+  VERIFY_IS_TRUE(sawDoubleAssignment);
+  VERIFY_IS_TRUE(sawInt32AllocaStore);
+}