diff --git a/include/dxc/DxilPIXPasses/DxilPIXVirtualRegisters.h b/include/dxc/DxilPIXPasses/DxilPIXVirtualRegisters.h
index 3af17c42e..a06cfe965 100644
--- a/include/dxc/DxilPIXPasses/DxilPIXVirtualRegisters.h
+++ b/include/dxc/DxilPIXPasses/DxilPIXVirtualRegisters.h
@@ -28,7 +28,7 @@ static constexpr uint32_t ID = 3;
 
 void AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI,
            std::uint32_t InstNum);
-bool FromInst(llvm::Instruction *pI, std::uint32_t *pInstNum);
+bool FromInst(llvm::Instruction const *pI, std::uint32_t *pInstNum);
 } // namespace PixDxilInstNum
 
 namespace PixDxilReg {
@@ -36,7 +36,7 @@ static constexpr char MDName[] = "pix-dxil-reg";
 static constexpr uint32_t ID = 0;
 
 void AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI, std::uint32_t RegNum);
-bool FromInst(llvm::Instruction *pI, std::uint32_t *pRegNum);
+bool FromInst(llvm::Instruction const *pI, std::uint32_t *pRegNum);
 } // namespace PixDxilReg
 
 namespace PixAllocaReg {
@@ -45,7 +45,7 @@ static constexpr uint32_t ID = 1;
 
 void AddMD(llvm::LLVMContext &Ctx, llvm::AllocaInst *pAlloca,
            std::uint32_t RegNum, std::uint32_t Count);
-bool FromInst(llvm::AllocaInst *pAlloca, std::uint32_t *pRegBase,
+bool FromInst(llvm::AllocaInst const *pAlloca, std::uint32_t *pRegBase,
               std::uint32_t *pRegSize);
 } // namespace PixAllocaReg
 
diff --git a/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp b/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp
index ea93d9968..c85812f34 100644
--- a/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp
+++ b/lib/DxilPIXPasses/DxilDbgValueToDbgDeclare.cpp
@@ -989,15 +989,6 @@ void DxilDbgValueToDbgDeclare::handleDbgValue(llvm::Module &M,
   }
 }
 
-class ScopedInstruction {
-  llvm::Instruction *m_Instruction;
-
-public:
-  ScopedInstruction(llvm::Instruction *I) : m_Instruction(I) {}
-  ~ScopedInstruction() { delete m_Instruction; }
-  llvm::Instruction *Get() const { return m_Instruction; }
-};
-
 struct GlobalVariableAndStorage {
   llvm::DIGlobalVariable *DIGV;
   OffsetInBits Offset;
diff --git a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
index 1337cdfb3..f705d9192 100644
--- a/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
+++ b/lib/DxilPIXPasses/DxilDebugInstrumentation.cpp
@@ -9,6 +9,7 @@
 //                                                                           //
 ///////////////////////////////////////////////////////////////////////////////
 
+#include <optional>
 #include <vector>
 
 #include "dxc/DXIL/DxilFunctionProps.h"
@@ -34,69 +35,103 @@ using namespace hlsl;
 //
 // In summary, instructions are added that cause a "trace" of the execution of
 // the shader to be written out to a UAV. This trace is then used by a debugger
-// application to provide a post-mortem debugging experience that reconstructs
-// the execution history of the shader.
+// application to provide a postmortem debugging experience that reconstructs
+// the execution history of the shader. The caller specifies the power-of-two
+// size of the UAV.
 //
-// The trace is only required for a particular shader instance of interest, and
+// The instrumentation is added per basic block, and each block will then write
+// a contiguous sequence of values into the UAV.
+//
+// The trace is only required for particular shader instances of interest, and
 // a branchless mechanism is used to write the trace either to an incrementing
-// location within the UAV, or to a "dumping ground" area at the top of the UAV
-// if the instance is not of interest.
+// location within the UAV, or to a "dumping ground" area in the top half of the
+// UAV if the instance is not of interest.
 //
-// The following modifications are made:
+// In addition, each half of the UAV is further subdivided: the first quarter is
+// the area in which blocks are permitted to start writing their sequence, and
+// that sequence is constrained to be no longer than the size of the second
+// quarter. This allows us to limit writes to the appropriate half of the UAV
+// via a single AND at the beginning of the basic block. An additional OR
+// provides the offset, either 0 for threads-of-interest, or UAVSize/2 for
+// not-of-interest.
+//
+// Threads determine where to start writing their data by incrementing a DWORD
+// that lives at the very top of that thread's half of the UAV. This is done
+// because several threads may satisfy the selection criteria (e.g. a pixel
+// shader may be invoked several times for a given pixel coordinate if the model
+// has overlapping triangles).
+//
+// A picture of the UAV layout:
+// <--------------power-of-two-size-of-UAV---------------->
+// [1           ][2           ][3           ][4           ]
+// <------A----->             ^                           ^
+//                            B                           C
+//                            <------D------>
+//
+// A: the size of the AND for interesting writes. Their payloads extend
+// beyond this into area 2, but those payloads are limited to be small
+// enough (1/4 UAV size -1) that they don't overwrite B.
+// B: The interesting thread's counter.
+// C: The uninteresting thread's counter.
+// D: Size of the AND for uninteresting threads (same value as A)
+//
+// The following modifications are made by this pass:
 //
 // First, instructions are added to the top of the entry point function that
 // implement the following:
 // -  Examine the input variables that define the instance of the shader that is
-// running. This will
-//    be SV_Position for pixel shaders, SV_Vertex+SV_Instance for vertex
-//    shaders, thread id for compute shaders etc. If these system values need to
-//    be added to the shader, then they are also added to the input signature,
-//    if appropriate.
+//    running. This will be SV_Position for pixel shaders, SV_Vertex+SV_Instance
+//    for vertex shaders, thread id for compute shaders etc. If these system
+//    values need to be added to the shader, then they are also added to the
+//    input signature, if appropriate.
 // -  Compare the above variables with the instance of interest defined by the
-// invoker of this pass.
-//    Deduce two values: a multiplicand and an addend that together allow a
-//    branchless calculation of the offset into the UAV at which to write via
-//    "offset = offset * multiplicand + addend." If the instance is NOT of
-//    interest, the multiplicand is zero and the addend is sizeof(UAV)-(a little
-//    bit), causing writes for uninteresting invocations to end up at the top of
-//    the UAV. Otherwise the multiplicand is 1 and the addend is 0.
+//    invoker of this pass. If equal, create an OR value of zero that will
+//    not affect the block's starting write offset. If not equal, the OR will
+//    move the writes into the second half of the UAV.
 // -  Calculate an "instance identifier". Even with the above instance
-// identification, several invocations may
-//    end up matching the selection criteria. Specifically, this happens during
-//    a draw call in which many triangles overlap the pixel of interest. More on
-//    this below.
+//    identification, several invocations may end up matching the selection
+//    criteria. More on this below.
 //
-// During execution, the instrumentation for most instructions cause data to be
-// emitted to the UAV. The index at which data is written is identified by
-// treating the first uint32 of the UAV as an index which is atomically
-// incremented by the instrumentation. The very first value of this counter that
+// As mentioned, a counter/offset is maintained at the top of the thread's
+// half of the UAV. The very first value of this counter that
 // is encountered by each invocation is used as the "instance identifier"
 // mentioned above. That instance identifier is written out with each packet,
-// since many pixel shaders executing in parallel will emit interleaved packets,
-// and the debugger application uses the identifiers to group packets from each
+// since many threads executing in parallel will emit interleaved packets,
+// and the debugger application uses the identifiers to gather packets from each
 // separate invocation together.
 //
-// If an instruction has a non-void and primitive return type, i.e. isn't a
-// struct, then the instrumentation will write that value out to the UAV as well
-// as part of the "step" data packet.
+// In addition to the above, this pass creates a text precis of the structure
+// being written out for each basic block. This precis is passed back to the
+// caller, and can be used to parse the UAV output later. The precis will
+// contain notes about void-type instructions, which won't write anything to the
+// UAV, allowing the caller to reconstruct those instructions.
+// Some care has to be taken about whether to emit UAV writes after the
+// corresponding instruction or before. Terminators must emit their UAV data
+// before the terminator itself, of course. Phi instructions get special
+// treatment also: their instrumentation has to come after (since phis must be
+// the first instructions in the block), but also the instrumentation must
+// execute in the same order as the precis specifies, or the caller will mix
+// up the phi values. We achieve this by saying that phi instrumentation must
+// come before the first non-phi instruction in the block.
+// Some blocks will have all-void instructions, so that no debugging
+// data is emitted at all. These blocks still produce a precis, and still
+// need to be noticed during execution, so an empty block header is emitted
+// into the UAV.
 //
-// The limiting size of the UAV is enforced in a branchless way by ANDing the
-// offset with a precomputed value that is sizeof(UAV)-64. The actual size of
-// the UAV allocated by the caller is required to be a power of two plus 64 for
-// this reason. The caller detects UAV overrun by examining a canary value close
-// to the end of the power-of-two size of the UAV. If this value has been
-// overwritten, the debug session is deemed to have overflowed the UAV. The
-// caller will than allocate a UAV that is twice the size and try again, up to a
-// predefined maximum.
-
-// Keep these in sync with the same-named value in the debugger application's
-// WinPixShaderUtils.h
-
-constexpr uint64_t DebugBufferDumpingGroundSize = 64 * 1024;
-// The actual max size per record is much smaller than this, but it never
-// hurts to be generous.
-constexpr size_t CounterOffsetBeyondUsefulData =
-    DebugBufferDumpingGroundSize / 2;
+// Error conditions:
+// Overflow of the debug output from the interesting threads will start to
+// overwrite their own area of the UAV (after the AND limits those writes
+// to the lower half of the UAV (thus, by the way, avoiding overwriting
+// their counter value)). The caller must check the counter value after
+// the debugging run is complete to see if this happened, and if so, increase
+// the UAV size and try again.
+// Uninteresting threads use an OR value that moves their writes to the
+// upper half of the UAV and can be entirely ignored by the caller.
+// Since a sufficiently-large block is guaranteed to overflow the UAV,
+// the precis-creation can exit early and report this "static" overflow
+// condition to the caller.
+// In all overflow cases, the caller is expected to try to instrument again,
+// with a larger UAV.
 
 // These definitions echo those in the debugger application's
 // debugshaderrecord.h file
@@ -110,7 +145,12 @@ enum DebugShaderModifierRecordType {
   DebugShaderModifierRecordTypeRegisterRelativeIndex0,
   DebugShaderModifierRecordTypeRegisterRelativeIndex1,
   DebugShaderModifierRecordTypeRegisterRelativeIndex2,
-  DebugShaderModifierRecordTypeDXILStepTerminator = 250,
+  // Note that everything above this line is no longer used, but is kept
+  // here in order to keep this file more in-sync with the debugger source.
+  // (As of this writing, the debugger still supports older versions of this
+  // pass which produced finer-grained debug packets.)
+  DebugShaderModifierRecordTypeDXILStepBlock = 249,
+  DebugShaderModifierRecordTypeDXILStepRet = 250,
   DebugShaderModifierRecordTypeDXILStepVoid = 251,
   DebugShaderModifierRecordTypeDXILStepFloat = 252,
   DebugShaderModifierRecordTypeDXILStepUint32 = 253,
@@ -150,6 +190,20 @@ struct DebugShaderModifierRecordDXILStepBase {
   uint32_t InstructionOffset;
 };
 
+struct DebugShaderModifierRecordDXILBlock {
+  union {
+    struct {
+      uint32_t NotUsed0 : 4;
+      uint32_t NotUsed1 : 4;
+      uint32_t Type : 8;
+      uint32_t CountOfInstructions : 16;
+    } Details;
+    uint32_t u32Header;
+  } Header;
+  uint32_t UID;
+  uint32_t FirstInstructionOrdinal;
+};
+
 template <typename ReturnType>
 struct DebugShaderModifierRecordDXILStep
     : public DebugShaderModifierRecordDXILStepBase {
@@ -174,6 +228,16 @@ DebugShaderModifierRecordPayloadSizeDwords(size_t recordTotalSizeBytes) {
           sizeof(uint32_t));
 }
 
+struct InstructionAndType {
+  Instruction *Inst;
+  std::uint32_t InstructionOrdinal;
+  DebugShaderModifierRecordType Type;
+  std::uint32_t RegisterNumber;
+  std::uint32_t AllocaBase;
+  Value *AllocaWriteIndex = nullptr;
+  std::optional<uint64_t> ConstantAllocaStoreValue;
+};
+
 class DxilDebugInstrumentation : public ModulePass {
 
 private:
@@ -220,23 +284,19 @@ private:
   uint64_t m_UAVSize = 1024 * 1024;
   struct PerFunctionValues {
     CallInst *UAVHandle = nullptr;
-    Constant *CounterOffset = nullptr;
+    Instruction *CounterOffset = nullptr;
     Value *InvocationId = nullptr;
     // Together these two values allow branchless writing to the UAV. An
     // invocation of the shader is either of interest or not (e.g. it writes to
     // the pixel the user selected for debugging or it doesn't). If not of
     // interest, debugging output will still occur, but it will be relegated to
-    // the very top few bytes of the UAV. Invocations of interest, by contrast,
+    // the top half of the UAV. Invocations of interest, by contrast,
     // will be written to the UAV at sequentially increasing offsets.
-    // This value will either be one or zero (one if the invocation is of
-    // interest, zero otherwise)
-    Value *OffsetMultiplicand = nullptr;
-    // This will either be zero (if the invocation is of interest) or
-    // (UAVSize)-(SmallValue) if not.
-    Value *OffsetAddend = nullptr;
-    Constant *OffsetMask = nullptr;
+    Value *OffsetMask = nullptr;
+    Instruction *OffsetOr = nullptr;
     Value *SelectionCriterion = nullptr;
     Value *CurrentIndex = nullptr;
+    std::vector<BasicBlock *> AddedBlocksToIgnoreForInstrumentation;
   };
   std::map<llvm::Function *, PerFunctionValues> m_FunctionToValues;
 
@@ -275,20 +335,37 @@ private:
                                SystemValueIndices SVIndices);
   Value *addHullhaderProlog(BuilderContext &BC);
   Value *addComparePrimitiveIdProlog(BuilderContext &BC, unsigned SVIndices);
-  void addDebugEntryValue(BuilderContext &BC, Value *TheValue);
+  uint32_t addDebugEntryValue(BuilderContext &BC, Value *TheValue);
   void addInvocationStartMarker(BuilderContext &BC);
+  void determineLimitANDAndInitializeCounter(BuilderContext &BC);
   void reserveDebugEntrySpace(BuilderContext &BC, uint32_t SpaceInDwords);
-  void addStoreStepDebugEntry(BuilderContext &BC, StoreInst *Inst);
-  void addStepDebugEntry(BuilderContext &BC, Instruction *Inst);
-  void addStepDebugEntryValue(BuilderContext &BC, std::uint32_t InstNum,
-                              Value *V, std::uint32_t ValueOrdinal,
-                              Value *ValueOrdinalIndex);
+  std::optional<InstructionAndType> addStoreStepDebugEntry(BuilderContext *BC,
+                                                           StoreInst *Inst);
+  std::optional<InstructionAndType> addStepDebugEntry(BuilderContext *BC,
+                                                      Instruction *Inst);
+  std::optional<DebugShaderModifierRecordType>
+  addStepDebugEntryValue(BuilderContext *BC, std::uint32_t InstNum, Value *V,
+                         std::uint32_t ValueOrdinal, Value *ValueOrdinalIndex);
   uint32_t UAVDumpingGroundOffset();
   template <typename ReturnType>
   void addStepEntryForType(DebugShaderModifierRecordType RecordType,
                            BuilderContext &BC, std::uint32_t InstNum, Value *V,
                            std::uint32_t ValueOrdinal,
                            Value *ValueOrdinalIndex);
+  struct InstructionToInstrument {
+    Value *ValueToWriteToDebugMemory;
+    DebugShaderModifierRecordType ValueType;
+    Instruction *InstructionAfterWhichToAddInstrumentation;
+    Instruction *InstructionBeforeWhichToAddInstrumentation;
+  };
+  struct BlockInstrumentationData {
+    uint32_t FirstInstructionOrdinalInBlock;
+    std::vector<InstructionToInstrument> Instructions;
+  };
+  BlockInstrumentationData FindInstrumentableInstructionsInBlock(BasicBlock &BB,
+                                                                 OP *HlslOP);
+  uint32_t
+  CountBlockPayloadBytes(std::vector<InstructionToInstrument> const &IsAndTs);
 };
 
 void DxilDebugInstrumentation::applyOptions(PassOptions O) {
@@ -302,7 +379,7 @@ void DxilDebugInstrumentation::applyOptions(PassOptions O) {
 }
 
 uint32_t DxilDebugInstrumentation::UAVDumpingGroundOffset() {
-  return static_cast<uint32_t>(m_UAVSize - DebugBufferDumpingGroundSize);
+  return static_cast<uint32_t>(m_UAVSize / 2);
 }
 
 static unsigned FindOrAddInputSignatureElement(
@@ -653,23 +730,87 @@ void DxilDebugInstrumentation::addInvocationSelectionProlog(
     assert(false); // guaranteed by runOnModule
   }
 
-  // This is a convenient place to calculate the values that modify the UAV
-  // offset for invocations of interest and for UAV size.
   auto &values = m_FunctionToValues[BC.Builder.GetInsertBlock()->getParent()];
-  values.OffsetMultiplicand =
-      BC.Builder.CreateCast(Instruction::CastOps::ZExt, ParameterTestResult,
-                            Type::getInt32Ty(BC.Ctx), "OffsetMultiplicand");
-  auto InverseOffsetMultiplicand =
-      BC.Builder.CreateSub(BC.HlslOP->GetU32Const(1), values.OffsetMultiplicand,
-                           "ComplementOfMultiplicand");
-  values.OffsetAddend =
-      BC.Builder.CreateMul(BC.HlslOP->GetU32Const(UAVDumpingGroundOffset()),
-                           InverseOffsetMultiplicand, "OffsetAddend");
-  values.OffsetMask = BC.HlslOP->GetU32Const(UAVDumpingGroundOffset() - 1);
-
   values.SelectionCriterion = ParameterTestResult;
 }
 
+void DxilDebugInstrumentation::determineLimitANDAndInitializeCounter(
+    BuilderContext &BC) {
+
+  auto &values = m_FunctionToValues[BC.Builder.GetInsertBlock()->getParent()];
+
+  // Split the block at the current insertion point. Insert a conditional
+  // branch that will invoke one of two new blocks depending on if this
+  // is a thread-of-interest. The two different classes of thread will
+  // then be given different OR offsets and counter locations via phis
+  // placed after these new blocks (the limiting AND value is shared).
+
+  BasicBlock *RestOfMainBlock = BC.Builder.GetInsertBlock()->splitBasicBlock(
+      *BC.Builder.GetInsertPoint());
+
+  // Up to this split point is a new block that we don't need to instrument:
+  values.AddedBlocksToIgnoreForInstrumentation.push_back(
+      BC.Builder.GetInsertBlock());
+
+  auto *InterestingInvocationBlock = BasicBlock::Create(
+      BC.Ctx, "PIXInterestingBlock", BC.Builder.GetInsertBlock()->getParent(),
+      RestOfMainBlock);
+  values.AddedBlocksToIgnoreForInstrumentation.push_back(
+      InterestingInvocationBlock);
+  IRBuilder<> BuilderForInteresting(InterestingInvocationBlock);
+  BuilderForInteresting.CreateBr(RestOfMainBlock);
+
+  auto *NonInterestingInvocationBlock = BasicBlock::Create(
+      BC.Ctx, "PIXNonInterestingBlock",
+      BC.Builder.GetInsertBlock()->getParent(), RestOfMainBlock);
+  values.AddedBlocksToIgnoreForInstrumentation.push_back(
+      NonInterestingInvocationBlock);
+
+  IRBuilder<> BuilderForNonInteresting(NonInterestingInvocationBlock);
+  BuilderForNonInteresting.CreateBr(RestOfMainBlock);
+
+  // Connect these new blocks as necessary:
+  BC.Builder.SetInsertPoint(BC.Builder.GetInsertBlock()->getTerminator());
+  BC.Builder.CreateCondBr(values.SelectionCriterion, InterestingInvocationBlock,
+                          NonInterestingInvocationBlock);
+  BC.Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
+
+  values.OffsetMask = BC.HlslOP->GetU32Const(m_UAVSize / 4 - 1);
+
+  // Now add a phi that selects between two constant OR values based on
+  // which branch the thread followed above (interesting or not).
+  // The OR will either place the output in the lower half or the upper
+  // half of the UAV.
+  BC.Builder.SetInsertPoint(RestOfMainBlock->getFirstInsertionPt());
+  auto *PHIForOr =
+      BC.Builder.CreatePHI(Type::getInt32Ty(BC.Ctx), 2, "PIXOffsetOr");
+  PHIForOr->addIncoming(BC.HlslOP->GetU32Const(0), InterestingInvocationBlock);
+  PHIForOr->addIncoming(BC.HlslOP->GetU32Const(m_UAVSize / 2),
+                        NonInterestingInvocationBlock);
+  values.OffsetOr = PHIForOr;
+
+  auto *PHIForCounterOffset =
+      BC.Builder.CreatePHI(Type::getInt32Ty(BC.Ctx), 2, "PIXCounterLocation");
+  const uint32_t InterestingCounterOffset =
+      static_cast<uint32_t>(m_UAVSize / 2 - 1);
+  PHIForCounterOffset->addIncoming(
+      BC.HlslOP->GetU32Const(InterestingCounterOffset),
+      InterestingInvocationBlock);
+  const uint32_t UninterestingCounterOffsetValue =
+      static_cast<uint32_t>(m_UAVSize - 1);
+  PHIForCounterOffset->addIncoming(
+      BC.HlslOP->GetU32Const(UninterestingCounterOffsetValue),
+      NonInterestingInvocationBlock);
+  values.CounterOffset = PHIForCounterOffset;
+
+  // These are reported to the caller so there are fewer assumptions made by the
+  // caller about these internal details:
+  *OSOverride << "InterestingCounterOffset:"
+              << std::to_string(InterestingCounterOffset) << "\n";
+  *OSOverride << "OverflowThreshold:" << std::to_string(m_UAVSize / 4 - 1)
+              << "\n";
+}
+
 void DxilDebugInstrumentation::reserveDebugEntrySpace(BuilderContext &BC,
                                                       uint32_t SpaceInBytes) {
   auto &values = m_FunctionToValues[BC.Builder.GetInsertBlock()->getParent()];
@@ -687,11 +828,7 @@ void DxilDebugInstrumentation::reserveDebugEntrySpace(BuilderContext &BC,
       BC.HlslOP->GetU32Const((unsigned)DXIL::AtomicBinOpCode::Add);
   UndefValue *UndefArg = UndefValue::get(Type::getInt32Ty(BC.Ctx));
 
-  // so inc will be zero for uninteresting invocations:
   Constant *Increment = BC.HlslOP->GetU32Const(SpaceInBytes);
-  Value *IncrementForThisInvocation = BC.Builder.CreateMul(
-      Increment, values.OffsetMultiplicand, "IncrementForThisInvocation");
-
   auto PreviousValue = BC.Builder.CreateCall(
       AtomicOpFunc,
       {
@@ -699,10 +836,10 @@ void DxilDebugInstrumentation::reserveDebugEntrySpace(BuilderContext &BC,
           values.UAVHandle, // %dx.types.Handle, ; resource handle
           AtomicAdd, // i32, ; binary operation code : EXCHANGE, IADD, AND, OR,
                      // XOR, IMIN, IMAX, UMIN, UMAX
-          values.CounterOffset,       // i32, ; coordinate c0: index in bytes
-          UndefArg,                   // i32, ; coordinate c1 (unused)
-          UndefArg,                   // i32, ; coordinate c2 (unused)
-          IncrementForThisInvocation, // i32); increment value
+          values.CounterOffset, // i32, ; coordinate c0: index in bytes
+          UndefArg,             // i32, ; coordinate c1 (unused)
+          UndefArg,             // i32, ; coordinate c2 (unused)
+          Increment,            // i32); increment value
       },
       "UAVIncResult");
 
@@ -710,22 +847,18 @@ void DxilDebugInstrumentation::reserveDebugEntrySpace(BuilderContext &BC,
     values.InvocationId = PreviousValue;
   }
 
-  auto MaskedForLimit = BC.Builder.CreateAnd(PreviousValue, values.OffsetMask,
-                                             "MaskedForUAVLimit");
-  // The return value will either end up being itself (multiplied by one and
-  // added with zero) or the "dump uninteresting things here" value of (UAVSize
-  // - a bit).
-  auto MultipliedForInterest = BC.Builder.CreateMul(
-      MaskedForLimit, values.OffsetMultiplicand, "MultipliedForInterest");
-  auto AddedForInterest = BC.Builder.CreateAdd(
-      MultipliedForInterest, values.OffsetAddend, "AddedForInterest");
-  values.CurrentIndex = AddedForInterest;
+  auto *Masked = BC.Builder.CreateAnd(PreviousValue, values.OffsetMask,
+                                      "MaskedForUAVLimit");
+  values.CurrentIndex =
+      BC.Builder.CreateOr(Masked, values.OffsetOr, "ORedForUAVStart");
 }
 
-void DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC,
-                                                  Value *TheValue) {
+uint32_t DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC,
+                                                      Value *TheValue) {
   assert(m_RemainingReservedSpaceInBytes > 0);
 
+  uint32_t BytesToBeEmitted = 0;
+
   auto TheValueTypeID = TheValue->getType()->getTypeID();
   if (TheValueTypeID == Type::TypeID::DoubleTyID) {
     Function *SplitDouble =
@@ -741,6 +874,7 @@ void DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC,
     // addDebugEntryValue(BC, BC.HlslOP->GetU32Const(0)); // padding
     addDebugEntryValue(BC, LowBits);
     addDebugEntryValue(BC, HighBits);
+    BytesToBeEmitted += 8;
   } else if (TheValueTypeID == Type::TypeID::IntegerTyID &&
              TheValue->getType()->getIntegerBitWidth() == 64) {
     auto LowBits =
@@ -751,16 +885,17 @@ void DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC,
     // addDebugEntryValue(BC, BC.HlslOP->GetU32Const(0)); // padding
     addDebugEntryValue(BC, LowBits);
     addDebugEntryValue(BC, HighBits);
+    BytesToBeEmitted += 8;
   } else if (TheValueTypeID == Type::TypeID::IntegerTyID &&
              (TheValue->getType()->getIntegerBitWidth() == 16 ||
               TheValue->getType()->getIntegerBitWidth() == 1)) {
     auto As32 =
         BC.Builder.CreateZExt(TheValue, Type::getInt32Ty(BC.Ctx), "As32");
-    addDebugEntryValue(BC, As32);
+    BytesToBeEmitted += addDebugEntryValue(BC, As32);
   } else if (TheValueTypeID == Type::TypeID::HalfTyID) {
     auto AsFloat =
         BC.Builder.CreateFPCast(TheValue, Type::getFloatTy(BC.Ctx), "AsFloat");
-    addDebugEntryValue(BC, AsFloat);
+    BytesToBeEmitted += addDebugEntryValue(BC, AsFloat);
   } else {
     Function *StoreValue =
         BC.HlslOP->GetOpFunc(OP::OpCode::BufferStore,
@@ -777,6 +912,7 @@ void DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC,
       // The above are the only two valid types for a UAV store
       assert(false);
     }
+    BytesToBeEmitted += 4;
     Constant *WriteMask_X = BC.HlslOP->GetI8Const(1);
 
     auto &values = m_FunctionToValues[BC.Builder.GetInsertBlock()->getParent()];
@@ -792,8 +928,8 @@ void DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC,
                      UndefArg, // unused values
                      WriteMask_X});
 
+    assert(m_RemainingReservedSpaceInBytes >= 4); // check for underflow
     m_RemainingReservedSpaceInBytes -= 4;
-    assert(m_RemainingReservedSpaceInBytes < 1024); // check for underflow
 
     if (m_RemainingReservedSpaceInBytes != 0) {
       values.CurrentIndex =
@@ -802,6 +938,8 @@ void DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC,
       values.CurrentIndex = nullptr;
     }
   }
+
+  return BytesToBeEmitted;
 }
 
 void DxilDebugInstrumentation::addInvocationStartMarker(BuilderContext &BC) {
@@ -834,11 +972,9 @@ void DxilDebugInstrumentation::addStepEntryForType(
   addDebugEntryValue(BC, BC.HlslOP->GetU32Const(step.Header.u32Header));
   addDebugEntryValue(BC, values.InvocationId);
   addDebugEntryValue(BC, BC.HlslOP->GetU32Const(InstNum));
-
   if (RecordType != DebugShaderModifierRecordTypeDXILStepVoid &&
-      RecordType != DebugShaderModifierRecordTypeDXILStepTerminator) {
+      RecordType != DebugShaderModifierRecordTypeDXILStepRet) {
     addDebugEntryValue(BC, V);
-
     IRBuilder<> &B = BC.Builder;
 
     Value *VO = BC.HlslOP->GetU32Const(ValueOrdinal << 16);
@@ -850,99 +986,199 @@ void DxilDebugInstrumentation::addStepEntryForType(
   }
 }
 
-void DxilDebugInstrumentation::addStoreStepDebugEntry(BuilderContext &BC,
-                                                      StoreInst *Inst) {
+std::optional<InstructionAndType>
+DxilDebugInstrumentation::addStoreStepDebugEntry(BuilderContext *BC,
+                                                 StoreInst *Inst) {
   std::uint32_t ValueOrdinalBase;
   std::uint32_t UnusedValueOrdinalSize;
   llvm::Value *ValueOrdinalIndex;
   if (!pix_dxil::PixAllocaRegWrite::FromInst(Inst, &ValueOrdinalBase,
                                              &UnusedValueOrdinalSize,
                                              &ValueOrdinalIndex)) {
-    return;
+    return std::nullopt;
   }
 
   std::uint32_t InstNum;
   if (!pix_dxil::PixDxilInstNum::FromInst(Inst, &InstNum)) {
-    return;
+    return std::nullopt;
   }
 
   if (PIXPassHelpers::IsAllocateRayQueryInstruction(Inst->getValueOperand())) {
-    return;
+    return std::nullopt;
   }
 
-  addStepDebugEntryValue(BC, InstNum, Inst->getValueOperand(), ValueOrdinalBase,
-                         ValueOrdinalIndex);
+  auto Type = addStepDebugEntryValue(BC, InstNum, Inst->getValueOperand(),
+                                     ValueOrdinalBase, ValueOrdinalIndex);
+  if (Type) {
+    if (Instruction *ValueAsInst =
+            dyn_cast<Instruction>(Inst->getValueOperand())) {
+      uint32_t RegNum = 0;
+      if (pix_dxil::PixDxilReg::FromInst(ValueAsInst, &RegNum)) {
+        InstructionAndType ret{};
+        ret.Inst = Inst;
+        ret.InstructionOrdinal = InstNum;
+        ret.Type = *Type;
+        ret.RegisterNumber = RegNum;
+        ret.AllocaBase = ValueOrdinalBase;
+        ret.AllocaWriteIndex = ValueOrdinalIndex;
+        return ret;
+      }
+    } else if (Constant *ValueAsConst =
+                   dyn_cast<Constant>(Inst->getValueOperand())) {
+      InstructionAndType ret{};
+      ret.Inst = Inst;
+      ret.InstructionOrdinal = InstNum;
+      ret.Type = *Type;
+      ret.AllocaBase = ValueOrdinalBase;
+      ret.AllocaWriteIndex = ValueOrdinalIndex;
+
+      switch (ValueAsConst->getType()->getTypeID()) {
+      case Type::HalfTyID:
+      case Type::FloatTyID:
+      case Type::DoubleTyID:
+        ret.ConstantAllocaStoreValue = dyn_cast<ConstantFP>(ValueAsConst)
+                                           ->getValueAPF()
+                                           .bitcastToAPInt()
+                                           .getLimitedValue();
+        break;
+      case Type::IntegerTyID:
+        ret.ConstantAllocaStoreValue =
+            dyn_cast<ConstantInt>(ValueAsConst)->getLimitedValue();
+        break;
+      default:
+        return std::nullopt;
+      }
+      return ret;
+    }
+  }
+  return std::nullopt;
 }
 
-void DxilDebugInstrumentation::addStepDebugEntry(BuilderContext &BC,
-                                                 Instruction *Inst) {
-  if (Inst->getOpcode() == Instruction::OtherOps::PHI) {
-    return;
-  }
+std::optional<InstructionAndType>
+DxilDebugInstrumentation::addStepDebugEntry(BuilderContext *BC,
+                                            Instruction *Inst) {
   if (PIXPassHelpers::IsAllocateRayQueryInstruction(Inst)) {
-    return;
+    return std::nullopt;
   }
 
   if (auto *St = llvm::dyn_cast<llvm::StoreInst>(Inst)) {
-    addStoreStepDebugEntry(BC, St);
-    return;
+    return addStoreStepDebugEntry(BC, St);
   }
 
   std::uint32_t InstNum;
   if (!pix_dxil::PixDxilInstNum::FromInst(Inst, &InstNum)) {
-    return;
+    return std::nullopt;
+  }
+
+  if (auto *Ld = llvm::dyn_cast<llvm::LoadInst>(Inst)) {
+    if (llvm::isa<ConstantExpr>(Ld->getPointerOperand())) {
+      auto *constant = llvm::cast<ConstantExpr>(Ld->getPointerOperand());
+      if (constant->getOpcode() == Instruction::GetElementPtr) {
+        PIXPassHelpers::ScopedInstruction asInstr(constant->getAsInstruction());
+        auto *GEP = llvm::cast<GetElementPtrInst>(asInstr.Get());
+        if (GEP->getPointerOperand()->getName().equals("dx.nothing.a")) {
+          // These debug-only loads are interesting as instructions to
+          // step through where otherwise no step might exist for the
+          // given HLSL lines, so we include them in the instrumentation:
+          InstructionAndType ret{};
+          ret.Inst = Inst;
+          ret.InstructionOrdinal = InstNum;
+          ret.Type = DebugShaderModifierRecordTypeDXILStepVoid;
+          return ret;
+        }
+      }
+    }
   }
 
   std::uint32_t RegNum;
   if (!pix_dxil::PixDxilReg::FromInst(Inst, &RegNum)) {
-    if (Inst->getOpcode() == Instruction::Ret)
-      addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepTerminator,
-                                BC, InstNum, nullptr, 0, 0);
-    return;
+    if (Inst->getOpcode() == Instruction::Ret) {
+      if (BC != nullptr)
+        addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepRet, *BC,
+                                  InstNum, nullptr, 0, 0);
+      InstructionAndType ret{};
+      ret.Inst = Inst;
+      ret.InstructionOrdinal = InstNum;
+      ret.Type = DebugShaderModifierRecordTypeDXILStepRet;
+      return ret;
+    } else if (Inst->isTerminator()) {
+      if (BC != nullptr)
+        addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepVoid,
+                                  *BC, InstNum, nullptr, 0, 0);
+      InstructionAndType ret{};
+      ret.Inst = Inst;
+      ret.InstructionOrdinal = InstNum;
+      ret.Type = DebugShaderModifierRecordTypeDXILStepVoid;
+      return ret;
+    }
+    return std::nullopt;
   }
-  addStepDebugEntryValue(BC, InstNum, Inst, RegNum, BC.Builder.getInt32(0));
+  auto Type = addStepDebugEntryValue(BC, InstNum, Inst, RegNum,
+                                     BC ? BC->Builder.getInt32(0) : nullptr);
+  if (Type) {
+    InstructionAndType ret{};
+    ret.Inst = Inst;
+    ret.InstructionOrdinal = InstNum;
+    ret.Type = *Type;
+    ret.RegisterNumber = RegNum;
+    return ret;
+  }
+  return std::nullopt;
 }
 
-void DxilDebugInstrumentation::addStepDebugEntryValue(
-    BuilderContext &BC, std::uint32_t InstNum, Value *V,
-    std::uint32_t ValueOrdinal, Value *ValueOrdinalIndex) {
+std::optional<DebugShaderModifierRecordType>
+DxilDebugInstrumentation::addStepDebugEntryValue(BuilderContext *BC,
+                                                 std::uint32_t InstNum,
+                                                 Value *V,
+                                                 std::uint32_t ValueOrdinal,
+                                                 Value *ValueOrdinalIndex) {
   const Type::TypeID ID = V->getType()->getTypeID();
 
   switch (ID) {
   case Type::TypeID::StructTyID:
   case Type::TypeID::VoidTyID:
-    addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepVoid, BC,
-                              InstNum, V, ValueOrdinal, ValueOrdinalIndex);
-    break;
+    if (BC != nullptr)
+      addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepVoid, *BC,
+                                InstNum, V, ValueOrdinal, ValueOrdinalIndex);
+    return DebugShaderModifierRecordTypeDXILStepVoid;
   case Type::TypeID::FloatTyID:
-    addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat, BC,
-                               InstNum, V, ValueOrdinal, ValueOrdinalIndex);
-    break;
+    if (BC != nullptr)
+      addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat,
+                                 *BC, InstNum, V, ValueOrdinal,
+                                 ValueOrdinalIndex);
+    return DebugShaderModifierRecordTypeDXILStepFloat;
   case Type::TypeID::IntegerTyID:
     if (V->getType()->getIntegerBitWidth() == 64) {
-      addStepEntryForType<uint64_t>(DebugShaderModifierRecordTypeDXILStepUint64,
-                                    BC, InstNum, V, ValueOrdinal,
-                                    ValueOrdinalIndex);
+      if (BC != nullptr)
+        addStepEntryForType<uint64_t>(
+            DebugShaderModifierRecordTypeDXILStepUint64, *BC, InstNum, V,
+            ValueOrdinal, ValueOrdinalIndex);
+      return DebugShaderModifierRecordTypeDXILStepUint64;
     } else {
-      addStepEntryForType<uint32_t>(DebugShaderModifierRecordTypeDXILStepUint32,
-                                    BC, InstNum, V, ValueOrdinal,
-                                    ValueOrdinalIndex);
+      if (BC != nullptr)
+        addStepEntryForType<uint32_t>(
+            DebugShaderModifierRecordTypeDXILStepUint32, *BC, InstNum, V,
+            ValueOrdinal, ValueOrdinalIndex);
+      return DebugShaderModifierRecordTypeDXILStepUint32;
     }
-    break;
   case Type::TypeID::DoubleTyID:
-    addStepEntryForType<double>(DebugShaderModifierRecordTypeDXILStepDouble, BC,
-                                InstNum, V, ValueOrdinal, ValueOrdinalIndex);
-    break;
+    if (BC != nullptr)
+      addStepEntryForType<double>(DebugShaderModifierRecordTypeDXILStepDouble,
+                                  *BC, InstNum, V, ValueOrdinal,
+                                  ValueOrdinalIndex);
+    return DebugShaderModifierRecordTypeDXILStepDouble;
   case Type::TypeID::HalfTyID:
-    addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat, BC,
-                               InstNum, V, ValueOrdinal, ValueOrdinalIndex);
-    break;
+    if (BC != nullptr)
+      addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat,
+                                 *BC, InstNum, V, ValueOrdinal,
+                                 ValueOrdinalIndex);
+    return DebugShaderModifierRecordTypeDXILStepFloat;
   case Type::TypeID::PointerTyID:
     // Skip pointer calculation instructions. They aren't particularly
     // meaningful to the user (being a mere implementation detail for lookup
-    // tables, etc.), and their type is problematic from a UI point of view. The
-    // subsequent instructions that dereference the pointer will be properly
-    // instrumented and show the (meaningful) retrieved value.
+    // tables, etc.), and their type is problematic from a UI point of view.
+    // The subsequent instructions that dereference the pointer will be
+    // properly instrumented and show the (meaningful) retrieved value.
     break;
   case Type::TypeID::VectorTyID:
     // Shows up in "insertelement" in raygen shader?
@@ -957,11 +1193,16 @@ void DxilDebugInstrumentation::addStepDebugEntryValue(
   case Type::TypeID::PPC_FP128TyID:
     assert(false);
   }
+  return std::nullopt;
 }
 
 bool DxilDebugInstrumentation::runOnModule(Module &M) {
   DxilModule &DM = M.GetOrCreateDxilModule();
 
+  // There is no point running this pass if it can't return its report:
+  if (OSOverride == nullptr)
+    return false;
+
   auto ShaderModel = DM.GetShaderModel();
   auto shaderKind = ShaderModel->GetKind();
 
@@ -981,10 +1222,159 @@ bool DxilDebugInstrumentation::runOnModule(Module &M) {
   return modified;
 }
 
+struct RecordTypeDatum {
+  DebugShaderModifierRecordType Type;
+  uint32_t PayloadSize;
+  const char *AsString;
+};
+
+static const RecordTypeDatum RecordTypeData[] = {
+    {DebugShaderModifierRecordTypeDXILStepRet, 0, "r"},
+    {DebugShaderModifierRecordTypeDXILStepVoid, 0, "v"},
+    {DebugShaderModifierRecordTypeDXILStepFloat, 4, "f"},
+    {DebugShaderModifierRecordTypeDXILStepUint32, 4, "3"},
+    {DebugShaderModifierRecordTypeDXILStepUint64, 8, "6"},
+    {DebugShaderModifierRecordTypeDXILStepDouble, 8, "d"}};
+
+std::optional<RecordTypeDatum const *>
+FindDatum(DebugShaderModifierRecordType RecordType) {
+  for (auto const &datum : RecordTypeData) {
+    if (datum.Type == RecordType) {
+      return &datum;
+    }
+  }
+  return std::nullopt;
+}
+
+uint32_t DxilDebugInstrumentation::CountBlockPayloadBytes(
+    std::vector<InstructionToInstrument> const &IsAndTs) {
+  uint32_t count = 0;
+  for (auto const &IandT : IsAndTs) {
+    auto datum = FindDatum(IandT.ValueType);
+    if (datum)
+      count += (*datum)->PayloadSize;
+  }
+  return count;
+}
+
+const char *TypeString(InstructionAndType const &IandT) {
+  auto datum = FindDatum(IandT.Type);
+  if (datum)
+    return (*datum)->AsString;
+  assert(false);
+  return "v";
+}
+
+Instruction *FindFirstNonPhiInstruction(Instruction *I) {
+  while (llvm::isa<llvm::PHINode>(I))
+    I = I->getNextNode();
+  return I;
+}
+
+// This function reports a textual representation of the format
+// of the debug data that will be output by the instructions
+// added by this pass.
+// The string has one or more lines of the exemplary form
+//      Block#3:5,f,22,a;7,f,22,s,20;9,f,22,s,20;10,f,23,a;12,f,23,s,21;
+// The integer after the Block# is the first instruction number in the
+// block.
+// Instructions are delimited by ; The fields within the instruction
+// (delimited by ,) are, in order:
+// -instruction ordinal
+// -data type (r=ret, v=void, f=float, 3=int32, 6=int64, d=double)
+// -scalar register number
+// -alloca/scalar indicator:
+// r == ret instruction
+// a == scalar is being created and assigned a value, and that
+//      value is in the debug output.
+// s == Existing scalar is being assigned via static alloca index.
+//      Index is appended to this instruction record. No
+//      corresponding data in the debug output.
+// d == A dynamic index added to the static base index. Base index
+//      is appended to this record. The corresponding debug entry is
+//      the dynamic index into that alloca.
+// v == A void terminator or other void-valued instruction. No
+//      corresponding data in the debug output.
+// If indicator is "s", a string of the form base+index follows, giving the
+// alloca store location. If indicator is "d", a single integer denoting the
+// base for the alloca store follows. If the stored value is a constant, that
+// constant is appended last.
+DxilDebugInstrumentation::BlockInstrumentationData
+DxilDebugInstrumentation::FindInstrumentableInstructionsInBlock(BasicBlock &BB,
+                                                                OP *HlslOP) {
+  BlockInstrumentationData ret{};
+  auto &Is = BB.getInstList();
+  *OSOverride << "Block#";
+  bool FoundFirstInstruction = false;
+  for (auto &Inst : Is) {
+    if (!FoundFirstInstruction) {
+      std::uint32_t InstNum;
+      if (pix_dxil::PixDxilInstNum::FromInst(&Inst, &InstNum)) {
+        *OSOverride << std::to_string(InstNum) << ":";
+        ret.FirstInstructionOrdinalInBlock = InstNum;
+        FoundFirstInstruction = true;
+      }
+    }
+    auto IandT = addStepDebugEntry(nullptr, &Inst);
+    if (IandT) {
+      InstructionToInstrument DebugOutputForThisInstruction{};
+      DebugOutputForThisInstruction.ValueType = IandT->Type;
+      auto *InsertionPoint = FindFirstNonPhiInstruction(&Inst);
+      if (InsertionPoint->isTerminator() || llvm::isa<llvm::PHINode>(Inst))
+        DebugOutputForThisInstruction
+            .InstructionBeforeWhichToAddInstrumentation = InsertionPoint;
+      else
+        DebugOutputForThisInstruction
+            .InstructionAfterWhichToAddInstrumentation = InsertionPoint;
+
+      const char *IndexingToken = nullptr;
+      std::optional<std::string> RegisterOrStaticIndex;
+      if (IandT->Type == DebugShaderModifierRecordTypeDXILStepRet) {
+        IndexingToken = "r";
+      } else if (IandT->Type == DebugShaderModifierRecordTypeDXILStepVoid) {
+        IndexingToken = "v"; // void instruction, no debug output required
+      } else if (IandT->AllocaWriteIndex != nullptr) {
+        if (ConstantInt *IndexAsConstant =
+                dyn_cast<ConstantInt>(IandT->AllocaWriteIndex)) {
+          RegisterOrStaticIndex =
+              std::to_string(IandT->AllocaBase) + "+" +
+              std::to_string(IndexAsConstant->getLimitedValue());
+          IndexingToken = "s"; // static indexing, no debug output required
+        } else {
+          IndexingToken = "d"; // dynamic indexing
+          RegisterOrStaticIndex = std::to_string(IandT->AllocaBase);
+          DebugOutputForThisInstruction.ValueToWriteToDebugMemory =
+              IandT->AllocaWriteIndex;
+        }
+      } else {
+        IndexingToken = "a"; // meaning an SSA assignment
+        // todo: Can SSA Values be assigned a literal constant?
+        DebugOutputForThisInstruction.ValueToWriteToDebugMemory = IandT->Inst;
+      }
+
+      *OSOverride << std::to_string(IandT->InstructionOrdinal) << ","
+                  << TypeString(*IandT) << ","
+                  << std::to_string(IandT->RegisterNumber) << ","
+                  << IndexingToken;
+      if (RegisterOrStaticIndex) {
+        *OSOverride << "," << *RegisterOrStaticIndex;
+      }
+      if (IandT->ConstantAllocaStoreValue) {
+        *OSOverride << "," << std::to_string(*IandT->ConstantAllocaStoreValue);
+      }
+      *OSOverride << ";";
+      if (DebugOutputForThisInstruction.ValueToWriteToDebugMemory)
+        ret.Instructions.push_back(std::move(DebugOutputForThisInstruction));
+    }
+  }
+  *OSOverride << "\n";
+  return ret;
+}
+
 bool DxilDebugInstrumentation::RunOnFunction(Module &M, DxilModule &DM,
-                                             llvm::Function *entryFunction) {
+                                             llvm::Function *function) {
   DXIL::ShaderKind shaderKind =
-      PIXPassHelpers::GetFunctionShaderKind(DM, entryFunction);
+      PIXPassHelpers::GetFunctionShaderKind(DM, function);
 
   switch (shaderKind) {
   case DXIL::ShaderKind::Amplification:
@@ -1005,34 +1395,7 @@ bool DxilDebugInstrumentation::RunOnFunction(Module &M, DxilModule &DM,
     return false;
   }
 
-  // First record pointers to all instructions in the function:
-  std::vector<Instruction *> AllInstructions;
-  for (inst_iterator I = inst_begin(entryFunction), E = inst_end(entryFunction);
-       I != E; ++I) {
-    std::uint32_t InstructionNumber;
-    if (pix_dxil::PixDxilInstNum::FromInst(&*I, &InstructionNumber)) {
-      if (InstructionNumber < m_FirstInstruction ||
-          InstructionNumber >= m_LastInstruction)
-        continue;
-      AllInstructions.push_back(&*I);
-    }
-  }
-
-  // Branchless instrumentation requires taking care of a few things:
-  // -Each invocation of the shader will be either of interest or not of
-  // interest
-  //    -If of interest, the offset into the output UAV will be as expected
-  //    -If not, the offset is forced to (UAVsize) - (Small Amount), and that
-  //    output is ignored by the CPU-side code.
-  // -The invocation of interest may overflow the UAV. This is handled by taking
-  // the modulus of the
-  //  output index. Overflow is then detected on the CPU side by checking for
-  //  the presence of a canary value at (UAVSize) - (Small Amount) * 2 (which is
-  //  actually a conservative definition of overflow).
-  //
-
-  Instruction *firstInsertionPt =
-      dxilutil::FirstNonAllocaInsertionPt(entryFunction);
+  Instruction *firstInsertionPt = dxilutil::FirstNonAllocaInsertionPt(function);
   IRBuilder<> Builder(firstInsertionPt);
 
   LLVMContext &Ctx = M.getContext();
@@ -1061,100 +1424,81 @@ bool DxilDebugInstrumentation::RunOnFunction(Module &M, DxilModule &DM,
 
   values.UAVHandle = PIXPassHelpers::CreateUAV(DM, Builder, UAVRegisterId,
                                                "PIX_DebugUAV_Handle");
-  values.CounterOffset = BC.HlslOP->GetU32Const(UAVDumpingGroundOffset() +
-                                                CounterOffsetBeyondUsefulData);
 
   auto SystemValues = addRequiredSystemValues(BC, shaderKind);
   addInvocationSelectionProlog(BC, SystemValues, shaderKind);
+  determineLimitANDAndInitializeCounter(BC);
   addInvocationStartMarker(BC);
 
-  // Explicitly name new blocks in order to provide stable names for testing
-  // purposes
-  int NewBlockCounter = 0;
+  // Instrument original instructions:
+  for (auto &BB : function->getBasicBlockList()) {
+    if (std::find(values.AddedBlocksToIgnoreForInstrumentation.begin(),
+                  values.AddedBlocksToIgnoreForInstrumentation.end(),
+                  &BB) == values.AddedBlocksToIgnoreForInstrumentation.end()) {
+      auto BlockInstrumentation =
+          FindInstrumentableInstructionsInBlock(BB, BC.HlslOP);
+      if (BlockInstrumentation.FirstInstructionOrdinalInBlock <
+              m_FirstInstruction ||
+          BlockInstrumentation.FirstInstructionOrdinalInBlock >=
+              m_LastInstruction)
+        continue;
+      uint32_t BlockPayloadBytes =
+          CountBlockPayloadBytes(BlockInstrumentation.Instructions);
+      // If the block has no instructions which require debug output,
+      // we will still write an empty block header at the end of that
+      // block (i.e. before the terminator) so that the instrumentation
+      // at least indicates that flow control went through the block.
+      Instruction *BlockInstrumentationStart = (BB).getTerminator();
+      if (!BlockInstrumentation.Instructions.empty()) {
+        auto const &First = BlockInstrumentation.Instructions[0];
+        if (First.InstructionAfterWhichToAddInstrumentation != nullptr)
+          BlockInstrumentationStart =
+              First.InstructionAfterWhichToAddInstrumentation;
+        else if (First.InstructionBeforeWhichToAddInstrumentation != nullptr)
+          BlockInstrumentationStart =
+              First.InstructionBeforeWhichToAddInstrumentation;
+        else {
+          assert(false);
+          continue;
+        }
+      }
+      IRBuilder<> Builder(BlockInstrumentationStart);
+      BuilderContext BCForBlock{BC.M, BC.DM, BC.Ctx, BC.HlslOP, Builder};
 
-  auto &Blocks = entryFunction->getBasicBlockList();
-  for (auto &CurrentBlock : Blocks) {
-    struct ValueAndPhi {
-      Value *Val;
-      PHINode *Phi;
-      unsigned Index;
-    };
-
-    std::map<BasicBlock *, std::vector<ValueAndPhi>> InsertableEdges;
-    auto &Is = CurrentBlock.getInstList();
-    for (auto &Inst : Is) {
-      if (Inst.getOpcode() != Instruction::OtherOps::PHI) {
+      DebugShaderModifierRecordDXILBlock step = {};
+      auto FullRecordSize =
+          static_cast<uint32_t>(sizeof(step) + BlockPayloadBytes);
+      if (FullRecordSize >= (m_UAVSize / 4) - 1) {
+        *OSOverride << "StaticOverflow:" << std::to_string(FullRecordSize)
+                    << "\n";
         break;
       }
-      PHINode &PN = llvm::cast<PHINode>(Inst);
-      for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
-        BasicBlock *PhiBB = PN.getIncomingBlock(i);
-        Value *PhiVal = PN.getIncomingValue(i);
-        InsertableEdges[PhiBB].push_back({PhiVal, &PN, i});
-      }
-    }
-
-    for (auto &InsertableEdge : InsertableEdges) {
-      auto *NewBlock = BasicBlock::Create(
-          Ctx, "PIXDebug" + std::to_string(NewBlockCounter++),
-          InsertableEdge.first->getParent());
-      IRBuilder<> Builder(NewBlock);
-
-      auto *PreviousBlock = InsertableEdge.first;
-
-      // Modify all successor operands of the terminator in the previous block
-      // that match the current block to point to the new block:
-      TerminatorInst *terminator = PreviousBlock->getTerminator();
-      unsigned NumSuccessors = terminator->getNumSuccessors();
-      for (unsigned SuccessorIndex = 0; SuccessorIndex < NumSuccessors;
-           ++SuccessorIndex) {
-        auto *CurrentSuccessor = terminator->getSuccessor(SuccessorIndex);
-        if (CurrentSuccessor == &CurrentBlock) {
-          terminator->setSuccessor(SuccessorIndex, NewBlock);
-        }
-      }
-
-      // Modify the Phis and add debug instrumentation
-      for (auto &ValueNPhi : InsertableEdge.second) {
-        // Modify the phi to refer to the new block:
-        ValueNPhi.Phi->setIncomingBlock(ValueNPhi.Index, NewBlock);
-
-        // Add instrumentation to the new block
-        std::uint32_t RegNum;
-        if (!pix_dxil::PixDxilReg::FromInst(ValueNPhi.Phi, &RegNum)) {
+      reserveDebugEntrySpace(BCForBlock, FullRecordSize);
+      step.Header.Details.CountOfInstructions =
+          static_cast<uint16_t>(BlockInstrumentation.Instructions.size());
+      step.Header.Details.Type =
+          static_cast<uint8_t>(DebugShaderModifierRecordTypeDXILStepBlock);
+      addDebugEntryValue(BCForBlock,
+                         BCForBlock.HlslOP->GetU32Const(step.Header.u32Header));
+      addDebugEntryValue(BCForBlock, values.InvocationId);
+      addDebugEntryValue(
+          BCForBlock, BCForBlock.HlslOP->GetU32Const(
+                          BlockInstrumentation.FirstInstructionOrdinalInBlock));
+      for (auto &Inst : BlockInstrumentation.Instructions) {
+        Instruction *BuilderInstruction;
+        if (Inst.InstructionAfterWhichToAddInstrumentation != nullptr)
+          BuilderInstruction =
+              Inst.InstructionAfterWhichToAddInstrumentation->getNextNode();
+        else if (Inst.InstructionBeforeWhichToAddInstrumentation != nullptr)
+          BuilderInstruction = Inst.InstructionBeforeWhichToAddInstrumentation;
+        else {
+          assert(false);
           continue;
         }
-
-        std::uint32_t InstNum;
-        if (!pix_dxil::PixDxilInstNum::FromInst(ValueNPhi.Phi, &InstNum)) {
-          continue;
-        }
-        if (InstNum < m_FirstInstruction || InstNum >= m_LastInstruction)
-          continue;
-
-        BuilderContext BC{M, DM, Ctx, HlslOP, Builder};
-        addStepDebugEntryValue(BC, InstNum, ValueNPhi.Val, RegNum,
-                               BC.Builder.getInt32(0));
+        IRBuilder<> Builder(BuilderInstruction);
+        BuilderContext BC2{BC.M, BC.DM, BC.Ctx, BC.HlslOP, Builder};
+        addDebugEntryValue(BC2, Inst.ValueToWriteToDebugMemory);
       }
-
-      // Add a branch to the new block to point to the current block
-      Builder.CreateBr(&CurrentBlock);
-    }
-  }
-
-  // Instrument original instructions:
-  for (auto &Inst : AllInstructions) {
-    // Instrumentation goes after the instruction if it is not a terminator.
-    // Otherwise, Instrumentation goes prior to the instruction.
-    if (!Inst->isTerminator()) {
-      IRBuilder<> Builder(Inst->getNextNode());
-      BuilderContext BC2{BC.M, BC.DM, BC.Ctx, BC.HlslOP, Builder};
-      addStepDebugEntry(BC2, Inst);
-    } else {
-      // Insert before this instruction
-      IRBuilder<> Builder(Inst);
-      BuilderContext BC2{BC.M, BC.DM, BC.Ctx, BC.HlslOP, Builder};
-      addStepDebugEntry(BC2, Inst);
     }
   }
 
diff --git a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
index 7fd4bb152..f68e2082b 100644
--- a/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
+++ b/lib/DxilPIXPasses/DxilPIXVirtualRegisters.cpp
@@ -33,7 +33,7 @@ void pix_dxil::PixDxilInstNum::AddMD(llvm::LLVMContext &Ctx,
                          llvm::ConstantAsMetadata::get(B.getInt32(InstNum))}));
 }
 
-bool pix_dxil::PixDxilInstNum::FromInst(llvm::Instruction *pI,
+bool pix_dxil::PixDxilInstNum::FromInst(llvm::Instruction const *pI,
                                         std::uint32_t *pInstNum) {
   *pInstNum = 0;
 
@@ -73,7 +73,7 @@ void pix_dxil::PixDxilReg::AddMD(llvm::LLVMContext &Ctx, llvm::Instruction *pI,
                          llvm::ConstantAsMetadata::get(B.getInt32(RegNum))}));
 }
 
-bool pix_dxil::PixDxilReg::FromInst(llvm::Instruction *pI,
+bool pix_dxil::PixDxilReg::FromInst(llvm::Instruction const *pI,
                                     std::uint32_t *pRegNum) {
   *pRegNum = 0;
 
@@ -141,7 +141,7 @@ void pix_dxil::PixAllocaReg::AddMD(llvm::LLVMContext &Ctx,
                          llvm::ConstantAsMetadata::get(B.getInt32(Count))}));
 }
 
-bool pix_dxil::PixAllocaReg::FromInst(llvm::AllocaInst *pAlloca,
+bool pix_dxil::PixAllocaReg::FromInst(llvm::AllocaInst const *pAlloca,
                                       std::uint32_t *pRegBase,
                                       std::uint32_t *pRegSize) {
   *pRegBase = 0;
diff --git a/lib/DxilPIXPasses/PixPassHelpers.cpp b/lib/DxilPIXPasses/PixPassHelpers.cpp
index a093b2fc2..592aca921 100644
--- a/lib/DxilPIXPasses/PixPassHelpers.cpp
+++ b/lib/DxilPIXPasses/PixPassHelpers.cpp
@@ -37,9 +37,10 @@ using namespace llvm;
 using namespace hlsl;
 
 namespace PIXPassHelpers {
-bool IsAllocateRayQueryInstruction(llvm::Value *Val) {
+bool IsAllocateRayQueryInstruction(llvm::Value const *Val) {
   if (Val != nullptr) {
-    if (llvm::Instruction *Inst = llvm::dyn_cast<llvm::Instruction>(Val)) {
+    if (llvm::Instruction const *Inst =
+            llvm::dyn_cast<llvm::Instruction>(Val)) {
       return hlsl::OP::IsDxilOpFuncCallInst(Inst,
                                             hlsl::OP::OpCode::AllocateRayQuery);
     }
diff --git a/lib/DxilPIXPasses/PixPassHelpers.h b/lib/DxilPIXPasses/PixPassHelpers.h
index 7d94a3142..406bc086d 100644
--- a/lib/DxilPIXPasses/PixPassHelpers.h
+++ b/lib/DxilPIXPasses/PixPassHelpers.h
@@ -20,7 +20,17 @@
 #endif
 
 namespace PIXPassHelpers {
-bool IsAllocateRayQueryInstruction(llvm::Value *Val);
+
+class ScopedInstruction {
+  llvm::Instruction *m_Instruction;
+
+public:
+  ScopedInstruction(llvm::Instruction *I) : m_Instruction(I) {}
+  ~ScopedInstruction() { delete m_Instruction; }
+  llvm::Instruction *Get() const { return m_Instruction; }
+};
+
+bool IsAllocateRayQueryInstruction(llvm::Value const *Val);
 llvm::CallInst *CreateUAV(hlsl::DxilModule &DM, llvm::IRBuilder<> &Builder,
                           unsigned int registerId, const char *name);
 llvm::CallInst *CreateHandleForResource(hlsl::DxilModule &DM,
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugBasic.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugBasic.hlsl
index 8c29a59ef..664778a4c 100644
--- a/tools/clang/test/HLSLFileCheck/pix/DebugBasic.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugBasic.hlsl
@@ -1,4 +1,4 @@
-// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation | %FileCheck %s
+// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,UAVSize=128 | %FileCheck %s
 
 // Check that the basic starting header is present:
 
@@ -10,17 +10,18 @@
 // CHECK: %CompareToX = icmp eq i32 %XIndex, 0
 // CHECK: %CompareToY = icmp eq i32 %YIndex, 0
 // CHECK: %ComparePos = and i1 %CompareToX, %CompareToY
-// CHECK: %OffsetMultiplicand = zext i1 %ComparePos to i32
-// CHECK: %ComplementOfMultiplicand = sub i32 1, %OffsetMultiplicand
-// CHECK: %OffsetAddend = mul i32 983040, %ComplementOfMultiplicand
-// CHECK: %IncrementForThisInvocation = mul i32 8, %OffsetMultiplicand
 
-// Check the first instruction was instrumented:
-// CHECK: %UAVIncResult = call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0
-// CHECK: %MaskedForUAVLimit = and i32 %UAVIncResult, 983039
-// CHECK: %MultipliedForInterest = mul i32 %MaskedForUAVLimit, %OffsetMultiplicand
-// CHECK: %AddedForInterest = add i32 %MultipliedForInterest, %OffsetAddend
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 %AddedForInterest
+
+// Check for branches-for-interest and AND value and counter location for a UAV size of 128
+// CHECK: br i1 %ComparePos, label %PIXInterestingBlock, label %PIXNonInterestingBlock
+// CHECK: %PIXOffsetOr = phi i32 [ 0, %PIXInterestingBlock ], [ 64, %PIXNonInterestingBlock ]
+// CHECK: %PIXCounterLocation = phi i32 [ 63, %PIXInterestingBlock ], [ 127, %PIXNonInterestingBlock ]
+
+// Check the first block header was emitted: (increment, AND + OR)
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0
+// CHECK: and i32 
+// CHECK: or i32
+
 
 
 [RootSignature("")]
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugFlowControl.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugFlowControl.hlsl
index 38b2d018e..1970404dd 100644
--- a/tools/clang/test/HLSLFileCheck/pix/DebugFlowControl.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugFlowControl.hlsl
@@ -2,15 +2,31 @@
 
 // Check that flow control constructs don't break the instrumentation.
 
-// CHECK:  %UAVIncResult2 = call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0
+// CHECK:  call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0
 
-// CHECK:  %MaskedForUAVLimit3 = and i32 %UAVIncResult2, 983039
+// There should be several blocks that have instrumentation:
 
-// CHECK:  %MultipliedForInterest4 = mul i32 %MaskedForUAVLimit3, %OffsetMultiplicand
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
 
-// CHECK:  %AddedForInterest5 = add i32 %MultipliedForInterest4, %OffsetAddend
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+
+// CHECK: ; preds =
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
 
-// CHECK:  call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 %AddedForInterest5
 
 
 struct VS_OUTPUT_ENV {
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugInstrumentRet.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugInstrumentRet.hlsl
deleted file mode 100644
index 536d5b176..000000000
--- a/tools/clang/test/HLSLFileCheck/pix/DebugInstrumentRet.hlsl
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: %dxc -T ps_6_3 %s | %opt -S -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation,parameter0=10,parameter1=20,parameter2=30 | %FileCheck %s
-
-
-
-
-// The ret's instruction number should be 4 (the last integer on this line):
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 {{.*}}, i32 undef, i32 4
-// But we'll check that instruction number:
-// CHECK: ret void, !pix-dxil-inst-num [[RetInstNum:![0-9]+]]
-// CHECK-DAG: [[RetInstNum]] = !{i32 3, i32 4}
-
-
-float4 main() : SV_Target {
-  return float4(0, 0, 0, 0);
-}
\ No newline at end of file
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugLimitedInstructionOverrides.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugLimitedInstructionOverrides.hlsl
index 037049b62..7ab604134 100644
--- a/tools/clang/test/HLSLFileCheck/pix/DebugLimitedInstructionOverrides.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugLimitedInstructionOverrides.hlsl
@@ -1,19 +1,19 @@
 // The PIX debug instrumentation pass takes optional arguments that limit the range of instruction numbers that will be instrumented.
 // (This is to cope with extremely large shaders, the instrumentation of which will break, either by out-of-memory or by TDRing when run.)
 
-// RUN: %dxc -EFlowControlPS -Tps_6_0 %s | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation,FirstInstruction=6,LastInstruction=9 | %FileCheck %s
+// RUN: %dxc -EFlowControlPS -Tps_6_0 %s | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation,FirstInstruction=4,LastInstruction=20 | %FileCheck %s
 
-// The only instrumented instructions should have instruction numbers in the range [6,9):
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, {{.*}}, i32 undef, i32 6
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, {{.*}}, i32 undef, i32 7
-// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, {{.*}}, i32 undef, i32 8
+// The only instrumented blocks should have instruction numbers in the range [4,20):
 
-// Two more stores to finish off the instrumentation for instruction #8:
-// CHECK: call void @dx.op.bufferStore.f32
-// CHECK: call void @dx.op.bufferStore.i32
+// Skip over the preamble
+// CHECK: switch i32
+// 
+// Now there should be exactly two more instrumented blocks (two increments of the counter UAV entry)
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
+// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
 
-// Then no more instrumentation at all:
-// CHECK-NOT: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle
+// Then no more instrumentation at all (i.e. no more increments of the counter UAV entry):
+// CHECK-NOT: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle
 
 struct VS_OUTPUT_ENV {
   float4 Pos : SV_Position;
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugPhisFor.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugPhisFor.hlsl
deleted file mode 100644
index eafb5de7b..000000000
--- a/tools/clang/test/HLSLFileCheck/pix/DebugPhisFor.hlsl
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: %dxc -EForLoopPS -Tps_6_0 %s -Od | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation | %FileCheck %s
-
-// Ensure that the pass added at the begining of the for body:
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-
-// Followed by lots of new pix debug blocks:
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-
-struct VS_OUTPUT_ENV {
-  float4 Pos : SV_Position;
-  float2 Tex : TEXCOORD0;
-};
-
-uint i32;
-
-float4 ForLoopPS(VS_OUTPUT_ENV input) : SV_Target {
-  float4 ret = float4(0, 0, 0, 0);
-  for (uint i = 0; i < abs(input.Tex.x * 200); ++i) {
-    ret.x += (float)i32;
-    if (i + i32 == 0) {
-      break;
-    }
-    ret.y += (float)i32;
-    if (i + i32 == 1) {
-      continue;
-    }
-    ret.z += (float)i32;
-    if (i + i32 == 2) {
-      break;
-    }
-    ret.w += (float)i32;
-    if (i + i32 == 3) {
-      continue;
-    }
-  }
-  return ret;
-}
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugPhisIfElse.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugPhisIfElse.hlsl
deleted file mode 100644
index d15a0f6b5..000000000
--- a/tools/clang/test/HLSLFileCheck/pix/DebugPhisIfElse.hlsl
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: %dxc -EFlowControlPS -Tps_6_0 %s -Od | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation | %FileCheck %s
-
-// Ensure that the pass added a block at the end of this if/else:
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-
-// Check that block 0 emits some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// Check that block 1 emits some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-
-float4 FlowControlPS(in uint value : value ) : SV_Target
-{
-  float4 ret = float4(0, 0, 0, 0);
-  if (value > 1) {
-    ret = float4(0, 0, 0, 2);
-  } else {
-    ret = float4(0, 0, 0, 1);
-  }
-  return ret;
-}
\ No newline at end of file
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugPhisSwicth.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugPhisSwicth.hlsl
deleted file mode 100644
index 8d08df97e..000000000
--- a/tools/clang/test/HLSLFileCheck/pix/DebugPhisSwicth.hlsl
+++ /dev/null
@@ -1,40 +0,0 @@
-// RUN: %dxc -EFlowControlPS -Tps_6_0 %s -Od | %opt -S -dxil-annotate-with-virtual-regs -hlsl-dxil-debug-instrumentation | %FileCheck %s
-
-// Check for a branch to a new block for each case:
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-// CHECK: br label %PIXDebug
-
-// Check that three PIXDebug blocks emit some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// Check that three PIXDebug blocks emit some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-// Check that three PIXDebug blocks emit some debug info and returns where we expect:
-// CHECK: PIXDebug
-// CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78
-// CHECK: br label
-
-
-float4 FlowControlPS(in uint value : value ) : SV_Target
-{
-  float4 ret = float4(0, 0, 0, 0);
-  switch (value)
-  {
-  case 0:
-    ret = float4(1, 0, 0, 0);
-    break;
-  case 1:
-    ret = float4(2, 0, 0, 0);
-    break;
-  default:
-    ret = float4(3, 0, 0, 0);
-    break;
-  }
-  return ret;
-}
\ No newline at end of file
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugUAVSize.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugUAVSize.hlsl
index 5a446dadb..433dca158 100644
--- a/tools/clang/test/HLSLFileCheck/pix/DebugUAVSize.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugUAVSize.hlsl
@@ -1,11 +1,10 @@
-// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,UAVSize=100000 | %FileCheck %s
+// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,UAVSize=1024 | %FileCheck %s
 
-// Check that the UAV size is reflected in the instrumentation. (Should be passed-in size - 64k)
-// (The offset here is the "dumping ground" for non-interesting invocations)
-// 100,000 - 65.536 = 34,464
-
-// CHECK: %OffsetAddend = mul i32 34464, %ComplementOfMultiplicand
+// Check that the UAV size is reflected in the instrumentation.
+// The AND should be (1024/4-1), and the OR should be 1024/2:
 
+// CHECK: %PIXOffsetOr = phi i32 [ 0, %PIXInterestingBlock ], [ 512, %PIXNonInterestingBlock ]
+// CHECK: and i32 {{.*}}, 255
 
 [RootSignature("")]
 float4 main() : SV_Target {
diff --git a/tools/clang/test/HLSLFileCheck/pix/DebugVSParameters.hlsl b/tools/clang/test/HLSLFileCheck/pix/DebugVSParameters.hlsl
index 628949a9e..6b15cdf25 100644
--- a/tools/clang/test/HLSLFileCheck/pix/DebugVSParameters.hlsl
+++ b/tools/clang/test/HLSLFileCheck/pix/DebugVSParameters.hlsl
@@ -8,7 +8,6 @@
 // CHECK: %CompareToVertId = icmp eq i32 %VertId, 1
 // CHECK: %CompareToInstanceId = icmp eq i32 %InstanceId, 2
 // CHECK: %CompareBoth = and i1 %CompareToVertId, %CompareToInstanceId
-// CHECK: %OffsetMultiplicand = zext i1 %CompareBoth to i32
 
 
 [RootSignature("")]
diff --git a/tools/clang/unittests/HLSL/PixTest.cpp b/tools/clang/unittests/HLSL/PixTest.cpp
index 3d64eace9..a41b806ef 100644
--- a/tools/clang/unittests/HLSL/PixTest.cpp
+++ b/tools/clang/unittests/HLSL/PixTest.cpp
@@ -131,6 +131,9 @@ public:
 
   TEST_METHOD(DxilPIXDXRInvocationsLog_SanityTest)
 
+  TEST_METHOD(DebugInstrumentation_TextOutput)
+  TEST_METHOD(DebugInstrumentation_BlockReport)
+
   dxc::DxcDllSupport m_dllSupport;
   VersionSupportInfo m_ver;
 
@@ -188,6 +191,32 @@ public:
         std::move(pOptimizedModule), {}, Tokenize(outputText.c_str(), "\n")};
   }
 
+  // Runs the PIX debug-instrumentation pass pipeline on `dxil` with the given
+  // UAVSize; returns the optimized module and the pass text output as lines.
+  PassOutput RunDebugPass(IDxcBlob *dxil, int UAVSize = 1024 * 1024) {
+    CComPtr<IDxcOptimizer> pOptimizer;
+    VERIFY_SUCCEEDED(
+        m_dllSupport.CreateInstance(CLSID_DxcOptimizer, &pOptimizer));
+    // debugArg must stay alive while Options holds a pointer into it.
+    std::wstring debugArg =
+        L"-hlsl-dxil-debug-instrumentation,UAVSize=" + std::to_wstring(UAVSize);
+    std::vector<LPCWSTR> Options = {
+        L"-opt-mod-passes", L"-dxil-dbg-value-to-dbg-declare",
+        L"-dxil-annotate-with-virtual-regs", debugArg.c_str()};
+    CComPtr<IDxcBlob> pOptimizedModule;
+    CComPtr<IDxcBlobEncoding> pText;
+    VERIFY_SUCCEEDED(pOptimizer->RunOptimizer(
+        dxil, Options.data(), static_cast<UINT32>(Options.size()),
+        &pOptimizedModule, &pText));
+    // Copy exactly GetBufferSize() bytes; do not rely on a terminating NUL.
+    std::string outputText;
+    if (pText->GetBufferSize() != 0)
+      outputText.assign(static_cast<const char *>(pText->GetBufferPointer()),
+                        pText->GetBufferSize());
+    return {
+        std::move(pOptimizedModule), {}, Tokenize(outputText.c_str(), "\n")};
+  }
+
   CComPtr<IDxcBlob> FindModule(hlsl::DxilFourCC fourCC, IDxcBlob *pSource) {
     const UINT32 BC_C0DE = ((INT32)(INT8)'B' | (INT32)(INT8)'C' << 8 |
                             (INT32)0xDEC0 << 16); // BC0xc0de in big endian
@@ -2570,3 +2599,96 @@ void MyMiss(inout MyPayload payload)
   auto compiledLib = Compile(m_dllSupport, source, L"lib_6_6", {});
   RunDxilPIXDXRInvocationsLog(compiledLib);
 }
+
+// Runs the debug pass with a tiny UAV (8) and checks its reported text values.
+TEST_F(PixTest, DebugInstrumentation_TextOutput) {
+  const char *source = R"x(
+float4 main() : SV_Target {
+    return float4(0,0,0,0);
+})x";
+
+  auto compiled = Compile(m_dllSupport, source, L"ps_6_0", {});
+  auto output = RunDebugPass(compiled, 8 /*ludicrously low UAV size limit*/);
+  bool foundStaticOverflow = false;
+  bool foundCounterOffset = false;
+  bool foundThreshold = false;
+  for (auto const &line : output.lines) {
+    foundStaticOverflow |= line.find("StaticOverflow:12") != std::string::npos;
+    foundCounterOffset |=
+        line.find("InterestingCounterOffset:3") != std::string::npos;
+    foundThreshold |= line.find("OverflowThreshold:1") != std::string::npos;
+  }
+  VERIFY_IS_TRUE(foundStaticOverflow);
+  VERIFY_IS_TRUE(foundCounterOffset);
+  VERIFY_IS_TRUE(foundThreshold);
+}
+
+// Checks that the debug pass's per-block text report ("Block#" lines) contains
+// the expected entries for a shader with branches, allocas and 64-bit values.
+TEST_F(PixTest, DebugInstrumentation_BlockReport) {
+  const char *source = R"x(
+RWStructuredBuffer<int> UAV: register(u0);
+float4 main() : SV_Target {
+    // basic int variable
+    int v = UAV[0];
+    if(v == 0)
+        UAV[1] = v;
+    else
+        UAV[2] = v;
+    // float with indexed alloca
+    float f[2];
+    f[0] = UAV[4];
+    f[1] = UAV[5];
+    if(v == 2)
+        f[0] = v;
+    else
+        f[1] = v;
+    float farray2[2];
+    farray2[0] = UAV[4];
+    farray2[1] = UAV[5];
+    if(v == 4)
+        farray2[0] = v;
+    else
+        farray2[1] = v;
+    double d = UAV[8];
+    int64_t i64 = UAV[9];
+    return float4(d,i64,0,0);
+})x";
+
+  auto compiled = Compile(m_dllSupport, source, L"ps_6_0", {L"-Od"});
+  auto output = RunDebugPass(compiled);
+  bool sawBlock = false;
+  bool sawRet = false;
+  bool sawUnnumberedVoid = false;
+  bool saw32BitAssign = false;
+  bool sawFloatAssign = false;
+  bool sawDoubleAssign = false;
+  bool saw64BitAssign = false;
+  bool saw32BitAllocaStore = false;
+  for (auto const &reportLine : output.lines) {
+    // Only the per-block report lines are inspected.
+    if (reportLine.find("Block#") == std::string::npos)
+      continue;
+    auto contains = [&reportLine](const char *entry) {
+      return reportLine.find(entry) != std::string::npos;
+    };
+    sawBlock = true;
+    sawRet |= contains("r,0,r;");
+    sawUnnumberedVoid |= contains("v,0,v;");
+    saw32BitAssign |= contains("3,3,a;");
+    sawDoubleAssign |= contains("d,13,a;");
+    sawFloatAssign |= contains("f,19,a;");
+    saw64BitAssign |= contains("6,16,a;");
+    saw32BitAllocaStore |= contains("3,3,s,2+0;");
+  }
+
+  // Each expected entry must have shown up in at least one block report line.
+  VERIFY_IS_TRUE(sawBlock);
+  VERIFY_IS_TRUE(sawRet);
+  VERIFY_IS_TRUE(sawUnnumberedVoid);
+  VERIFY_IS_TRUE(saw32BitAssign);
+  VERIFY_IS_TRUE(saw64BitAssign);
+  VERIFY_IS_TRUE(sawFloatAssign);
+  VERIFY_IS_TRUE(sawDoubleAssign);
+  VERIFY_IS_TRUE(saw32BitAllocaStore);
+}