From 3a2f92acf2269143d40ecaa0c7bf91c388b402c2 Mon Sep 17 00:00:00 2001
From: Amit Agarwal <amitaga@microsoft.com>
Date: Mon, 17 Aug 2015 10:24:15 -0700
Subject: [PATCH] Added Print function for QuantizedMatrix to aid debugging

---
 Math/CNTKMathTest/MatrixQuantizerTests.cpp | 37 +++++++++++-
 Math/Math/ColumnQuantizer.h                |  3 +
 Math/Math/Math.vcxproj.filters             | 10 +---
 Math/Math/Matrix.h                         |  3 +
 Math/Math/QuantizedMatrix.cpp              | 67 ++++++++++++++++++++++
 Math/Math/QuantizedMatrix.h                |  3 +
 6 files changed, 115 insertions(+), 8 deletions(-)
diff --git a/Math/CNTKMathTest/MatrixQuantizerTests.cpp b/Math/CNTKMathTest/MatrixQuantizerTests.cpp
index 5f5870e5e..f56badfad 100644
--- a/Math/CNTKMathTest/MatrixQuantizerTests.cpp
+++ b/Math/CNTKMathTest/MatrixQuantizerTests.cpp
@@ -5,6 +5,9 @@
 //
 #include "stdafx.h"
 #include "CppUnitTest.h"
+#include "File.h"
+#include <memory>
+#include <io.h>
 
 #include "..\Math\MatrixQuantizer.h"
 #include "..\Math\CUDAPageLockedMemAllocator.h"
@@ -17,6 +20,20 @@ using namespace Microsoft::MSR::CNTK;
 using namespace Microsoft::MSR::CNTK;
 using namespace Microsoft::VisualStudio::CppUnitTestFramework;
 
+//#define DEBUG_OUTPUT_PATH L"E:/temp/MatrixQuantizerTest.out.txt"
+
+#pragma warning (disable: 4996)
+
+void RedirectStdErr(wstring logpath)    // debug helper: makes file descriptor 2 (stderr) write to the text file 'logpath'
+{
+    fprintf(stderr, "Redirecting stderr to file %S\n", logpath.c_str());   // emitted before the redirection takes effect
+    shared_ptr<File> logFile = make_shared<File>(logpath.c_str(), fileOptionsWrite | fileOptionsText);
+    if (dup2(fileno(*logFile), 2) == -1)    // -1 means the log file's descriptor could not be duplicated onto fd 2
+        RuntimeError("unexpected failure to redirect stderr to log file");
+    setvbuf(stderr, NULL, _IONBF, 16384);   // _IONBF: switch stderr to unbuffered mode
+    static shared_ptr<File> keptLogFile = logFile;  // function-local static: initialized on the first call only, keeps that File alive
+}
+
 namespace CNTKMathTest
 {
     TEST_CLASS(MatrixQuantizerTests)
@@ -130,14 +147,28 @@ namespace CNTKMathTest
                 ElemType* gpuPrevResidualMatrix = quantizer->GetResidualMatrix().CopyToArray();
                 ElemType *gpuPrevOutMatrix = outMatrix.CopyToArray();
 
-                QuantizedMatrix<ElemType> tempCPUQuantizationBuffer(numRows, numCols, 1, CPUDEVICE, allocator);
+#ifdef DEBUG_OUTPUT_PATH
+                inMatrix.Print("Input Matrix", 0, 2, 0, 2);
+                quantizer->GetResidualMatrix().Print("Old Residual Matrix", 0, 2, 0, 2);
+                outMatrix.Print("Old Output Matrix", 0, 2, 0, 2);
+#endif
 
+                QuantizedMatrix<ElemType> tempCPUQuantizationBuffer(numRows, numCols, 1, CPUDEVICE, allocator);
                 quantizer->QuantizeAsync(tempCPUQuantizationBuffer);
                 quantizer->WaitQuantizeAsyncDone();
 
+#ifdef DEBUG_OUTPUT_PATH
+                tempCPUQuantizationBuffer.Print("Quantized Matrix", 0, 2, 0, 2);
+                quantizer->GetResidualMatrix().Print("New residual Matrix", 0, 2, 0, 2);
+#endif
+
                 quantizer->UnquantizeAsync(tempCPUQuantizationBuffer, outMatrix, (iterNum > 0));
                 quantizer->WaitUnquantizeAsyncDone();
 
+#ifdef DEBUG_OUTPUT_PATH
+                outMatrix.Print("Unquantized Output Matrix", 0, 2, 0, 2);
+#endif
+
                 // Now verify the quantization results
                 ElemType* gpuNewResidualMatrix = quantizer->GetResidualMatrix().CopyToArray();
                 ElemType* gpuNewOutMatrix = outMatrix.CopyToArray();
@@ -251,6 +282,10 @@ namespace CNTKMathTest
         //This test will fail without GPU
         TEST_METHOD(Matrix1BitQuantize)
         {
+#ifdef DEBUG_OUTPUT_PATH
+            RedirectStdErr(DEBUG_OUTPUT_PATH);
+#endif
+
             // Test single precision 1bit quantization on CPU
             Test1BitQuantization<float>(CPUDEVICE);
 
diff --git a/Math/Math/ColumnQuantizer.h b/Math/Math/ColumnQuantizer.h
index e3b62e092..e34aa9c59 100644
--- a/Math/Math/ColumnQuantizer.h
+++ b/Math/Math/ColumnQuantizer.h
@@ -316,6 +316,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     private:
         ValueQuantizer<ElemType> valQ;
+
+        template<typename T>
+        friend class QuantizedMatrix;
     };
 
 }}}
diff --git a/Math/Math/Math.vcxproj.filters b/Math/Math/Math.vcxproj.filters
index bc8a1d478..88be01b2c 100644
--- a/Math/Math/Math.vcxproj.filters
+++ b/Math/Math/Math.vcxproj.filters
@@ -25,7 +25,7 @@
     <ClCompile Include="MatrixQuantizer.cpp" />
     <ClCompile Include="QuantizedMatrix.cpp" />
     <ClCompile Include="CUDAPageLockedMemAllocator.cpp">
-      <Filter>CPU\1bitSGD</Filter>
+      <Filter>GPU\1bitSGD</Filter>
     </ClCompile>
   </ItemGroup>
   <ItemGroup>
@@ -57,13 +57,9 @@
     </ClInclude>
     <ClInclude Include="MatrixQuantizer.h" />
     <ClInclude Include="QuantizedMatrix.h" />
-      <Filter>CPU\1bitSGD</Filter>
-    </ClInclude>
-    <ClInclude Include="MemAllocator.h">
-      <Filter>CPU\1bitSGD</Filter>
-    </ClInclude>
+    <ClInclude Include="MemAllocator.h" />
     <ClInclude Include="CUDAPageLockedMemAllocator.h">
-      <Filter>CPU\1bitSGD</Filter>
+      <Filter>GPU\1bitSGD</Filter>
     </ClInclude>
   </ItemGroup>
   <ItemGroup>
diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h
index dd19d9688..06f6f0bca 100644
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@@ -472,6 +472,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         template<typename T>
         friend class MatrixQuantizer;
+
+        template<typename T>
+        friend class QuantizedMatrix;
     };
 
     typedef Matrix<float> SingleMatrix;
diff --git a/Math/Math/QuantizedMatrix.cpp b/Math/Math/QuantizedMatrix.cpp
index 752056f7e..fd1b1fe07 100644
--- a/Math/Math/QuantizedMatrix.cpp
+++ b/Math/Math/QuantizedMatrix.cpp
@@ -1,5 +1,6 @@
 #include "stdafx.h"
 #include "QuantizedMatrix.h"
+#include "ColumnQuantizer.h"
 
 namespace Microsoft { namespace MSR { namespace CNTK {
     
@@ -107,6 +108,72 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return QuantizedMatrix<ElemType>(this->GetNumRows(), numCols, this->GetNumBits(), matrixSliceData);
     }
     
+    // Debugging aid: writes the inclusive range [rowStart..rowEnd] x [colStart..colEnd] to stderr -- first the
+    // Lower/Upper bounds of each selected column, then per row each element's stored bit and unquantized value.
+    // Throws std::logic_error if the matrix is empty or not 1-bit, std::invalid_argument if rowEnd/colEnd is out
+    // of range. Data located on the GPU is transferred to the CPU for printing and transferred back afterwards.
+    template<class ElemType>
+    void QuantizedMatrix<ElemType>::Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd)
+    {
+        if ((GetNumRows() == 0) || (GetNumCols() == 0))
+            throw std::logic_error("Print: QuantizedMatrix is empty.");
+
+        if (rowEnd >= GetNumRows() || colEnd >= GetNumCols())
+            throw std::invalid_argument("Index out of range.");
+
+        if (this->GetNumBits() != 1)
+            throw std::logic_error("QuantizedMatrix::Print is currently only supported for 1 bit.");
+
+        DEVICEID_TYPE orgdevice = this->GetDeviceId();
+        CurrentDataLocation curLocation = m_quantizedData->GetCurrentMatrixLocation();
+        if (curLocation == CurrentDataLocation::GPU)
+            m_quantizedData->_transferToDevice(CPUDEVICE, false, false);
+
+        // size_t can be wider than unsigned long (e.g. 64-bit Windows), so convert explicitly for the %lu conversions.
+        const unsigned long rowCount = static_cast<unsigned long>(GetNumRows());
+        const unsigned long colCount = static_cast<unsigned long>(GetNumCols());
+        if (matrixName != nullptr)
+            fprintf(stderr, "\n###### %s (%lu, %lu) ######\n", matrixName, rowCount, colCount);
+        else
+            fprintf(stderr, "\n###### Unnamed Matrix (%lu, %lu) ######\n", rowCount, colCount);
+
+        fprintf(stderr, "\n------ Print Range (%lu:%lu, %lu:%lu) ------\n",
+                static_cast<unsigned long>(rowStart), static_cast<unsigned long>(rowEnd),
+                static_cast<unsigned long>(colStart), static_cast<unsigned long>(colEnd));
+
+        for (size_t j = colStart; j <= colEnd; j++)
+        {
+            QuantizedColumn<ElemType>* qCol = this->GetQuantizedColumn(j);
+            fprintf(stderr, "Lower=%.10f,Upper=%.10f\t", qCol->lower, qCol->upper);
+        }
+        fprintf(stderr, "\n");
+
+        const size_t ldNbits = ValueQuantizer<ElemType>::ld(this->GetNumBits());
+        size_t numQWordsPerCol = ColumnQuantizer<ElemType>::QWordsPerCol(this->GetNumRows(), this->GetNumBits());
+        for (size_t i = rowStart; i <= rowEnd; i++)
+        {
+            // Row i is looked up as bit (i / numQWordsPerCol) of word (i % numQWordsPerCol) of its column.
+            size_t qWordIdx = i % numQWordsPerCol;
+            size_t offsetInQWord = i / numQWordsPerCol;
+            for (size_t j = colStart; j <= colEnd; j++)
+            {
+                QuantizedColumn<ElemType>* qCol = this->GetQuantizedColumn(j);
+                ColumnQuantizer<ElemType> q(ldNbits, qCol->lower, qCol->upper);
+                ElemType val0 = q.valQ.Unquantize(0);
+                ElemType val1 = q.valQ.Unquantize(1);
+
+                QWord qWord = qCol->bits[qWordIdx];
+                bool qVal = ((qWord >> offsetInQWord) & 1) != 0;
+                ElemType val = ValueQuantizer<ElemType>::Unquantize1(qVal, val0, val1);
+                fprintf(stderr, "%1d (%.10f)                   \t", qVal ? 1 : 0, val);
+            }
+            fprintf(stderr, "\n");
+        }
+
+        if (curLocation == CurrentDataLocation::GPU)
+            m_quantizedData->_transferToDevice(orgdevice, false, false);
+    }
+
     // Explicit instantiation
     template class QuantizedMatrix<float>;
     template class QuantizedMatrix<double>;    
diff --git a/Math/Math/QuantizedMatrix.h b/Math/Math/QuantizedMatrix.h
index 96d79ffc4..327ae2746 100644
--- a/Math/Math/QuantizedMatrix.h
+++ b/Math/Math/QuantizedMatrix.h
@@ -51,6 +51,7 @@ public:
 template<class ElemType>
 class MATH_API QuantizedMatrix
 {
+    typedef typename ValueQuantizer<ElemType>::QWord QWord;
     static const size_t QWordNumBits = ValueQuantizer<ElemType>::QWordNumBits;
 
 public:       
@@ -94,6 +95,8 @@ public:
     
     QuantizedMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
 
+    void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd);
+
 private:
     // Private constructor for creating quantized matrix column slices
     QuantizedMatrix(const size_t numRows, const size_t numCols, const size_t nbits, Matrix<char>* data);