(added logging to calgammaformb() to track down an error)

This commit is contained in:
Frank Seide 2015-12-14 15:40:50 -08:00
Parent f4a91555cf
Commit 7a57f8b21b
2 changed files with 61 additions and 20 deletions

View file

@ -25,6 +25,45 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// which represents the column-major interpretation of a transposed row-by-row-scanned image where each pixel stores (R,G,B) as a float3.
// -----------------------------------------------------------------------
// Plans for improved tensor support:
//
// TensorShape support for:
// - column-major arbitrary-dimension arrays --this is already implemented
// - strides for storage, allowing
// - slicing
// - strides for computation, allowing
// - broadcasting (stride = 0)
// - stride magic such as inverting index order or convolution
// - insertion and dropping of 1-dimension (cf. 'new_axis' in numpy)
//
// Relation to Matrix and MBLayout:
// - tensors are stored in Matrix objects
// - both matrix row and column dimensions are interpreted as tensor dimensions
// - row dimension is explained by a TensorShape ComputationNode::SampleLayout
// - column dimensions are explained by MBLayout, which has one parallel-sequence index and one (or more) time-step dimensions, e.g. (s,t)
// - the total tensor shape of what is stored in the matrix is
// - no MBLayout: the SampleLayout
// - in presence of an MBLayout, it is determined as
// - when applying element-wise operations, first expand all operands to the same SampleLayout length by padding with 1-dimensions
// - concatenate that shape, say, (I,J,K) with the shape derived from the MBLayout, say (S,T) -> (I,J,K,S,T)
// - these extra dimensions are only used internally, but not accessible to the user (user/network definition operates on samples only)
// - examples:
// - A[(I,J,K), (S,T)] + B[(I,J,K), (S,T)] -> C[I,J,K,S,T] // all dimensions match
// - A[(I,J), (S,T)] + B[(I,J,K), (S,T)] -> C[I,J,K,S,T] // A gets an additional broadcasting dimension that matches K
// - A(I,T) + B(I) -> C(I,T) // T is broadcasting for B, e.g. adding a bias
// - A(I,T1,T2) + B(1,T1) -> C(I,T1,T2) // 2D iteration; implies a third dim for B where both first and third dim broadcast
//
// Operations:
// - all elementwise operations:
// - dimensions are expanded as explained above for all operands
// - of note: result may also have broadcasting dimensions
// - elementwise 'copy' is also considered here, which allows for strided copies
// - inner product (Kronecker product+contraction) -> TimesNode
// - implementable as SGEMM (may extend in the future)
// - tensor transpose -> TransposeNode
// - swaps any two dimensions. This does not change the column-major definition, i.e. requires a memory copy.
// - special case: swapping between sample and MBLayout, e.g. turn a sample dimension to a time dimension
// TODO: must match ComputationNode::m_numRows; or, rather, the TensorShape is how m_numRows is stored??
struct TensorShape
{

View file

@ -55,14 +55,14 @@ namespace msra { namespace lattices {
//check total frame number to be added ?
//int deviceid = loglikelihood.GetDeviceId();
size_t boundaryframenum;
std::vector<size_t> validframes;
std::vector<size_t> validframes; // [s] cursor pointing to next utterance begin within a single parallel sequence [s]
validframes.assign(samplesInRecurrentStep, 0);
ElemType objectValue = 0.0;
//convert from Microsoft::MSR::CNTK::Matrix to msra::math::ssematrixbase
size_t numrows = loglikelihood.GetNumRows();
size_t numcols = loglikelihood.GetNumCols();
Microsoft::MSR::CNTK::Matrix<ElemType> tempmatrix(m_deviceid);
//copy loglikelihood to pred
if (numcols > pred.cols())
{
@ -72,19 +72,17 @@ namespace msra { namespace lattices {
if (doreferencealign)
labels.SetValue((ElemType)(0.0f));
size_t mbsize = numcols / samplesInRecurrentStep;
size_t T = numcols / samplesInRecurrentStep; // number of time steps in minibatch
if (samplesInRecurrentStep > 1)
{
assert(extrauttmap.size() == lattices.size());
assert(mbsize == pMBLayout->GetNumTimeSteps());
assert(T == pMBLayout->GetNumTimeSteps());
}
size_t mapi = 0;
size_t mapframenum = 0;
//cal gamma for each utterance
size_t mapi = 0; // parallel-sequence index for utterance [i]
// cal gamma for each utterance
size_t ts = 0;
//size_t ts_uid = 0;
for (size_t i = 0; i < lattices.size(); i++)
{
const size_t numframes = lattices[i]->getnumframes();
@ -92,8 +90,7 @@ namespace msra { namespace lattices {
msra::dbn::matrixstripe predstripe(pred, ts, numframes); // logLLs for this utterance
msra::dbn::matrixstripe dengammasstripe(dengammas, ts, numframes); // denominator gammas
if (samplesInRecurrentStep == 1) //one channel
if (samplesInRecurrentStep == 1) // no sequence parallelism
{
tempmatrix = loglikelihood.ColumnSlice(ts, numframes);
//if (m_deviceid == CPUDEVICE)
@ -104,21 +101,26 @@ namespace msra { namespace lattices {
if (m_deviceid != CPUDEVICE)
parallellattice.setloglls(tempmatrix);
}
else //multi channel
else // multiple parallel sequences
{
//get frame number for each utterance
mapi = extrauttmap[i];
for (size_t j = validframes[mapi]; j < mbsize; j++)
// get number of frames for the utterance
mapi = extrauttmap[i]; // parallel-sequence index; in case of >1 utterance within this parallel sequence, this is in order of concatenation
// scan MBLayout for end of utterance
size_t mapframenum = SIZE_MAX; // duration of utterance [i] as determined from MBLayout
for (size_t t = validframes[mapi]; t < T; t++)
{
// TODO: Adapt this to new MBLayout, m_sequences would be easier to work off.
if (pMBLayout->IsEnd(mapi,j))
if (pMBLayout->IsEnd(mapi,t))
{
mapframenum = j - validframes[mapi] + 1;
mapframenum = t - validframes[mapi] + 1;
break;
}
}
// must match the explicit information we get from the reader
if (numframes != mapframenum)
LogicError("gammacalculation: IsEnd() not working, numframes (%d) vs. mapframenum (%d)", (int)numframes, (int)mapframenum);
assert(numframes == mapframenum);
if (numframes > tempmatrix.GetNumCols())
@ -195,7 +197,7 @@ namespace msra { namespace lattices {
}
}
if (samplesInRecurrentStep > 1)
validframes[mapi] += numframes;
validframes[mapi] += numframes; // advance the cursor within the parallel sequence
fprintf(stderr, "dengamma value %f\n", denavlogp);
ts += numframes;
}