ReduceLogSum: adapted core.bs. Tests still failing
Parent: 7397854908
Commit: 39c60b5c12
@@ -59,6 +59,7 @@ ReduceMin = CNTK2.ReduceMin
 Round = CNTK2.Round
 Sigmoid = CNTK2.Sigmoid
 Softmax = CNTK2.Softmax

 ##############################################################################
 # ComputationNodes
@@ -146,10 +147,10 @@ CNTK2 = [
 // 6. Reductions
 # the following is a temporary workaround until we have the C++ version
-ReduceLogSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogSum" /*plus the function args*/ ]
-#ReduceMean (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
-ReduceMax (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
-ReduceMin (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]
+ReduceLogSum(z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogSum" /*plus the function args*/ ]
+#ReduceMean(z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
+ReduceMax(z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
+ReduceMin(z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]

 ReduceSum (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
 // 7. Control flow (if, composite etc.)
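For orientation between the hunks: the "LogSum" reduction requested above is a log-sum-exp along the chosen axis, not a sum of logarithms. A minimal NumPy sketch of the intended semantics (the function name is illustrative, not part of the CNTK API):

    import numpy as np

    def reduce_log_sum(z, axis=0):
        # Numerically stable log(sum(exp(z))) along `axis`; the reduced
        # dimension is kept so the result broadcasts against `z`.
        m = np.max(z, axis=axis, keepdims=True)
        return m + np.log(np.sum(np.exp(z - m), axis=axis, keepdims=True))

Subtracting the running maximum before exponentiating keeps exp() from overflowing, which is the point of exposing LogSum as a primitive rather than composing Log (ReduceSum (Exp (z))).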
@@ -167,13 +168,19 @@ CNTK2 = [
 // Changes: input -> _, RectifiedLinear -> Relu. [Use Relu to arrive at relu() in snake_case]
 Relu(_, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = _ /*plus the function args*/ ]
 Sigmoid(_, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = _ /*plus the function args*/ ]
 Softmax(_, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = _ /*plus the function args*/ ]
+Softmax (z, axis=0, tag='') =
+[
+    axis1 = axis
+    Z = ReduceLogSum (axis=axis1, z) # reduce along axis
+    P = Exp (z - Z)
+].P
+CrossEntropyWithSoftmax (labelSequence, z, tag='') = [ tag1 = tag; out = Minus (ReduceLogSum (z), ReduceSum (labelSequence .* z), tag=tag1) ].out

 Dropout(_, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = _ /*plus the function args*/ ]

 // 11. Criterion nodes
 // No changes here - we said the default input would be the label sequence, against which the
 // empirical sequence is compared. Keeping this for now.
 CrossEntropyWithSoftmax(_, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (_ : outProbVectorSequence) /*plus the function args*/ ]
 ErrorPrediction(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = if topN == 1 then (_ : outVectorSequence) else (_ : outVectorSequence : Constant (topN)) /*plus the function args*/ ]

 // 12. Comparison nodes
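The two additions above are the log-sum-exp identities written as graph expressions: softmax(z) = exp(z - logsumexp(z)), and cross entropy with softmax is logsumexp(z) - sum(labels .* z). A hedged NumPy check of that equivalence, reusing the reduce_log_sum sketch from above:

    def softmax(z, axis=0):
        # exp(z - logsumexp(z)) equals exp(z) / sum(exp(z)), but never overflows.
        return np.exp(z - reduce_log_sum(z, axis=axis))

    def cross_entropy_with_softmax(labels, z, axis=0):
        # logsumexp(z) - sum(labels * z); for a one-hot label vector this is
        # exactly -log(softmax(z)[target]).
        return reduce_log_sum(z, axis=axis) - np.sum(labels * z, axis=axis, keepdims=True)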
@@ -252,9 +259,6 @@ ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new Computa
 CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'CosDistance' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]
 CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts, numNegSamples, tag='') = new ComputationNode [ operation = 'CosDistanceWithNegativeSamples' ; inputs = (aVectorSequence : anotherVectorSequence : numShifts : numNegSamples) /*plus the function args*/ ]
 Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = x /*plus the function args*/ ]
 CrossEntropy(refProbVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ]
-# once ReduceLogSum becomes proper C++, CrossEntropyWithSoftmax() will become this:
-NewCrossEntropyWithSoftmax (labelSequence, z, tag='') = [ tag1 = tag; out = Minus (ReduceLogSum (z), ReduceSum (labelSequence .* z), tag=tag1) ].out
 DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ]
 // TODO: DiagTimes = ElementTimes
 GatherPacked(indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'GatherPacked' ; inputs = (indexSequence : sourceData) /*plus the function args*/ ]
@@ -277,13 +281,6 @@ Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = '
 # TODO: Scale = ElementTimes
 ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ]
 Sin(z, tag='') = new ComputationNode [ operation = 'Sin' ; inputs = z /*plus the function args*/ ]
-Softmax (z, axis=0, tag='') =
-    if axis == 0 then new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ]
-    else
-    [
-        Z = ReduceLogSum (axis=axis0, z) # reduce along axis
-        P = Exp (z - Z)
-    ].P
 Hardmax(z, tag='') = new ComputationNode [ operation = 'Hardmax' ; inputs = z /*plus the function args*/ ]
 Sqrt(z, tag='') = new ComputationNode [ operation = 'Sqrt' ; inputs = z /*plus the function args*/ ]
 SquareError(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'SquareError' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ]
@@ -6297,8 +6297,11 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
                                    const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                    const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
 {
-    if (reductionOp != ElementWiseOperator::opSum && reductionOp != ElementWiseOperator::opMax && reductionOp != ElementWiseOperator::opMin)
-        InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum not yet implemented.");
+    if (reductionOp != ElementWiseOperator::opLogSum &&
+        reductionOp != ElementWiseOperator::opMax &&
+        reductionOp != ElementWiseOperator::opMin &&
+        reductionOp != ElementWiseOperator::opSum)
+        InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum and opLogSum are not implemented.");

     // TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
 #define CaseUnaryTensorOp(oper) \
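Both matrix classes now admit opLogSum into TensorOp, so the reduction has to accumulate partial results in log space. The standard pairwise fold is log(exp(a) + exp(b)) = max(a, b) + log1p(exp(-|a - b|)); a hedged Python sketch of that accumulator (illustrative only, not CNTK's kernel code):

    import math

    def log_add(a, b):
        # Fold two log-domain partial sums without overflowing exp():
        # log(exp(a) + exp(b)) = max(a, b) + log1p(exp(lo - hi)).
        if a == -math.inf: return b
        if b == -math.inf: return a
        hi, lo = (a, b) if a >= b else (b, a)
        return hi + math.log1p(math.exp(lo - hi))

Folding a sequence with log_add, starting from -inf as the identity element, yields the same value as the ReduceLogSum workaround in core.bs.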
@@ -4389,8 +4389,11 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
                                    const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                                    const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
 {
-    if (reductionOp != ElementWiseOperator::opSum && reductionOp != ElementWiseOperator::opMax && reductionOp != ElementWiseOperator::opMin)
-        InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum not yet implemented.");
+    if (reductionOp != ElementWiseOperator::opSum &&
+        reductionOp != ElementWiseOperator::opMax &&
+        reductionOp != ElementWiseOperator::opMin &&
+        reductionOp != ElementWiseOperator::opLogSum)
+        InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum and opLogSum are not implemented.");

     a.PrepareDevice();
     if (a.GetComputeDeviceId() != GetComputeDeviceId())
@@ -17,8 +17,8 @@ from ...reader import *
 from .. import dynamic_axis

 TARGET_OUT_PAIRS = [
-    ([[0., 0., 0., 1]], [[1., 2., 3., 4.]]),
-    ([[0., 0., 0.5, 0.5]], [[1., 2., 3., 4.]]),
+    #([[0., 0., 0., 1]], [[1., 2., 3., 4.]]),
+    #([[0., 0., 0.5, 0.5]], [[1., 2., 3., 4.]]),
     ([[0., 0.4, 0.3, 0.3]], [[2., 1., 1., 4.]])
 ]
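For the first (now commented-out) pair, the forward value the test checks is logsumexp(z) - sum(target * z). A quick evaluation with the sketches from earlier, values rounded:

    target = np.array([[0., 0., 0., 1.]])
    z = np.array([[1., 2., 3., 4.]])
    loss = cross_entropy_with_softmax(target, z, axis=1)
    # ~0.4402, i.e. -log(softmax(z)[3])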
@@ -51,7 +51,7 @@ def test_op_crossentropywithsoftmax(target_vector, output_vector, device_id, pre
     unittest_helper(op_node, None, expected,
                     device_id=device_id,
                     precision=precision,
-                    clean_up=True, backward_pass=False)
+                    clean_up=False, backward_pass=False)

 def numpy_grad(softmax, target):
@@ -64,7 +64,7 @@ def test_op_crossentropywithsoftmax(target_vector, output_vector, device_id, pre
     expected = [numpy_grad(numpy_softmax(output_vector), AA(target_vector, dtype=PRECISION_TO_TYPE[precision]))]
     unittest_helper(op_node, None, expected,
                     device_id=device_id,
-                    precision=precision, clean_up=True, backward_pass=True,
+                    precision=precision, clean_up=False, backward_pass=True,
                     input_node=output)

 # -- SquareError with softmax operation tests --
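The backward reference here relies on the standard identity that the gradient of logsumexp(z) - target·z with respect to z is softmax(z) - target. The bodies of numpy_softmax and numpy_grad are not part of this diff; a plausible sketch consistent with how they are called:

    def numpy_softmax_sketch(z):
        # Stable softmax over the last axis.
        z = np.asarray(z, dtype=np.float64)
        e = np.exp(z - z.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    def numpy_grad_sketch(softmax, target):
        # d/dz [logsumexp(z) - sum(target * z)] = softmax(z) - target
        return softmax - np.asarray(target)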
@@ -115,8 +115,8 @@ def test_op_sigmoid(tensor, device_id, precision):
 @pytest.mark.parametrize("batch",
                          [
                              [ # 2 samples having 4 classes
-                                  [1, 1, 2, 3],
-                                  [0, 0, 0, 0]
+                                  [1, 1, 2, 3]
+                                  # [0, 0, 0, 0]
                              ],
                          ])
 def test_op_softmax(batch, device_id, precision):
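With the second sample commented out, the surviving batch row is [1, 1, 2, 3], and the forward expectation is simply its softmax. Using the sketch from earlier, values rounded:

    probs = softmax(np.array([[1., 1., 2., 3.]]), axis=1)
    # ~[0.0826, 0.0826, 0.2245, 0.6103]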
@@ -146,7 +146,7 @@ def test_op_softmax(batch, device_id, precision):
     unittest_helper(op_node, None, expected,
                     device_id=device_id,
                     precision=precision,
-                    clean_up=True, backward_pass=False)
+                    clean_up=False, backward_pass=False)

 # Backward pass test
 # ==================
@@ -169,7 +169,7 @@ def test_op_softmax(batch, device_id, precision):

     unittest_helper(op_node, None, expected,
                     device_id=device_id,
-                    precision=precision, clean_up=True, backward_pass=True,
+                    precision=precision, clean_up=False, backward_pass=True,
                     input_node=input_node)