Minor YOLO sample edits to speed up CPU post-processing (#200)

This commit is contained in:
Justin Stoecker 2022-02-04 19:10:57 -08:00 committed by GitHub
Parent 57d4cec473
Commit 66d704bdf7
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 70 additions and 50 deletions

View file

@ -339,49 +339,69 @@ void Sample::GetModelPredictions(
// values total.
assert(anchors.size() == 6);
std::vector<float> tensorData = CopyReadbackHeap<float>(modelOutput.readback.Get());
TensorView<float> predTensor(tensorData, NchwExtents(modelOutput.desc.sizes));
// DirectML writes the final output data in NHWC, where the C channel contains the bounding box & probabilities
// for each prediction.
const uint32_t predTensorN = modelOutput.desc.sizes[0];
const uint32_t predTensorH = modelOutput.desc.sizes[1];
const uint32_t predTensorW = modelOutput.desc.sizes[2];
const uint32_t predTensorC = modelOutput.desc.sizes[3];
// YoloV4 predicts 3 boxes per scale, so we expect 3 separate predictions here
assert(predTensor.Sizes().n == 3);
// Channel should contain the bounding box x/y/w/h, a confidence score, followed by probabilities for each class
assert(predTensor.Sizes().c == 5 + YoloV4Constants::c_numClasses);
assert(predTensorN == 3);
for (uint32_t n = 0; n < predTensor.Sizes().n; ++n)
// Width should contain the bounding box x/y/w/h, a confidence score, the probability for max class, and the class index
assert(predTensorC == 7);
struct PotentialPrediction
{
for (uint32_t h = 0; h < predTensor.Sizes().h; ++h)
{
for (uint32_t w = 0; w < predTensor.Sizes().w; ++w)
{
float bx = predTensor(n, 0, h, w);
float by = predTensor(n, 1, h, w);
float bw = predTensor(n, 2, h, w);
float bh = predTensor(n, 3, h, w);
float confidence = predTensor(n, 4, h, w);
float bx;
float by;
float bw;
float bh;
float confidence;
float classMaxProbability;
uint32_t classIndex;
};
// Copy the probabilities for each class
std::vector<float> probabilities;
probabilities.reserve(YoloV4Constants::c_numClasses);
for (uint32_t i = 5; i < predTensor.Sizes().c; ++i)
// The output tensor should be large enough to hold the expected number of predictions.
assert(predTensorN * predTensorH * predTensorW * sizeof(PotentialPrediction) <= modelOutput.desc.totalTensorSizeInBytes);
std::vector<PotentialPrediction> tensorData = CopyReadbackHeap<PotentialPrediction>(modelOutput.readback.Get());
// Scale the boxes to be relative to the original image size
auto viewport = m_deviceResources->GetScreenViewport();
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
uint32_t currentPredIndex = 0;
for (uint32_t n = 0; n < predTensorN; ++n)
{
for (uint32_t h = 0; h < predTensorH; ++h)
{
for (uint32_t w = 0; w < predTensorW; ++w)
{
const PotentialPrediction& currentPred = tensorData[currentPredIndex++];
// Discard boxes with low scores
float score = currentPred.confidence * currentPred.classMaxProbability;
if (score < YoloV4Constants::c_scoreThreshold)
{
probabilities.push_back(predTensor(n, i, h, w));
continue;
}
// We need to do some postprocessing on the raw values before we return them
// Apply xyScale. Need to apply offsets of half a grid cell here, to ensure the scaling is
// centered around zero.
bx = xyScale * (bx - 0.5f) + 0.5f;
by = xyScale * (by - 0.5f) + 0.5f;
float bx = xyScale * (currentPred.bx - 0.5f) + 0.5f;
float by = xyScale * (currentPred.by - 0.5f) + 0.5f;
// Transform the x/y from being relative to the grid cell, to being relative to the whole image
bx = (bx + (float)w) * stride;
by = (by + (float)h) * stride;
// Scale the w/h by the supplied anchors
bw *= anchors[n * 2];
bh *= anchors[n * 2 + 1];
float bw = currentPred.bw * anchors[n * 2];
float bh = currentPred.bh * anchors[n * 2 + 1];
// Convert x,y,w,h to xmin,ymin,xmax,ymax
float xmin = bx - bw / 2;
@ -389,12 +409,6 @@ void Sample::GetModelPredictions(
float xmax = bx + bw / 2;
float ymax = by + bh / 2;
auto viewport = m_deviceResources->GetScreenViewport();
// Scale the boxes to be relative to the original image size
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
xmin *= xScale;
ymin *= yScale;
xmax *= xScale;
@ -412,22 +426,13 @@ void Sample::GetModelPredictions(
continue;
}
// Discard boxes with low scores
ptrdiff_t classIndex = std::max_element(probabilities.begin(), probabilities.end()) - probabilities.begin();
float probability = probabilities[classIndex];
float score = confidence * probability;
if (score < YoloV4Constants::c_scoreThreshold)
{
continue;
}
Prediction pred = {};
pred.xmin = xmin;
pred.ymin = ymin;
pred.xmax = xmax;
pred.ymax = ymax;
pred.score = score;
pred.predictedClass = static_cast<uint32_t>(classIndex);
pred.predictedClass = currentPred.classIndex;
out->push_back(pred);
}
}

View file

@ -276,7 +276,7 @@ private:
}
};
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, 5 + numClasses, H, W].
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, H, W, 7].
// Sigmoid activation is applied to all channels that represent probabilities (which are not all of them).
dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
{
@ -294,23 +294,38 @@ dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
// Since this doesn't transform the data any, this can be accomplished with a simple reinterpret.
output = dml::Reinterpret(output, { 3, numClasses + 5, outputSizes[2], outputSizes[3] }, dml::NullOpt);
// Split the new channel (of size 5+numClasses) into 4 different tensors with channels of 2, 2, 1+numClasses.
// These represent the box xy, box wh, confidence+probabilities for each class.
std::vector<dml::Expression> split = dml::Split(output, 1, { 2, 2, 1 + numClasses });
assert(split.size() == 3);
// Split the new channel (of size 5+numClasses) into 4 different tensors with channels of 2, 2, 1, numClasses.
// These represent the box xy, box wh, confidence, and probabilities for each class.
const uint32_t channelDim = 1;
std::vector<dml::Expression> split = dml::Split(output, channelDim, { 2, 2, 1, numClasses });
assert(split.size() == 4);
// Convenience
auto convXy = split[0];
auto convWh = split[1];
auto convConfProb = split[2];
auto convConf = split[2];
auto convProb = split[3];
// Apply final activations
convXy = dml::ActivationSigmoid(convXy);
convWh = dml::Exp(convWh);
convConfProb = dml::ActivationSigmoid(convConfProb);
convConf = dml::ActivationSigmoid(convConf);
convProb = dml::ActivationSigmoid(convProb);
const uint32_t joinAxis = 1; // Join along channel
return dml::Join({ convXy, convWh, convConfProb }, joinAxis);
// Compute the max and argmax of the probabilities. The argmax outputs UINT32 indices which
// are reinterpreted as float so they can be joined into the same output tensor.
auto convProbMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_MAX, { channelDim });
auto convProbArgMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_ARGMAX, { channelDim });
convProbArgMax = dml::Reinterpret(convProbArgMax, DML_TENSOR_DATA_TYPE_FLOAT32);
// Join the tensors along channel dimension.
auto joined = dml::Join({ convXy, convWh, convConf, convProbMax, convProbArgMax }, channelDim);
// Transpose from NCHW to NHWC for faster reading on the CPU (converts output from SoA to AoS).
dml::TensorDimensions sizesNchw = joined.GetOutputDesc().sizes;
dml::TensorDimensions sizesNhwc = { sizesNchw[0], sizesNchw[3], sizesNchw[2], sizesNchw[1] };
dml::TensorStrides stridesNhwc = { sizesNchw[1] * sizesNchw[2] * sizesNchw[3], sizesNchw[3], 1, sizesNchw[2] * sizesNchw[3] };
return dml::Identity(dml::Reinterpret(joined, sizesNhwc, stridesNhwc));
}
void Sample::CreateDirectMLResources()