Minor YOLO sample edits to speed up CPU post-processing (#200)
This commit is contained in:
Родитель
57d4cec473
Коммит
66d704bdf7
|
@ -339,49 +339,69 @@ void Sample::GetModelPredictions(
|
|||
// values total.
|
||||
assert(anchors.size() == 6);
|
||||
|
||||
std::vector<float> tensorData = CopyReadbackHeap<float>(modelOutput.readback.Get());
|
||||
TensorView<float> predTensor(tensorData, NchwExtents(modelOutput.desc.sizes));
|
||||
// DirectML writes the final output data in NHWC, where the C channel contains the bounding box & probabilities
|
||||
// for each prediction.
|
||||
const uint32_t predTensorN = modelOutput.desc.sizes[0];
|
||||
const uint32_t predTensorH = modelOutput.desc.sizes[1];
|
||||
const uint32_t predTensorW = modelOutput.desc.sizes[2];
|
||||
const uint32_t predTensorC = modelOutput.desc.sizes[3];
|
||||
|
||||
// YoloV4 predicts 3 boxes per scale, so we expect 3 separate predictions here
|
||||
assert(predTensor.Sizes().n == 3);
|
||||
|
||||
// Channel should contain the bounding box x/y/w/h, a confidence score, followed by probabilities for each class
|
||||
assert(predTensor.Sizes().c == 5 + YoloV4Constants::c_numClasses);
|
||||
assert(predTensorN == 3);
|
||||
|
||||
for (uint32_t n = 0; n < predTensor.Sizes().n; ++n)
|
||||
// The channel (last, C) dimension should contain the bounding box x/y/w/h, a confidence score, the probability for max class, and the class index
|
||||
assert(predTensorC == 7);
|
||||
|
||||
struct PotentialPrediction
|
||||
{
|
||||
for (uint32_t h = 0; h < predTensor.Sizes().h; ++h)
|
||||
{
|
||||
for (uint32_t w = 0; w < predTensor.Sizes().w; ++w)
|
||||
{
|
||||
float bx = predTensor(n, 0, h, w);
|
||||
float by = predTensor(n, 1, h, w);
|
||||
float bw = predTensor(n, 2, h, w);
|
||||
float bh = predTensor(n, 3, h, w);
|
||||
float confidence = predTensor(n, 4, h, w);
|
||||
float bx;
|
||||
float by;
|
||||
float bw;
|
||||
float bh;
|
||||
float confidence;
|
||||
float classMaxProbability;
|
||||
uint32_t classIndex;
|
||||
};
|
||||
|
||||
// Copy the probabilities for each class
|
||||
std::vector<float> probabilities;
|
||||
probabilities.reserve(YoloV4Constants::c_numClasses);
|
||||
for (uint32_t i = 5; i < predTensor.Sizes().c; ++i)
|
||||
// The output tensor should be large enough to hold the expected number of predictions.
|
||||
assert(predTensorN * predTensorH * predTensorW * sizeof(PotentialPrediction) <= modelOutput.desc.totalTensorSizeInBytes);
|
||||
std::vector<PotentialPrediction> tensorData = CopyReadbackHeap<PotentialPrediction>(modelOutput.readback.Get());
|
||||
|
||||
// Scale the boxes to be relative to the original image size
|
||||
auto viewport = m_deviceResources->GetScreenViewport();
|
||||
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
|
||||
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
|
||||
|
||||
uint32_t currentPredIndex = 0;
|
||||
for (uint32_t n = 0; n < predTensorN; ++n)
|
||||
{
|
||||
for (uint32_t h = 0; h < predTensorH; ++h)
|
||||
{
|
||||
for (uint32_t w = 0; w < predTensorW; ++w)
|
||||
{
|
||||
const PotentialPrediction& currentPred = tensorData[currentPredIndex++];
|
||||
|
||||
// Discard boxes with low scores
|
||||
float score = currentPred.confidence * currentPred.classMaxProbability;
|
||||
if (score < YoloV4Constants::c_scoreThreshold)
|
||||
{
|
||||
probabilities.push_back(predTensor(n, i, h, w));
|
||||
continue;
|
||||
}
|
||||
|
||||
// We need to do some postprocessing on the raw values before we return them
|
||||
|
||||
// Apply xyScale. Need to apply offsets of half a grid cell here, to ensure the scaling is
|
||||
// centered around zero.
|
||||
bx = xyScale * (bx - 0.5f) + 0.5f;
|
||||
by = xyScale * (by - 0.5f) + 0.5f;
|
||||
float bx = xyScale * (currentPred.bx - 0.5f) + 0.5f;
|
||||
float by = xyScale * (currentPred.by - 0.5f) + 0.5f;
|
||||
|
||||
// Transform the x/y from being relative to the grid cell, to being relative to the whole image
|
||||
bx = (bx + (float)w) * stride;
|
||||
by = (by + (float)h) * stride;
|
||||
|
||||
// Scale the w/h by the supplied anchors
|
||||
bw *= anchors[n * 2];
|
||||
bh *= anchors[n * 2 + 1];
|
||||
float bw = currentPred.bw * anchors[n * 2];
|
||||
float bh = currentPred.bh * anchors[n * 2 + 1];
|
||||
|
||||
// Convert x,y,w,h to xmin,ymin,xmax,ymax
|
||||
float xmin = bx - bw / 2;
|
||||
|
@ -389,12 +409,6 @@ void Sample::GetModelPredictions(
|
|||
float xmax = bx + bw / 2;
|
||||
float ymax = by + bh / 2;
|
||||
|
||||
auto viewport = m_deviceResources->GetScreenViewport();
|
||||
|
||||
// Scale the boxes to be relative to the original image size
|
||||
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
|
||||
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
|
||||
|
||||
xmin *= xScale;
|
||||
ymin *= yScale;
|
||||
xmax *= xScale;
|
||||
|
@ -412,22 +426,13 @@ void Sample::GetModelPredictions(
|
|||
continue;
|
||||
}
|
||||
|
||||
// Discard boxes with low scores
|
||||
ptrdiff_t classIndex = std::max_element(probabilities.begin(), probabilities.end()) - probabilities.begin();
|
||||
float probability = probabilities[classIndex];
|
||||
float score = confidence * probability;
|
||||
if (score < YoloV4Constants::c_scoreThreshold)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
Prediction pred = {};
|
||||
pred.xmin = xmin;
|
||||
pred.ymin = ymin;
|
||||
pred.xmax = xmax;
|
||||
pred.ymax = ymax;
|
||||
pred.score = score;
|
||||
pred.predictedClass = static_cast<uint32_t>(classIndex);
|
||||
pred.predictedClass = currentPred.classIndex;
|
||||
out->push_back(pred);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -276,7 +276,7 @@ private:
|
|||
}
|
||||
};
|
||||
|
||||
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, 5 + numClasses, H, W].
|
||||
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, H, W, 7].
|
||||
// Sigmoid activation is applied to all channels that represent probabilities (which are not all of them).
|
||||
dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
|
||||
{
|
||||
|
@ -294,23 +294,38 @@ dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
|
|||
// Since this doesn't transform the data any, this can be accomplished with a simple reinterpret.
|
||||
output = dml::Reinterpret(output, { 3, numClasses + 5, outputSizes[2], outputSizes[3] }, dml::NullOpt);
|
||||
|
||||
// Split the new channel (of size 5+numClasses) into 3 different tensors with channels of 2, 2, 1+numClasses.
|
||||
// These represent the box xy, box wh, confidence+probabilities for each class.
|
||||
std::vector<dml::Expression> split = dml::Split(output, 1, { 2, 2, 1 + numClasses });
|
||||
assert(split.size() == 3);
|
||||
// Split the new channel (of size 5+numClasses) into 4 different tensors with channels of 2, 2, 1, numClasses.
|
||||
// These represent the box xy, box wh, confidence, and probabilities for each class.
|
||||
const uint32_t channelDim = 1;
|
||||
std::vector<dml::Expression> split = dml::Split(output, channelDim, { 2, 2, 1, numClasses });
|
||||
assert(split.size() == 4);
|
||||
|
||||
// Convenience
|
||||
auto convXy = split[0];
|
||||
auto convWh = split[1];
|
||||
auto convConfProb = split[2];
|
||||
auto convConf = split[2];
|
||||
auto convProb = split[3];
|
||||
|
||||
// Apply final activations
|
||||
convXy = dml::ActivationSigmoid(convXy);
|
||||
convWh = dml::Exp(convWh);
|
||||
convConfProb = dml::ActivationSigmoid(convConfProb);
|
||||
convConf = dml::ActivationSigmoid(convConf);
|
||||
convProb = dml::ActivationSigmoid(convProb);
|
||||
|
||||
const uint32_t joinAxis = 1; // Join along channel
|
||||
return dml::Join({ convXy, convWh, convConfProb }, joinAxis);
|
||||
// Compute the max and argmax of the probabilities. The argmax outputs UINT32 indices which
|
||||
// are reinterpreted as float so they can be joined into the same output tensor.
|
||||
auto convProbMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_MAX, { channelDim });
|
||||
auto convProbArgMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_ARGMAX, { channelDim });
|
||||
convProbArgMax = dml::Reinterpret(convProbArgMax, DML_TENSOR_DATA_TYPE_FLOAT32);
|
||||
|
||||
// Join the tensors along channel dimension.
|
||||
auto joined = dml::Join({ convXy, convWh, convConf, convProbMax, convProbArgMax }, channelDim);
|
||||
|
||||
// Transpose from NCHW to NHWC for faster reading on the CPU (converts output from SoA to AoS).
|
||||
dml::TensorDimensions sizesNchw = joined.GetOutputDesc().sizes;
|
||||
dml::TensorDimensions sizesNhwc = { sizesNchw[0], sizesNchw[3], sizesNchw[2], sizesNchw[1] };
|
||||
dml::TensorStrides stridesNhwc = { sizesNchw[1] * sizesNchw[2] * sizesNchw[3], sizesNchw[3], 1, sizesNchw[2] * sizesNchw[3] };
|
||||
return dml::Identity(dml::Reinterpret(joined, sizesNhwc, stridesNhwc));
|
||||
}
|
||||
|
||||
void Sample::CreateDirectMLResources()
|
||||
|
|
Загрузка…
Ссылка в новой задаче