Minor YOLO sample edits to speed up CPU post-processing (#200)

This commit is contained in:
Justin Stoecker 2022-02-04 19:10:57 -08:00 committed by GitHub
Parent 57d4cec473
Commit 66d704bdf7
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 70 additions and 50 deletions

View file

@ -339,49 +339,69 @@ void Sample::GetModelPredictions(
// values total.
assert(anchors.size() == 6);
std::vector<float> tensorData = CopyReadbackHeap<float>(modelOutput.readback.Get());
TensorView<float> predTensor(tensorData, NchwExtents(modelOutput.desc.sizes));
// DirectML writes the final output data in NHWC, where the C channel contains the bounding box & probabilities
// for each prediction.
const uint32_t predTensorN = modelOutput.desc.sizes[0];
const uint32_t predTensorH = modelOutput.desc.sizes[1];
const uint32_t predTensorW = modelOutput.desc.sizes[2];
const uint32_t predTensorC = modelOutput.desc.sizes[3];
// YoloV4 predicts 3 boxes per scale, so we expect 3 separate predictions here
assert(predTensor.Sizes().n == 3);
// Channel should contain the bounding box x/y/w/h, a confidence score, followed by probabilities for each class
assert(predTensor.Sizes().c == 5 + YoloV4Constants::c_numClasses);
assert(predTensorN == 3);
for (uint32_t n = 0; n < predTensor.Sizes().n; ++n)
// Width should contain the bounding box x/y/w/h, a confidence score, the probability for max class, and the class index
assert(predTensorC == 7);
struct PotentialPrediction
{
for (uint32_t h = 0; h < predTensor.Sizes().h; ++h)
{
for (uint32_t w = 0; w < predTensor.Sizes().w; ++w)
{
float bx = predTensor(n, 0, h, w);
float by = predTensor(n, 1, h, w);
float bw = predTensor(n, 2, h, w);
float bh = predTensor(n, 3, h, w);
float confidence = predTensor(n, 4, h, w);
float bx;
float by;
float bw;
float bh;
float confidence;
float classMaxProbability;
uint32_t classIndex;
};
// Copy the probabilities for each class
std::vector<float> probabilities;
probabilities.reserve(YoloV4Constants::c_numClasses);
for (uint32_t i = 5; i < predTensor.Sizes().c; ++i)
// The output tensor should be large enough to hold the expected number of predictions.
assert(predTensorN * predTensorH * predTensorW * sizeof(PotentialPrediction) <= modelOutput.desc.totalTensorSizeInBytes);
std::vector<PotentialPrediction> tensorData = CopyReadbackHeap<PotentialPrediction>(modelOutput.readback.Get());
// Scale the boxes to be relative to the original image size
auto viewport = m_deviceResources->GetScreenViewport();
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
uint32_t currentPredIndex = 0;
for (uint32_t n = 0; n < predTensorN; ++n)
{
for (uint32_t h = 0; h < predTensorH; ++h)
{
for (uint32_t w = 0; w < predTensorW; ++w)
{
const PotentialPrediction& currentPred = tensorData[currentPredIndex++];
// Discard boxes with low scores
float score = currentPred.confidence * currentPred.classMaxProbability;
if (score < YoloV4Constants::c_scoreThreshold)
{
probabilities.push_back(predTensor(n, i, h, w));
continue;
}
// We need to do some postprocessing on the raw values before we return them
// Apply xyScale. Need to apply offsets of half a grid cell here, to ensure the scaling is
// centered around zero.
bx = xyScale * (bx - 0.5f) + 0.5f;
by = xyScale * (by - 0.5f) + 0.5f;
float bx = xyScale * (currentPred.bx - 0.5f) + 0.5f;
float by = xyScale * (currentPred.by - 0.5f) + 0.5f;
// Transform the x/y from being relative to the grid cell, to being relative to the whole image
bx = (bx + (float)w) * stride;
by = (by + (float)h) * stride;
// Scale the w/h by the supplied anchors
bw *= anchors[n * 2];
bh *= anchors[n * 2 + 1];
float bw = currentPred.bw * anchors[n * 2];
float bh = currentPred.bh * anchors[n * 2 + 1];
// Convert x,y,w,h to xmin,ymin,xmax,ymax
float xmin = bx - bw / 2;
@ -389,12 +409,6 @@ void Sample::GetModelPredictions(
float xmax = bx + bw / 2;
float ymax = by + bh / 2;
auto viewport = m_deviceResources->GetScreenViewport();
// Scale the boxes to be relative to the original image size
float xScale = (float)viewport.Width / YoloV4Constants::c_inputWidth;
float yScale = (float)viewport.Height / YoloV4Constants::c_inputHeight;
xmin *= xScale;
ymin *= yScale;
xmax *= xScale;
@ -412,22 +426,13 @@ void Sample::GetModelPredictions(
continue;
}
// Discard boxes with low scores
ptrdiff_t classIndex = std::max_element(probabilities.begin(), probabilities.end()) - probabilities.begin();
float probability = probabilities[classIndex];
float score = confidence * probability;
if (score < YoloV4Constants::c_scoreThreshold)
{
continue;
}
Prediction pred = {};
pred.xmin = xmin;
pred.ymin = ymin;
pred.xmax = xmax;
pred.ymax = ymax;
pred.score = score;
pred.predictedClass = static_cast<uint32_t>(classIndex);
pred.predictedClass = currentPred.classIndex;
out->push_back(pred);
}
}

View file

@ -276,7 +276,7 @@ private:
}
};
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, 5 + numClasses, H, W].
// Takes a tensor of size [1, 3 * (5 + numClasses), H, W] and returns a tensor of size [3, H, W, 7].
// Sigmoid activation is applied to all channels that represent probabilities (which are not all of them).
dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
{
@ -294,23 +294,38 @@ dml::Expression DecodeModelOutput(dml::Expression output, uint32_t numClasses)
// Since this doesn't transform the data any, this can be accomplished with a simple reinterpret.
output = dml::Reinterpret(output, { 3, numClasses + 5, outputSizes[2], outputSizes[3] }, dml::NullOpt);
// Split the new channel (of size 5+numClasses) into 4 different tensors with channels of 2, 2, 1+numClasses.
// These represent the box xy, box wh, confidence+probabilities for each class.
std::vector<dml::Expression> split = dml::Split(output, 1, { 2, 2, 1 + numClasses });
assert(split.size() == 3);
// Split the new channel (of size 5+numClasses) into 4 different tensors with channels of 2, 2, 1, numClasses.
// These represent the box xy, box wh, confidence, and probabilities for each class.
const uint32_t channelDim = 1;
std::vector<dml::Expression> split = dml::Split(output, channelDim, { 2, 2, 1, numClasses });
assert(split.size() == 4);
// Convenience
auto convXy = split[0];
auto convWh = split[1];
auto convConfProb = split[2];
auto convConf = split[2];
auto convProb = split[3];
// Apply final activations
convXy = dml::ActivationSigmoid(convXy);
convWh = dml::Exp(convWh);
convConfProb = dml::ActivationSigmoid(convConfProb);
convConf = dml::ActivationSigmoid(convConf);
convProb = dml::ActivationSigmoid(convProb);
const uint32_t joinAxis = 1; // Join along channel
return dml::Join({ convXy, convWh, convConfProb }, joinAxis);
// Compute the max and argmax of the probabilities. The argmax outputs UINT32 indices which
// are reinterpreted as float so they can be joined into the same output tensor.
auto convProbMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_MAX, { channelDim });
auto convProbArgMax = dml::Reduce(convProb, DML_REDUCE_FUNCTION_ARGMAX, { channelDim });
convProbArgMax = dml::Reinterpret(convProbArgMax, DML_TENSOR_DATA_TYPE_FLOAT32);
// Join the tensors along channel dimension.
auto joined = dml::Join({ convXy, convWh, convConf, convProbMax, convProbArgMax }, channelDim);
// Transpose from NCHW to NHWC for faster reading on the CPU (converts output from SoA to AoS).
dml::TensorDimensions sizesNchw = joined.GetOutputDesc().sizes;
dml::TensorDimensions sizesNhwc = { sizesNchw[0], sizesNchw[3], sizesNchw[2], sizesNchw[1] };
dml::TensorStrides stridesNhwc = { sizesNchw[1] * sizesNchw[2] * sizesNchw[3], sizesNchw[3], 1, sizesNchw[2] * sizesNchw[3] };
return dml::Identity(dml::Reinterpret(joined, sizesNhwc, stridesNhwc));
}
void Sample::CreateDirectMLResources()