diff --git a/Testing/WinMLRunnerTest/WinMLRunnerTest.cpp b/Testing/WinMLRunnerTest/WinMLRunnerTest.cpp
index ee40e892..fda0f037 100644
--- a/Testing/WinMLRunnerTest/WinMLRunnerTest.cpp
+++ b/Testing/WinMLRunnerTest/WinMLRunnerTest.cpp
@@ -749,6 +749,12 @@ namespace WinMLRunnerTest
             Assert::AreEqual(S_OK, RunProc((wchar_t*)command.c_str()));
         }
         
+        TEST_METHOD(TestTopK) // Smoke test: running SqueezeNet.onnx with "-TopK 5" must make RunProc return S_OK.
+        {
+            const std::wstring command = BuildCommand({ EXE_PATH, L"-model", L"SqueezeNet.onnx", L"-TopK", L"5" });
+            Assert::AreEqual(S_OK, RunProc((wchar_t*)command.c_str()));
+        }
+
         /* Commenting out test until WinMLRunnerDLL.dll is properly written and ABI friendly
         TEST_METHOD(TestWinMLRunnerDllLinking)
         {
diff --git a/Tools/WinMLRunner/README.md b/Tools/WinMLRunner/README.md
index 6a38138e..f8fb9ee1 100644
--- a/Tools/WinMLRunner/README.md
+++ b/Tools/WinMLRunner/README.md
@@ -45,6 +45,7 @@ Required command-Line arguments:
 -Perf [all]: capture performance measurements such as timing and memory usage. Specifying "all" will output all measurements
 -Iterations : # times perf measurements will be run/averaged
 -Input <fully qualified path>: binds image or CSV to model
+-TopK <number>: print the top <number> values in the result. Defaults to 1
 -PerfOutput [<fully qualified path>]: csv file to write the perf results to
 -SavePerIterationPerf : save per iteration performance results to csv file
 -SaveTensorData <saveMode folderPath>: saveMode: save first iteration or all iteration output tensor results to csv file [First, All]
diff --git a/Tools/WinMLRunner/src/BindingUtilities.h b/Tools/WinMLRunner/src/BindingUtilities.h
index 2cc7a8fa..327db4f9 100644
--- a/Tools/WinMLRunner/src/BindingUtilities.h
+++ b/Tools/WinMLRunner/src/BindingUtilities.h
@@ -13,35 +13,119 @@ using namespace winrt::Windows::Graphics::DirectX;
 using namespace winrt::Windows::Graphics::Imaging;
 using namespace winrt::Windows::Graphics::DirectX::Direct3D11;
 
-template <TensorKind T> struct TensorKindToType { static_assert(true, "No TensorKind mapped for given type!"); };
-template <> struct TensorKindToType<TensorKind::UInt8> { typedef uint8_t Type; };
-template <> struct TensorKindToType<TensorKind::Int8> { typedef uint8_t Type; };
-template <> struct TensorKindToType<TensorKind::UInt16> { typedef uint16_t Type; };
-template <> struct TensorKindToType<TensorKind::Int16> { typedef int16_t Type; };
-template <> struct TensorKindToType<TensorKind::UInt32> { typedef uint32_t Type; };
-template <> struct TensorKindToType<TensorKind::Int32> { typedef int32_t Type; };
-template <> struct TensorKindToType<TensorKind::UInt64> { typedef uint64_t Type; };
-template <> struct TensorKindToType<TensorKind::Int64> { typedef int64_t Type; };
-template <> struct TensorKindToType<TensorKind::Boolean> { typedef boolean Type; };
-template <> struct TensorKindToType<TensorKind::Double> { typedef double Type; };
-template <> struct TensorKindToType<TensorKind::Float> { typedef float Type; };
-template <> struct TensorKindToType<TensorKind::Float16> { typedef float Type; };
-template <> struct TensorKindToType<TensorKind::String> { typedef winrt::hstring Type; };
+template <TensorKind T> struct TensorKindToType // Primary template: selected only when no specialization exists for T.
+{
+    static_assert(T != T, "No TensorKind mapped for given type!"); // Value-dependent and always false: fires only if instantiated.
+};
+template <> struct TensorKindToType<TensorKind::UInt8> // Each specialization maps one TensorKind to a C++ type named Type.
+{
+    using Type = uint8_t;
+};
+template <> struct TensorKindToType<TensorKind::Int8>
+{
+    using Type = uint8_t; // NOTE(review): unsigned type for Int8, same as UInt8 - confirm this is intentional.
+};
+template <> struct TensorKindToType<TensorKind::UInt16>
+{
+    using Type = uint16_t;
+};
+template <> struct TensorKindToType<TensorKind::Int16>
+{
+    using Type = int16_t;
+};
+template <> struct TensorKindToType<TensorKind::UInt32>
+{
+    using Type = uint32_t;
+};
+template <> struct TensorKindToType<TensorKind::Int32>
+{
+    using Type = int32_t;
+};
+template <> struct TensorKindToType<TensorKind::UInt64>
+{
+    using Type = uint64_t;
+};
+template <> struct TensorKindToType<TensorKind::Int64>
+{
+    using Type = int64_t;
+};
+template <> struct TensorKindToType<TensorKind::Boolean>
+{
+    using Type = boolean;
+};
+template <> struct TensorKindToType<TensorKind::Double>
+{
+    using Type = double;
+};
+template <> struct TensorKindToType<TensorKind::Float>
+{
+    using Type = float;
+};
+template <> struct TensorKindToType<TensorKind::Float16>
+{
+    using Type = float; // NOTE(review): Float16 maps to float here rather than a half type - confirm against callers.
+};
+template <> struct TensorKindToType<TensorKind::String>
+{
+    using Type = winrt::hstring;
+};
 
-template <TensorKind T> struct TensorKindToValue { static_assert(true, "No TensorKind mapped for given type!"); };
-template <> struct TensorKindToValue<TensorKind::UInt8> { typedef TensorUInt8Bit Type; };
-template <> struct TensorKindToValue<TensorKind::Int8> { typedef TensorInt8Bit Type; };
-template <> struct TensorKindToValue<TensorKind::UInt16> { typedef TensorUInt16Bit Type; };
-template <> struct TensorKindToValue<TensorKind::Int16> { typedef TensorInt16Bit Type; };
-template <> struct TensorKindToValue<TensorKind::UInt32> { typedef TensorUInt32Bit Type; };
-template <> struct TensorKindToValue<TensorKind::Int32> { typedef TensorInt32Bit Type; };
-template <> struct TensorKindToValue<TensorKind::UInt64> { typedef TensorUInt64Bit Type; };
-template <> struct TensorKindToValue<TensorKind::Int64> { typedef TensorInt64Bit Type; };
-template <> struct TensorKindToValue<TensorKind::Boolean> { typedef TensorBoolean Type; };
-template <> struct TensorKindToValue<TensorKind::Double> { typedef TensorDouble Type; };
-template <> struct TensorKindToValue<TensorKind::Float> { typedef TensorFloat Type; };
-template <> struct TensorKindToValue<TensorKind::Float16> { typedef TensorFloat16Bit Type; };
-template <> struct TensorKindToValue<TensorKind::String> { typedef TensorString Type; };
+template <TensorKind T> struct TensorKindToValue // Primary template: selected only when no specialization exists for T.
+{
+    static_assert(T != T, "No TensorKind mapped for given type!"); // Value-dependent and always false: fires only if instantiated.
+};
+template <> struct TensorKindToValue<TensorKind::UInt8> // Each specialization maps one TensorKind to a Tensor* class named Type.
+{
+    using Type = TensorUInt8Bit;
+};
+template <> struct TensorKindToValue<TensorKind::Int8>
+{
+    using Type = TensorInt8Bit;
+};
+template <> struct TensorKindToValue<TensorKind::UInt16>
+{
+    using Type = TensorUInt16Bit;
+};
+template <> struct TensorKindToValue<TensorKind::Int16>
+{
+    using Type = TensorInt16Bit;
+};
+template <> struct TensorKindToValue<TensorKind::UInt32>
+{
+    using Type = TensorUInt32Bit;
+};
+template <> struct TensorKindToValue<TensorKind::Int32>
+{
+    using Type = TensorInt32Bit;
+};
+template <> struct TensorKindToValue<TensorKind::UInt64>
+{
+    using Type = TensorUInt64Bit;
+};
+template <> struct TensorKindToValue<TensorKind::Int64>
+{
+    using Type = TensorInt64Bit;
+};
+template <> struct TensorKindToValue<TensorKind::Boolean>
+{
+    using Type = TensorBoolean;
+};
+template <> struct TensorKindToValue<TensorKind::Double>
+{
+    using Type = TensorDouble;
+};
+template <> struct TensorKindToValue<TensorKind::Float>
+{
+    using Type = TensorFloat;
+};
+template <> struct TensorKindToValue<TensorKind::Float16>
+{
+    using Type = TensorFloat16Bit;
+};
+template <> struct TensorKindToValue<TensorKind::String>
+{
+    using Type = TensorString;
+};
 
 namespace BindingUtilities
 {
@@ -235,11 +319,12 @@ namespace BindingUtilities
             for (UINT dim = 0; dim < tensorDescriptorShape.Size(); dim++)
             {
                 INT64 dimSize = tensorDescriptorShape.GetAt(dim);
-                if (dimSize > 0) //If the dimension is greater than 0, then it is known.
+                if (dimSize > 0) // If the dimension is greater than 0, then it is known.
                 {
                     vecShape.push_back(dimSize);
                 }
-                else //otherwise, make sure that the dimension is -1, representing free dimension. If not, then it's an invalid model.
+                else // otherwise, make sure that the dimension is -1, representing free dimension. If not, then it's an
+                     // invalid model.
                 {
                     if (dimSize == -1)
                     {
@@ -247,7 +332,8 @@ namespace BindingUtilities
                     }
                     else
                     {
-                        throw hresult_invalid_argument(L"Failed to create a tensor with an unknown dimension of: " + dimSize);
+                        throw hresult_invalid_argument(L"Failed to create a tensor with an unknown dimension of: " +
+                                                       dimSize);
                     }
                 }
             }
@@ -258,7 +344,8 @@ namespace BindingUtilities
 
             BYTE* actualData;
             uint32_t actualSizeInBytes;
-            spTensorValueNative->GetBuffer(&actualData, &actualSizeInBytes); //Need to GetBuffer to have CPU memory backing tensorValue
+            spTensorValueNative->GetBuffer(
+                &actualData, &actualSizeInBytes); // Need to GetBuffer to have CPU memory backing tensorValue
             return tensorValue;
         }
         else
@@ -419,8 +506,8 @@ namespace BindingUtilities
                 com_ptr<ITensorNative> itn = results.Lookup(desc.Name()).as<ITensorNative>();
                 HRESULT(itn->GetBuffer(reinterpret_cast<BYTE**>(&tensor), &uCapacity));
                 int size = 0;
-                float maxValue = 0;
-                int maxIndex = 0;
+                unsigned int topK = args.TopK();
+                std::vector<std::pair<float, int>> maxKValues;
                 std::ofstream fout;
                 if (args.IsSaveTensor())
                 {
@@ -445,12 +532,12 @@ namespace BindingUtilities
                     break;
                     case TensorKind::Float16:
                     {
-                        output.ProcessTensorResult<HALF>(args, tensor, uCapacity, maxValue, maxIndex, fout);
+                        output.ProcessTensorResult<HALF>(args, tensor, uCapacity, maxKValues, fout, topK);
                     }
                     break;
                     case TensorKind::Float:
                     {
-                        output.ProcessTensorResult<float>(args, tensor, uCapacity, maxValue, maxIndex, fout);
+                        output.ProcessTensorResult<float>(args, tensor, uCapacity, maxKValues, fout, topK);
                     }
                     break;
                     case TensorKind::Int64:
@@ -472,16 +559,27 @@ namespace BindingUtilities
                 if (args.IsSaveTensor())
                 {
                     fout.close();
-                    std::string iterationResult =
-                        "Index: " + std::to_string(maxIndex) + "; Value: " + std::to_string(maxValue);
-                    output.SaveResult(iterationNum, iterationResult, static_cast<int>(hash_data(tensor, uCapacity)));
+                    for (auto& pair : maxKValues)
+                    {
+                        auto maxValue = pair.first;
+                        auto maxIndex = pair.second;
+                        std::string iterationResult =
+                            "Index: " + std::to_string(maxIndex) + "; Value: " + std::to_string(maxValue);
+                        output.SaveResult(iterationNum, iterationResult,
+                                          static_cast<int>(hash_data(tensor, uCapacity)));
+                    }
                 }
                 if (!args.IsGarbageInput() && iterationNum == 0)
                 {
-                    std::cout << "Outputting results.. " << std::endl;
+                    std::cout << "Outputting top " << args.TopK() << " values" << std::endl;
                     std::cout << "Feature Name: " << name << std::endl;
-                    std::wcout << " resultVector[" << maxIndex << "] has the maximal value of " << maxValue
-                               << std::endl;
+                    for (auto& pair : maxKValues)
+                    {
+                        auto maxValue = pair.first;
+                        auto maxIndex = pair.second;
+                        std::wcout << " index: " << maxIndex << ", value: " << maxValue
+                                   << std::endl;
+                    }
                 }
             }
             else if (desc.Kind() == LearningModelFeatureKind::Sequence)
diff --git a/Tools/WinMLRunner/src/CommandLineArgs.cpp b/Tools/WinMLRunner/src/CommandLineArgs.cpp
index 765c316f..9adf0567 100644
--- a/Tools/WinMLRunner/src/CommandLineArgs.cpp
+++ b/Tools/WinMLRunner/src/CommandLineArgs.cpp
@@ -31,6 +31,7 @@ void CommandLineArgs::PrintUsage()
               << std::endl;
     std::cout << "  -Iterations : # times perf measurements will be run/averaged" << std::endl;
     std::cout << "  -Input <fully qualified path>: binds image or CSV to model" << std::endl;
+    std::cout << "  -TopK <number>: print top <number> values in the result. Default to 1" << std::endl;
     std::cout << "  -PerfOutput [<fully qualified path>]: csv file to write the perf results to" << std::endl;
     std::cout << "  -SavePerIterationPerf : save per iteration performance results to csv file" << std::endl;
     std::cout << "  -SaveTensorData <saveMode folderPath>: saveMode: save first iteration or all iteration output "
@@ -278,6 +279,11 @@ CommandLineArgs::CommandLineArgs(const std::vector<std::wstring>& args)
             unsigned thread_interval = std::stoi(args[++i].c_str());
             SetThreadInterval(thread_interval);
         }
+        else if ((_wcsicmp(args[i].c_str(), L"-TopK") == 0))
+        {
+            CheckNextArgument(args, i);
+            SetTopK(std::stoi(args[++i].c_str()));
+        }
         else
         {
             std::wstring msg = L"Unknown option ";
diff --git a/Tools/WinMLRunner/src/CommandLineArgs.h b/Tools/WinMLRunner/src/CommandLineArgs.h
index c41396c0..18afd558 100644
--- a/Tools/WinMLRunner/src/CommandLineArgs.h
+++ b/Tools/WinMLRunner/src/CommandLineArgs.h
@@ -72,6 +72,7 @@ public:
     uint32_t NumIterations() const { return m_numIterations; }
     uint32_t NumThreads() const { return m_numThreads; }
     uint32_t ThreadInterval() const { return m_threadInterval; } // Thread interval in milliseconds
+    uint32_t TopK() const { return m_topK; } // Number of top output values to print (-TopK option)
 
     void ToggleCPU(bool useCPU) { m_useCPU = useCPU; }
     void ToggleGPU(bool useGPU) { m_useGPU = useGPU; }
@@ -96,6 +97,7 @@ public:
     void SetInputDataPath(const std::wstring& inputDataPath) { m_inputData = inputDataPath; }
     void SetNumThreads(unsigned numThreads) { m_numThreads = numThreads; }
     void SetThreadInterval(unsigned threadInterval) { m_threadInterval = threadInterval; }
+    void SetTopK(unsigned k) { m_topK = k; } // NOTE(review): the -TopK parser passes std::stoi's int; a negative input wraps to a large unsigned value
     void SetPerformanceCSVPath(const std::wstring& performanceCSVPath)
     {
         m_perfOutputPath = performanceCSVPath;
@@ -140,6 +142,7 @@ private:
     uint32_t m_numIterations = 1;
     uint32_t m_numThreads = 1;
     uint32_t m_threadInterval = 0;
+    uint32_t m_topK = 1; // Count of top values to print (-TopK); defaults to 1
 
     void CheckNextArgument(const std::vector<std::wstring>& args, UINT i);
     void CheckForInvalidArguments();
diff --git a/Tools/WinMLRunner/src/OutputHelper.h b/Tools/WinMLRunner/src/OutputHelper.h
index eb71b8eb..4c17a2e6 100644
--- a/Tools/WinMLRunner/src/OutputHelper.h
+++ b/Tools/WinMLRunner/src/OutputHelper.h
@@ -10,6 +10,7 @@
 #include <dxgi.h>
 #include <Windows.Graphics.DirectX.Direct3D11.interop.h>
 #include <direct.h>
+#include <queue>
 
 using namespace winrt::Windows::AI::MachineLearning;
 using namespace winrt::Windows::Storage::Streams;
@@ -699,20 +700,17 @@ public:
     }
 
     template <typename T>
-    void ProcessTensorResult(const CommandLineArgs& args, const void* buffer, const uint32_t uCapacity, float& maxValue,
-                             int& maxIndex, std::ofstream& fout)
+    void ProcessTensorResult(const CommandLineArgs& args, const void* buffer, const uint32_t uCapacity,
+                             std::vector<std::pair<float,int>>& maxValues, std::ofstream& fout,
+                             unsigned int k)
     {
+        // Create a priority queue of size k that pops the lowest value first
+        // We will remove lowest values as we iterate over all the values
+        auto cmp = [](std::pair<float, int> x, std::pair<float, int> y) { return x.first > y.first; };
+        std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>, decltype(cmp)> topKvalues(cmp);
+
         T* tensor = (T*)buffer;
         int size = uCapacity / sizeof(T);
-        if (!std::is_same<T, HALF>::value)
-        {
-            maxValue = *tensor;
-        }
-        else
-        {
-            maxValue = XMConvertHalfToFloat(static_cast<HALF>(*tensor));
-        }
-        maxIndex = 0;
         for (int i = 0; i < size; i++)
         {
             float val = 0;
@@ -728,12 +726,29 @@ public:
             {
                 fout << i << "," << val << std::endl;
             }
-            if (maxValue < val)
+
+            if (topKvalues.size() < k)
             {
-                maxValue = val;
-                maxIndex = i;
+                topKvalues.push({ val, i });
+            }
+            else if (k > 0)
+            {
+                auto maxValue = topKvalues.top().first;
+                if (maxValue < val)
+                {
+                    topKvalues.pop();
+                    topKvalues.push({ val, i });
+                }
             }
         }
+        while (!topKvalues.empty())
+        {
+            auto pair = topKvalues.top();
+            maxValues.push_back(pair);
+            topKvalues.pop();
+        }
+        // Put vector in order of highest value to lowest
+        std::reverse(maxValues.begin(), maxValues.end());
     }
 
     void WritePerformanceDataToCSV(const Profiler<WINML_MODEL_TEST_PERF>& profiler, int numIterations,