Vulkan: add GPU trace events

RendererVk now tries, as best as it can, to match the CPU and GPU timers on init as well as every finish(). A clock-sync event is generated for each such synchronization point. RendererVk::traceGpuEvent() is a new function that, given a command buffer, performs timestamp queries corresponding to GPU events. These queries are read back when done, without incurring GPU bubbles, at which point an event is generated with that timestamp. Bug: angleproject:2908 Change-Id: I08d7d11ff9f8ad6c9f9a9899767c9cd746d0623e Reviewed-on: https://chromium-review.googlesource.com/c/1296954 Commit-Queue: Shahbaz Youssefi <syoussefi@chromium.org> Reviewed-by: Yuly Novikov <ynovikov@chromium.org>
2018-10-22 11:56:02 -04:00 · 2018-10-22 11:56:02 -04:00 · 25224e786f
--- a/BUILD.gn
+++ b/BUILD.gn
@ -80,6 +80,12 @@ config("internal_config") {
  if (angle_force_thread_safety) {
    defines += [ "ANGLE_FORCE_THREAD_SAFETY=1" ]
  }
+
+  if (angle_enable_vulkan) {
+    if (angle_enable_vulkan_gpu_trace_events) {
+      defines += [ "ANGLE_ENABLE_VULKAN_GPU_TRACE_EVENTS=1" ]
+    }
+  }
 }

 config("extra_warnings") {
--- a/gni/angle.gni
+++ b/gni/angle.gni
@ -75,6 +75,11 @@ declare_args() {
  # Disable the layers in ubsan builds because of really slow builds.
  angle_enable_vulkan_validation_layers =
      angle_enable_vulkan && !is_ubsan && !is_tsan && !is_asan
+
+  if (angle_enable_vulkan) {
+    # Enable Vulkan GPU trace event capability
+    angle_enable_vulkan_gpu_trace_events = false
+  }
 }

 if (is_win) {
--- a/src/libANGLE/renderer/vulkan/CommandGraph.cpp
+++ b/src/libANGLE/renderer/vulkan/CommandGraph.cpp
@ -16,6 +16,8 @@
 #include "libANGLE/renderer/vulkan/vk_format_utils.h"
 #include "libANGLE/renderer/vulkan/vk_helpers.h"

+#include "third_party/trace_event/trace_event.h"
+
 namespace rx
 {
 namespace vk
@ -625,12 +627,15 @@ angle::Result CommandGraph::submitCommands(Context *context,
    std::vector<CommandGraphNode *> nodeStack;

    VkCommandBufferBeginInfo beginInfo = {};
-    beginInfo.sType            = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
-    beginInfo.flags            = 0;
-    beginInfo.pInheritanceInfo = nullptr;
+    beginInfo.sType                    = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    beginInfo.flags                    = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    beginInfo.pInheritanceInfo         = nullptr;

    ANGLE_TRY(primaryCommandBufferOut->begin(context, beginInfo));

+    ANGLE_TRY(context->getRenderer()->traceGpuEvent(
+        context, primaryCommandBufferOut, TRACE_EVENT_PHASE_BEGIN, "Primary Command Buffer"));
+
    for (CommandGraphNode *topLevelNode : mNodes)
    {
        // Only process commands that don't have child commands. The others will be pulled in
@ -664,6 +669,9 @@ angle::Result CommandGraph::submitCommands(Context *context,
        }
    }

+    ANGLE_TRY(context->getRenderer()->traceGpuEvent(
+        context, primaryCommandBufferOut, TRACE_EVENT_PHASE_END, "Primary Command Buffer"));
+
    ANGLE_TRY(primaryCommandBufferOut->end(context));

    // TODO(jmadill): Use pool allocation so we don't need to deallocate command graph.
--- a/src/libANGLE/renderer/vulkan/RendererVk.cpp
+++ b/src/libANGLE/renderer/vulkan/RendererVk.cpp
@ -306,7 +306,10 @@ RendererVk::RendererVk()
      mCurrentQueueSerial(mQueueSerialFactory.generate()),
      mDeviceLost(false),
      mPipelineCacheVkUpdateTimeout(kPipelineCacheVkUpdatePeriod),
-      mCommandGraph(kEnableCommandGraphDiagnostics)
+      mCommandGraph(kEnableCommandGraphDiagnostics),
+      mGpuEventsEnabled(false),
+      mGpuClockSync{std::numeric_limits<double>::max(), std::numeric_limits<double>::max()},
+      mGpuEventTimestampOrigin(0)
 {
 }

@ -330,6 +333,7 @@ void RendererVk::onDestroy(vk::Context *context)
    mPipelineCacheVk.destroy(mDevice);
    mSubmitSemaphorePool.destroy(mDevice);
    mShaderLibrary.destroy(mDevice);
+    mGpuEventQueryPool.destroy(mDevice);

    GlslangWrapper::Release();

@ -624,6 +628,25 @@ angle::Result RendererVk::initializeDevice(DisplayVk *displayVk, uint32_t queueF
    // Initialize the submission semaphore pool.
    ANGLE_TRY(mSubmitSemaphorePool.init(displayVk, vk::kDefaultSemaphorePoolSize));

+#if ANGLE_ENABLE_VULKAN_GPU_TRACE_EVENTS
+    angle::PlatformMethods *platform = ANGLEPlatformCurrent();
+    ASSERT(platform);
+
+    // GPU tracing workaround for anglebug.com/2927.  The renderer should not emit gpu events during
+    // platform discovery.
+    const unsigned char *gpuEventsEnabled =
+        platform->getTraceCategoryEnabledFlag(platform, "gpu.angle.gpu");
+    mGpuEventsEnabled = gpuEventsEnabled && *gpuEventsEnabled;
+#endif
+
+    if (mGpuEventsEnabled)
+    {
+        // Calculate the difference between CPU and GPU clocks for GPU event reporting.
+        ANGLE_TRY(mGpuEventQueryPool.init(displayVk, VK_QUERY_TYPE_TIMESTAMP,
+                                          vk::kDefaultTimestampQueryPoolSize));
+        ANGLE_TRY(synchronizeCpuGpuTime(displayVk));
+    }
+
    return angle::Result::Continue();
 }

@ -781,11 +804,13 @@ void RendererVk::ensureCapsInitialized() const

 void RendererVk::getSubmitWaitSemaphores(
    vk::Context *context,
-    angle::FixedVector<VkSemaphore, kMaxWaitSemaphores> *waitSemaphores)
+    angle::FixedVector<VkSemaphore, kMaxWaitSemaphores> *waitSemaphores,
+    angle::FixedVector<VkPipelineStageFlags, kMaxWaitSemaphores> *waitStageMasks)
 {
    if (mSubmitLastSignaledSemaphore.getSemaphore())
    {
        waitSemaphores->push_back(mSubmitLastSignaledSemaphore.getSemaphore()->getHandle());
+        waitStageMasks->push_back(VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);

        // Return the semaphore to the pool (which will remain valid and unused until the
        // queue it's about to be waited on has finished execution).
@ -795,6 +820,8 @@ void RendererVk::getSubmitWaitSemaphores(
    for (vk::SemaphoreHelper &semaphore : mSubmitWaitSemaphores)
    {
        waitSemaphores->push_back(semaphore.getSemaphore()->getHandle());
+        waitStageMasks->push_back(VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
+
        mSubmitSemaphorePool.freeSemaphore(context, &semaphore);
    }
    mSubmitWaitSemaphores.clear();
@ -845,18 +872,15 @@ angle::Result RendererVk::finish(vk::Context *context)
        vk::Scoped<vk::CommandBuffer> commandBatch(mDevice);
        ANGLE_TRY(flushCommandGraph(context, &commandBatch.get()));

-        VkPipelineStageFlags waitStageMask[2] = {
-            VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
-        };
-
        angle::FixedVector<VkSemaphore, kMaxWaitSemaphores> waitSemaphores;
-        getSubmitWaitSemaphores(context, &waitSemaphores);
+        angle::FixedVector<VkPipelineStageFlags, kMaxWaitSemaphores> waitStageMasks;
+        getSubmitWaitSemaphores(context, &waitSemaphores, &waitStageMasks);

        VkSubmitInfo submitInfo         = {};
        submitInfo.sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO;
        submitInfo.waitSemaphoreCount   = static_cast<uint32_t>(waitSemaphores.size());
        submitInfo.pWaitSemaphores      = waitSemaphores.data();
-        submitInfo.pWaitDstStageMask    = waitStageMask;
+        submitInfo.pWaitDstStageMask    = waitStageMasks.data();
        submitInfo.commandBufferCount   = 1;
        submitInfo.pCommandBuffers      = commandBatch.get().ptr();
        submitInfo.signalSemaphoreCount = 0;
@ -868,6 +892,20 @@ angle::Result RendererVk::finish(vk::Context *context)
    ASSERT(mQueue != VK_NULL_HANDLE);
    ANGLE_VK_TRY(context, vkQueueWaitIdle(mQueue));
    freeAllInFlightResources();
+
+    if (mGpuEventsEnabled)
+    {
+        // Recalculate the CPU/GPU time difference to account for clock drifting.  Note that
+        // currently, the perftest event handler does not correctly handle out of order gpu and sync
+        // events, so make sure all gpu events are completed.  This loop should in practice execute
+        // once since the queue is already idle.
+        while (mInFlightGpuEventQueries.size() > 0)
+        {
+            ANGLE_TRY(checkCompletedGpuEvents(context));
+        }
+        ANGLE_TRY(synchronizeCpuGpuTime(context));
+    }
+
    return angle::Result::Continue();
 }

@ -958,6 +996,11 @@ angle::Result RendererVk::submitFrame(vk::Context *context,

    ANGLE_TRY(checkCompletedCommands(context));

+    if (mGpuEventsEnabled)
+    {
+        ANGLE_TRY(checkCompletedGpuEvents(context));
+    }
+
    // Simply null out the command buffer here - it was allocated using the command pool.
    commandBuffer.releaseHandle();

@ -965,12 +1008,10 @@ angle::Result RendererVk::submitFrame(vk::Context *context,
    // TODO(jmadill): Consider reusing command pools.
    VkCommandPoolCreateInfo poolInfo = {};
    poolInfo.sType            = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
-    poolInfo.flags            = 0;
+    poolInfo.flags                   = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
    poolInfo.queueFamilyIndex = mCurrentQueueFamilyIndex;

-    ANGLE_TRY(mCommandPool.init(context, poolInfo));
-
-    return angle::Result::Continue();
+    return mCommandPool.init(context, poolInfo);
 }

 bool RendererVk::isSerialInUse(Serial serial) const
@ -1052,12 +1093,9 @@ angle::Result RendererVk::flush(vk::Context *context)
    vk::Scoped<vk::CommandBuffer> commandBatch(mDevice);
    ANGLE_TRY(flushCommandGraph(context, &commandBatch.get()));

-    VkPipelineStageFlags waitStageMask[2] = {
-        VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
-    };
-
    angle::FixedVector<VkSemaphore, kMaxWaitSemaphores> waitSemaphores;
-    getSubmitWaitSemaphores(context, &waitSemaphores);
+    angle::FixedVector<VkPipelineStageFlags, kMaxWaitSemaphores> waitStageMasks;
+    getSubmitWaitSemaphores(context, &waitSemaphores, &waitStageMasks);

    // On every flush, create a semaphore to be signaled.  On the next submission, this semaphore
    // will be waited on.
@ -1067,7 +1105,7 @@ angle::Result RendererVk::flush(vk::Context *context)
    submitInfo.sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submitInfo.waitSemaphoreCount   = static_cast<uint32_t>(waitSemaphores.size());
    submitInfo.pWaitSemaphores      = waitSemaphores.data();
-    submitInfo.pWaitDstStageMask    = waitStageMask;
+    submitInfo.pWaitDstStageMask    = waitStageMasks.data();
    submitInfo.commandBufferCount   = 1;
    submitInfo.pCommandBuffers      = commandBatch.get().ptr();
    submitInfo.signalSemaphoreCount = 1;
@ -1198,6 +1236,346 @@ vk::ShaderLibrary *RendererVk::getShaderLibrary()
    return &mShaderLibrary;
 }

+angle::Result RendererVk::synchronizeCpuGpuTime(vk::Context *context)
+{
+    ASSERT(mGpuEventsEnabled);
+
+    angle::PlatformMethods *platform = ANGLEPlatformCurrent();
+    ASSERT(platform);
+
+    // To synchronize CPU and GPU times, we need to get the CPU timestamp as close as possible to
+    // the GPU timestamp.  The process of getting the GPU timestamp is as follows:
+    //
+    //             CPU                            GPU
+    //
+    //     Record command buffer
+    //     with timestamp query
+    //
+    //     Submit command buffer
+    //
+    //     Post-submission work             Begin execution
+    //
+    //            ????                    Write timestamp Tgpu
+    //
+    //            ????                       End execution
+    //
+    //            ????                    Return query results
+    //
+    //            ????
+    //
+    //       Get query results
+    //
+    // The areas of unknown work (????) on the CPU indicate that the CPU may or may not have
+    // finished post-submission work while the GPU is executing in parallel. With no further work,
+    // querying CPU timestamps before submission and after getting query results give the bounds to
+    // Tgpu, which could be quite large.
+    //
+    // Using VkEvents, the GPU can be made to wait for the CPU and vice versa, in an effort to
+    // reduce this range. This function implements the following procedure:
+    //
+    //             CPU                            GPU
+    //
+    //     Record command buffer
+    //     with timestamp query
+    //
+    //     Submit command buffer
+    //
+    //     Post-submission work             Begin execution
+    //
+    //            ????                    Set Event GPUReady
+    //
+    //    Wait on Event GPUReady         Wait on Event CPUReady
+    //
+    //       Get CPU Time Ts             Wait on Event CPUReady
+    //
+    //      Set Event CPUReady           Wait on Event CPUReady
+    //
+    //      Get CPU Time Tcpu              Get GPU Time Tgpu
+    //
+    //    Wait on Event GPUDone            Set Event GPUDone
+    //
+    //       Get CPU Time Te                 End Execution
+    //
+    //            Idle                    Return query results
+    //
+    //      Get query results
+    //
+    // If Te-Ts > epsilon, a GPU or CPU interruption can be assumed and the operation can be
+    // retried.  Once Te-Ts < epsilon, Tcpu can be taken to presumably match Tgpu.  Finding an
+    // epsilon that's valid for all devices may be difficult, so the loop can be performed only a
+    // limited number of times and the Tcpu,Tgpu pair corresponding to smallest Te-Ts used for
+    // calibration.
+    //
+    // Note: Once VK_EXT_calibrated_timestamps is ubiquitous, this should be redone.
+
+    // Make sure nothing is running
+    ASSERT(mCommandGraph.empty());
+
+    TRACE_EVENT0("gpu.angle", "RendererVk::synchronizeCpuGpuTime");
+
+    // Create a query used to receive the GPU timestamp
+    vk::QueryHelper timestampQuery;
+    ANGLE_TRY(mGpuEventQueryPool.allocateQuery(context, &timestampQuery));
+
+    // Create the three events
+    VkEventCreateInfo eventCreateInfo = {};
+    eventCreateInfo.sType             = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO;
+    eventCreateInfo.flags             = 0;
+
+    vk::Scoped<vk::Event> cpuReady(mDevice), gpuReady(mDevice), gpuDone(mDevice);
+    ANGLE_TRY(cpuReady.get().init(context, eventCreateInfo));
+    ANGLE_TRY(gpuReady.get().init(context, eventCreateInfo));
+    ANGLE_TRY(gpuDone.get().init(context, eventCreateInfo));
+
+    constexpr uint32_t kRetries = 10;  // Number of calibration attempts; the tightest one is kept.
+
+    // Time suffixes used are S for seconds and Cycles for cycles
+    double tightestRangeS = 1e6f;  // Initial sentinel; replaced by any smaller measured Te-Ts.
+    double TcpuS          = 0;
+    uint64_t TgpuCycles   = 0;
+    for (uint32_t i = 0; i < kRetries; ++i)
+    {
+        // Reset the events
+        ANGLE_TRY(cpuReady.get().reset(context));
+        ANGLE_TRY(gpuReady.get().reset(context));
+        ANGLE_TRY(gpuDone.get().reset(context));
+
+        // Record the command buffer
+        vk::Scoped<vk::CommandBuffer> commandBatch(mDevice);
+        vk::CommandBuffer &commandBuffer = commandBatch.get();
+
+        VkCommandBufferAllocateInfo commandBufferInfo = {};
+        commandBufferInfo.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+        commandBufferInfo.commandPool        = mCommandPool.getHandle();
+        commandBufferInfo.level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+        commandBufferInfo.commandBufferCount = 1;
+
+        ANGLE_TRY(commandBuffer.init(context, commandBufferInfo));
+
+        VkCommandBufferBeginInfo beginInfo = {};
+        beginInfo.sType                    = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+        beginInfo.flags                    = 0;
+        beginInfo.pInheritanceInfo         = nullptr;
+
+        ANGLE_TRY(commandBuffer.begin(context, beginInfo));
+
+        commandBuffer.setEvent(gpuReady.get(), VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT);
+        commandBuffer.waitEvents(1, cpuReady.get().ptr(), VK_PIPELINE_STAGE_HOST_BIT,
+                                 VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, 0, nullptr, 0, nullptr, 0,
+                                 nullptr);
+
+        commandBuffer.resetQueryPool(timestampQuery.getQueryPool()->getHandle(),
+                                     timestampQuery.getQuery(), 1);
+        commandBuffer.writeTimestamp(VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+                                     timestampQuery.getQueryPool()->getHandle(),
+                                     timestampQuery.getQuery());
+
+        commandBuffer.setEvent(gpuDone.get(), VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT);
+
+        ANGLE_TRY(commandBuffer.end(context));
+
+        // Submit the command buffer
+        angle::FixedVector<VkSemaphore, kMaxWaitSemaphores> waitSemaphores;
+        angle::FixedVector<VkPipelineStageFlags, kMaxWaitSemaphores> waitStageMasks;
+        getSubmitWaitSemaphores(context, &waitSemaphores, &waitStageMasks);
+
+        VkSubmitInfo submitInfo         = {};
+        submitInfo.sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+        submitInfo.waitSemaphoreCount   = static_cast<uint32_t>(waitSemaphores.size());
+        submitInfo.pWaitSemaphores      = waitSemaphores.data();
+        submitInfo.pWaitDstStageMask    = waitStageMasks.data();
+        submitInfo.commandBufferCount   = 1;
+        submitInfo.pCommandBuffers      = commandBuffer.ptr();
+        submitInfo.signalSemaphoreCount = 0;
+        submitInfo.pSignalSemaphores    = nullptr;
+
+        ANGLE_TRY(submitFrame(context, submitInfo, std::move(commandBuffer)));
+
+        // Wait for GPU to be ready.  This is a short busy wait.
+        angle::Result result = angle::Result::Incomplete();
+        do
+        {
+            result = gpuReady.get().getStatus(context);
+            ANGLE_TRY(result);
+        } while (result == angle::Result::Incomplete());
+
+        double TsS = platform->monotonicallyIncreasingTime(platform);
+
+        // Tell the GPU to go ahead with the timestamp query.
+        ANGLE_TRY(cpuReady.get().set(context));
+        double cpuTimestampS = platform->monotonicallyIncreasingTime(platform);
+
+        // Wait for GPU to be done.  Another short busy wait.
+        do
+        {
+            result = gpuDone.get().getStatus(context);
+            ANGLE_TRY(result);
+        } while (result == angle::Result::Incomplete());
+
+        double TeS = platform->monotonicallyIncreasingTime(platform);
+
+        // Get the query results
+        ANGLE_TRY(finishToSerial(context, getLastSubmittedQueueSerial()));
+
+        constexpr VkQueryResultFlags queryFlags = VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT;
+
+        uint64_t gpuTimestampCycles = 0;
+        ANGLE_TRY(timestampQuery.getQueryPool()->getResults(
+            context, timestampQuery.getQuery(), 1, sizeof(gpuTimestampCycles), &gpuTimestampCycles,
+            sizeof(gpuTimestampCycles), queryFlags));
+
+        // Use the first timestamp queried as origin.
+        if (mGpuEventTimestampOrigin == 0)
+        {
+            mGpuEventTimestampOrigin = gpuTimestampCycles;
+        }
+
+        // Take these CPU and GPU timestamps if there is better confidence.
+        double confidenceRangeS = TeS - TsS;
+        if (confidenceRangeS < tightestRangeS)
+        {
+            tightestRangeS = confidenceRangeS;
+            TcpuS          = cpuTimestampS;
+            TgpuCycles     = gpuTimestampCycles;
+        }
+    }
+
+    mGpuEventQueryPool.freeQuery(context, &timestampQuery);  // NOTE(review): presumably skipped if an ANGLE_TRY above returns early - confirm.
+
+    // timestampPeriod gives nanoseconds/cycle.
+    double TgpuS = (TgpuCycles - mGpuEventTimestampOrigin) *
+                   static_cast<double>(mPhysicalDeviceProperties.limits.timestampPeriod) /
+                   1'000'000'000.0;
+
+    flushGpuEvents(TgpuS, TcpuS);  // Emit queued events using the previous sync and this new one.
+
+    mGpuClockSync.gpuTimestampS = TgpuS;
+    mGpuClockSync.cpuTimestampS = TcpuS;
+
+    return angle::Result::Continue();
+}
+
+angle::Result RendererVk::traceGpuEventImpl(vk::Context *context,
+                                            vk::CommandBuffer *commandBuffer,
+                                            char phase,
+                                            const char *name)
+{
+    ASSERT(mGpuEventsEnabled);  // traceGpuEvent() only forwards here when the flag is set.
+
+    GpuEventQuery event;
+
+    event.name   = name;  // Only the pointer is stored; the string itself is not copied.
+    event.phase  = phase;
+    event.serial = mCurrentQueueSerial;  // Compared against mLastCompletedQueueSerial when polling.
+
+    ANGLE_TRY(mGpuEventQueryPool.allocateQuery(context, &event.queryPoolIndex, &event.queryIndex));
+
+    commandBuffer->resetQueryPool(  // The query is reset in the same command buffer, right before it is written.
+        mGpuEventQueryPool.getQueryPool(event.queryPoolIndex)->getHandle(), event.queryIndex, 1);
+    commandBuffer->writeTimestamp(
+        VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+        mGpuEventQueryPool.getQueryPool(event.queryPoolIndex)->getHandle(), event.queryIndex);
+
+    mInFlightGpuEventQueries.push_back(std::move(event));  // Read back later by checkCompletedGpuEvents().
+
+    return angle::Result::Continue();
+}
+
+angle::Result RendererVk::checkCompletedGpuEvents(vk::Context *context)
+{
+    ASSERT(mGpuEventsEnabled);
+
+    angle::PlatformMethods *platform = ANGLEPlatformCurrent();
+    ASSERT(platform);  // NOTE(review): platform is only used by this ASSERT in this function.
+
+    int finishedCount = 0;  // Number of leading in-flight queries converted to GpuEvents.
+
+    for (GpuEventQuery &eventQuery : mInFlightGpuEventQueries)
+    {
+        // Only check the timestamp query if the submission has finished.
+        if (eventQuery.serial > mLastCompletedQueueSerial)
+        {
+            break;  // Later entries are not examined once one is found pending.
+        }
+
+        // See if the results are available.  VK_QUERY_RESULT_WAIT_BIT is not passed here.
+        uint64_t gpuTimestampCycles = 0;
+        angle::Result result        = mGpuEventQueryPool.getQueryPool(eventQuery.queryPoolIndex)
+                                   ->getResults(context, eventQuery.queryIndex, 1,
+                                                sizeof(gpuTimestampCycles), &gpuTimestampCycles,
+                                                sizeof(gpuTimestampCycles), VK_QUERY_RESULT_64_BIT);
+        ANGLE_TRY(result);
+
+        if (result == angle::Result::Incomplete())
+        {
+            break;  // Result not ready; this query and the ones after it stay in flight.
+        }
+
+        mGpuEventQueryPool.freeQuery(context, eventQuery.queryPoolIndex, eventQuery.queryIndex);
+
+        GpuEvent event;
+        event.gpuTimestampCycles = gpuTimestampCycles;
+        event.name               = eventQuery.name;
+        event.phase              = eventQuery.phase;
+
+        mGpuEvents.emplace_back(event);  // Held until flushGpuEvents() emits it.
+
+        ++finishedCount;
+    }
+
+    mInFlightGpuEventQueries.erase(mInFlightGpuEventQueries.begin(),  // Drop only the finished prefix.
+                                   mInFlightGpuEventQueries.begin() + finishedCount);
+
+    return angle::Result::Continue();
+}
+
+void RendererVk::flushGpuEvents(double nextSyncGpuTimestampS, double nextSyncCpuTimestampS)
+{
+    if (mGpuEvents.size() == 0)
+    {
+        return;
+    }
+
+    angle::PlatformMethods *platform = ANGLEPlatformCurrent();
+    ASSERT(platform);
+
+    // Find the slope of the clock drift between the previous sync point and the next one
+    double lastGpuSyncTimeS  = mGpuClockSync.gpuTimestampS;
+    double lastGpuSyncDiffS  = mGpuClockSync.cpuTimestampS - mGpuClockSync.gpuTimestampS;
+    double gpuSyncDriftSlope = 0;
+
+    double nextGpuSyncTimeS = nextSyncGpuTimestampS;
+    double nextGpuSyncDiffS = nextSyncCpuTimestampS - nextSyncGpuTimestampS;
+
+    // No gpu trace events should have been generated before the clock sync, so if there is no
+    // "previous" clock sync, there should be no gpu events (i.e. the function early-outs above).
+    ASSERT(mGpuClockSync.gpuTimestampS != std::numeric_limits<double>::max() &&
+           mGpuClockSync.cpuTimestampS != std::numeric_limits<double>::max());
+
+    gpuSyncDriftSlope =  // NOTE(review): denominator is zero if both syncs share a GPU timestamp - confirm this cannot happen.
+        (nextGpuSyncDiffS - lastGpuSyncDiffS) / (nextGpuSyncTimeS - lastGpuSyncTimeS);
+
+    for (const GpuEvent &event : mGpuEvents)
+    {
+        double gpuTimestampS =
+            (event.gpuTimestampCycles - mGpuEventTimestampOrigin) *
+            static_cast<double>(mPhysicalDeviceProperties.limits.timestampPeriod) * 1e-9;
+
+        // Account for clock drift: add the CPU-GPU offset at the last sync plus the drift since.
+        gpuTimestampS += lastGpuSyncDiffS + gpuSyncDriftSlope * (gpuTimestampS - lastGpuSyncTimeS);
+
+        // Generate the trace now that the GPU timestamp is available and clock drifts are accounted
+        // for.
+        static long long eventId = 1;  // Function-local static: shared by all calls, bumped per event.
+        static const unsigned char *categoryEnabled =
+            TRACE_EVENT_API_GET_CATEGORY_ENABLED("gpu.angle.gpu");
+        platform->addTraceEvent(platform, event.phase, categoryEnabled, event.name, eventId++,
+                                gpuTimestampS, 0, nullptr, nullptr, nullptr, TRACE_EVENT_FLAG_NONE);
+    }
+
+    mGpuEvents.clear();
+}
+
 uint32_t GetUniformBufferDescriptorCount()
 {
    return kUniformBufferDescriptorsPerDescriptorSet;
--- a/src/libANGLE/renderer/vulkan/RendererVk.h
+++ b/src/libANGLE/renderer/vulkan/RendererVk.h
@ -18,6 +18,7 @@
 #include "libANGLE/Caps.h"
 #include "libANGLE/renderer/vulkan/CommandGraph.h"
 #include "libANGLE/renderer/vulkan/FeaturesVk.h"
+#include "libANGLE/renderer/vulkan/QueryVk.h"
 #include "libANGLE/renderer/vulkan/vk_format_utils.h"
 #include "libANGLE/renderer/vulkan/vk_helpers.h"
 #include "libANGLE/renderer/vulkan/vk_internal_shaders.h"
@ -172,6 +173,19 @@ class RendererVk : angle::NonCopyable
    vk::ShaderLibrary *getShaderLibrary();
    const FeaturesVk &getFeatures() const { return mFeatures; }

+    // Create Begin/End/Instant GPU trace events, which take their timestamps from GPU queries.
+    // The events are queued until the query results are available.  Possible values for `phase`
+    // are TRACE_EVENT_PHASE_*.  The timestamp query is recorded into `commandBuffer`.
+    ANGLE_INLINE angle::Result traceGpuEvent(vk::Context *context,
+                                             vk::CommandBuffer *commandBuffer,
+                                             char phase,
+                                             const char *name)
+    {
+        if (mGpuEventsEnabled)
+            return traceGpuEventImpl(context, commandBuffer, phase, name);
+        return angle::Result::Continue();  // GPU events disabled: nothing is recorded.
+    }
+
  private:
    // Number of semaphores for external entities to renderer to issue a wait, such as surface's
    // image acquire.
@ -184,7 +198,8 @@ class RendererVk : angle::NonCopyable
    void ensureCapsInitialized() const;
    void getSubmitWaitSemaphores(
        vk::Context *context,
-        angle::FixedVector<VkSemaphore, kMaxWaitSemaphores> *waitSemaphores);
+        angle::FixedVector<VkSemaphore, kMaxWaitSemaphores> *waitSemaphores,
+        angle::FixedVector<VkPipelineStageFlags, kMaxWaitSemaphores> *waitStageMasks);
    angle::Result submitFrame(vk::Context *context,
                              const VkSubmitInfo &submitInfo,
                              vk::CommandBuffer &&commandBuffer);
@ -194,6 +209,14 @@ class RendererVk : angle::NonCopyable
    void initPipelineCacheVkKey();
    angle::Result initPipelineCacheVk(DisplayVk *display);

+    angle::Result synchronizeCpuGpuTime(vk::Context *context);
+    angle::Result traceGpuEventImpl(vk::Context *context,
+                                    vk::CommandBuffer *commandBuffer,
+                                    char phase,
+                                    const char *name);
+    angle::Result checkCompletedGpuEvents(vk::Context *context);
+    void flushGpuEvents(double nextSyncGpuTimestampS, double nextSyncCpuTimestampS);
+
    mutable bool mCapsInitialized;
    mutable gl::Caps mNativeCaps;
    mutable gl::TextureCapsMap mNativeTextureCaps;
@ -277,6 +300,58 @@ class RendererVk : angle::NonCopyable

    // Internal shader library.
    vk::ShaderLibrary mShaderLibrary;
+
+    // The GpuEventQuery struct holds together a timestamp query and enough data to create a
+    // trace event based on that. Use traceGpuEvent to insert such queries.  They will be read back
+    // when the results are available, without inserting a GPU bubble.
+    //
+    // - name will be the reported name of the event
+    // - phase is either 'B' (duration begin), 'E' (duration end) or 'i' (instant event).
+    //   See Google's "Trace Event Format":
+    //   https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU
+    // - serial is the serial of the batch the query was submitted on.  Until that serial has
+    //   completed, the query is not checked, to avoid incurring a flush.
+    struct GpuEventQuery final
+    {
+        const char *name;
+        char phase;
+
+        uint32_t queryIndex;    // Index of the query within its pool.
+        size_t queryPoolIndex;  // Index of that pool within mGpuEventQueryPool.
+
+        Serial serial;
+    };
+
+    // Once a query result is available, the timestamp is read and a GpuEvent object is kept until
+    // the next clock sync, at which point the clock drift is compensated in the results before
+    // handing them off to the application.
+    struct GpuEvent final
+    {
+        uint64_t gpuTimestampCycles;  // Raw query result in GPU cycles, not yet relative to the origin.
+        const char *name;
+        char phase;
+    };
+
+    bool mGpuEventsEnabled;
+    vk::DynamicQueryPool mGpuEventQueryPool;
+    // A list of queries that have yet to be turned into an event (their result is not yet
+    // available).
+    std::vector<GpuEventQuery> mInFlightGpuEventQueries;
+    // A list of gpu events since the last clock sync.
+    std::vector<GpuEvent> mGpuEvents;
+
+    // Hold information from the last gpu clock sync for future gpu-to-cpu timestamp conversions.
+    struct GpuClockSyncInfo
+    {
+        double gpuTimestampS;  // GPU time of the sync in seconds, relative to mGpuEventTimestampOrigin.
+        double cpuTimestampS;  // CPU time of the same sync in seconds, from monotonicallyIncreasingTime.
+    };
+    GpuClockSyncInfo mGpuClockSync;
+
+    // The very first timestamp queried for a GPU event is used as origin, so event timestamps would
+    // have a value close to zero, to avoid losing 12 bits when converting these 64 bit values to
+    // double.
+    uint64_t mGpuEventTimestampOrigin;
 };

 uint32_t GetUniformBufferDescriptorCount();
--- a/src/libANGLE/renderer/vulkan/vk_helpers.cpp
+++ b/src/libANGLE/renderer/vulkan/vk_helpers.cpp
@ -518,13 +518,11 @@ angle::Result DynamicQueryPool::allocateQuery(Context *context, QueryHelper *que
 {
    ASSERT(!queryOut->getQueryPool());

-    if (mCurrentFreeEntry >= mPoolSize)
-    {
-        // No more queries left in this pool, create another one.
-        ANGLE_TRY(allocateNewPool(context));
-    }
+    size_t poolIndex    = 0;
+    uint32_t queryIndex = 0;
+    ANGLE_TRY(allocateQuery(context, &poolIndex, &queryIndex));

-    queryOut->init(this, mCurrentPool, mCurrentFreeEntry++);
+    queryOut->init(this, poolIndex, queryIndex);

    return angle::Result::Continue();
 }
@ -536,11 +534,34 @@ void DynamicQueryPool::freeQuery(Context *context, QueryHelper *query)
        size_t poolIndex = query->getQueryPoolIndex();
        ASSERT(query->getQueryPool()->valid());

-        onEntryFreed(context, poolIndex);
+        freeQuery(context, poolIndex, query->getQuery());
+
        query->deinit();
    }
 }

+angle::Result DynamicQueryPool::allocateQuery(Context *context,
+                                              size_t *poolIndex,
+                                              uint32_t *queryIndex)
+{
+    if (mCurrentFreeEntry >= mPoolSize)
+    {
+        // No more queries left in this pool, create another one.
+        ANGLE_TRY(allocateNewPool(context));
+    }
+
+    *poolIndex  = mCurrentPool;         // Pool the returned query belongs to.
+    *queryIndex = mCurrentFreeEntry++;  // Entries are handed out sequentially within that pool.
+
+    return angle::Result::Continue();
+}
+
+void DynamicQueryPool::freeQuery(Context *context, size_t poolIndex, uint32_t queryIndex)
+{
+    ANGLE_UNUSED_VARIABLE(queryIndex);  // Only the pool index is passed on to onEntryFreed().
+    onEntryFreed(context, poolIndex);
+}
+
 angle::Result DynamicQueryPool::allocateNewPool(Context *context)
 {
    if (findFreeEntryPool(context))
--- a/src/libANGLE/renderer/vulkan/vk_helpers.h
+++ b/src/libANGLE/renderer/vulkan/vk_helpers.h
@ -229,6 +229,11 @@ class DynamicQueryPool final : public DynamicallyGrowingPool<QueryPool>
    angle::Result allocateQuery(Context *context, QueryHelper *queryOut);
    void freeQuery(Context *context, QueryHelper *query);

+    // Special allocator that doesn't work with QueryHelper, which is a CommandGraphResource.
+    // Currently only used with RendererVk::GpuEventQuery.
+    angle::Result allocateQuery(Context *context, size_t *poolIndex, uint32_t *queryIndex);
+    void freeQuery(Context *context, size_t poolIndex, uint32_t queryIndex);
+
    const QueryPool *getQueryPool(size_t index) const { return &mPools[index]; }

  private:
--- a/src/tests/angle_perftests_main.cpp
+++ b/src/tests/angle_perftests_main.cpp
@ -27,8 +27,7 @@ int main(int argc, char **argv)
        }
        if (strcmp("--trace-file", argv[i]) == 0 && i < argc - 1)
        {
-            gTraceFile = argv[i + 1];
-            argc++;
+            gTraceFile = argv[++i];
        }
    }

--- a/src/tests/perf_tests/ANGLEPerfTest.cpp
+++ b/src/tests/perf_tests/ANGLEPerfTest.cpp
@ -9,6 +9,7 @@

 #include "ANGLEPerfTest.h"
 #include "third_party/perf/perf_test.h"
+#include "third_party/trace_event/trace_event.h"

 #include <cassert>
 #include <cmath>
@ -21,6 +22,17 @@ namespace
 {
 constexpr size_t kInitialTraceEventBufferSize = 50000;

+// A trace category name paired with its enabled flag.  GetTraceCategoryEnabledFlag() hands out the
+// address of |enabled| and AddTraceEvent() casts that address back to a TraceCategory, so
+// |enabled| must remain the first member (AddTraceEvent() static_asserts this).
+struct TraceCategory
+{
+    unsigned char enabled;
+    const char *name;
+};
+
+// The categories known to this harness; both have their |enabled| flag set to 1.  Any category
+// name not listed here gets a zero flag from GetTraceCategoryEnabledFlag().
+constexpr TraceCategory gTraceCategories[2] = {
+    {1, "gpu.angle"},
+    {1, "gpu.angle.gpu"},
+};
+
 void EmptyPlatformMethod(angle::PlatformMethods *, const char *)
 {
+    // No-op: both arguments are ignored.
 }
@ -43,17 +55,33 @@ angle::TraceEventHandle AddTraceEvent(angle::PlatformMethods *platform,
                                      const unsigned long long *argValues,
                                      unsigned char flags)
 {
+    // Discover the category name based on categoryEnabledFlag.  This flag comes from the first
+    // field of TraceCategory, and corresponds to one of the entries in gTraceCategories.
+    static_assert(offsetof(TraceCategory, enabled) == 0,
+                  "|enabled| must be the first field of the TraceCategory class.");
+    const TraceCategory *category = reinterpret_cast<const TraceCategory *>(categoryEnabledFlag);
+    ptrdiff_t categoryIndex       = category - gTraceCategories;
+    ASSERT(categoryIndex >= 0 && static_cast<size_t>(categoryIndex) < ArraySize(gTraceCategories));
+
    ANGLERenderTest *renderTest     = static_cast<ANGLERenderTest *>(platform->context);
    std::vector<TraceEvent> &buffer = renderTest->getTraceEventBuffer();
-    buffer.emplace_back(phase, name, timestamp);
+    buffer.emplace_back(phase, category->name, name, timestamp);
    return buffer.size();
 }

 const unsigned char *GetTraceCategoryEnabledFlag(angle::PlatformMethods *platform,
                                                  const char *categoryName)
 {
-    constexpr static unsigned char kNonZero = 1;
-    return &kNonZero;
+    // Return the address of the matching entry's |enabled| field.  AddTraceEvent() later casts
+    // this pointer back to the TraceCategory it belongs to in order to recover the name.
+    for (const TraceCategory &category : gTraceCategories)
+    {
+        if (strcmp(category.name, categoryName) == 0)
+        {
+            return &category.enabled;
+        }
+    }
+
+    // Not one of the known categories: hand back a flag whose value is 0.
+    constexpr static unsigned char kZero = 0;
+    return &kZero;
 }

 void UpdateTraceEventDuration(angle::PlatformMethods *platform,
@ -67,7 +95,10 @@ void UpdateTraceEventDuration(angle::PlatformMethods *platform,
 double MonotonicallyIncreasingTime(angle::PlatformMethods *platform)
 {
     ANGLERenderTest *renderTest = static_cast<ANGLERenderTest *>(platform->context);
-    return renderTest->getTimer()->getElapsedTime();
+    // Move the time origin to the first call to this function, to avoid generating unnecessarily
+    // large timestamps.
+    // NOTE: |origin| is a function-local static, so it is initialized only once, from the timer of
+    // whichever test makes the first call, and is then reused for every later call.
+    static double origin = renderTest->getTimer()->getAbsoluteTime();
+    return renderTest->getTimer()->getAbsoluteTime() - origin;
 }

 void DumpTraceEventsToJSONFile(const std::vector<TraceEvent> &traceEvents,
@ -86,11 +117,11 @@ void DumpTraceEventsToJSONFile(const std::vector<TraceEvent> &traceEvents,
            static_cast<unsigned long long>(traceEvent.timestamp * 1000.0 * 1000.0);

        value["name"] = traceEvent.name;
-        value["cat"]  = "gpu.angle";
+        value["cat"]  = traceEvent.categoryName;
        value["ph"]   = phaseName.str();
        value["ts"]   = microseconds;
        value["pid"]  = "ANGLE";
-        value["tid"]  = "CPU";
+        value["tid"]  = strcmp(traceEvent.categoryName, "gpu.angle.gpu") == 0 ? "GPU" : "CPU";

        eventsValue.append(value);
    }
--- a/src/tests/perf_tests/ANGLEPerfTest.h
+++ b/src/tests/perf_tests/ANGLEPerfTest.h
@ -44,12 +44,13 @@ struct TraceEvent final
 {
     TraceEvent() {}

-    TraceEvent(char phaseIn, const char *nameIn, double timestampIn)
-        : phase(phaseIn), name(nameIn), timestamp(timestampIn)
+    // |categoryNameIn| and |nameIn| are stored as raw pointers, not copied, so the pointed-to
+    // strings need to outlive the event.
+    TraceEvent(char phaseIn, const char *categoryNameIn, const char *nameIn, double timestampIn)
+        : phase(phaseIn), categoryName(categoryNameIn), name(nameIn), timestamp(timestampIn)
     {
     }

     char phase       = 0;
+    // Category the event was recorded under; nullptr for a default-constructed event.
+    const char *categoryName = nullptr;
     const char *name = nullptr;
     double timestamp = 0;
 };