diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml
index 2f9642d1..6468aa07 100644
--- a/.azure-pipelines/cuda-unit-test.yml
+++ b/.azure-pipelines/cuda-unit-test.yml
@@ -16,13 +16,13 @@ steps:
       python3 -m pip install .[test,torch]
     displayName: Install dependencies
   - script: |
-      make cppbuild
+      SB_MICRO_PATH=$PWD/bin make cppbuild
     displayName: Build benchmarks
   - script: |
       python3 setup.py lint
     displayName: Run code lint
   - script: |
-      python3 setup.py test
+      SB_MICRO_PATH=$PWD/bin python3 setup.py test
     displayName: Run unit tests
     timeoutInMinutes: 10
   - script: |
diff --git a/.clang-format b/.clang-format
index 80d11fef..434d4899 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,3 +2,4 @@
 
 BasedOnStyle: LLVM
 ColumnLimit: 120
+IndentWidth: 4
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_common.cmake b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
new file mode 100644
index 00000000..1e777312
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
@@ -0,0 +1,33 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+if(NOT DEFINED CMAKE_CUDA_STANDARD)
+    set(CMAKE_CUDA_STANDARD 11)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
+
+if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
+    # Reference: https://github.com/NVIDIA/cutlass/blob/0e137486498a52954eff239d874ee27ab23358e7/CMakeLists.txt#L89
+    set(NVCC_ARCHS_SUPPORTED "")
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 7.5)
+      list(APPEND NVCC_ARCHS_SUPPORTED 53)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 8.0)
+      list(APPEND NVCC_ARCHS_SUPPORTED 60 61)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.0)
+      list(APPEND NVCC_ARCHS_SUPPORTED 70)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
+      list(APPEND NVCC_ARCHS_SUPPORTED 72)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.0)
+      list(APPEND NVCC_ARCHS_SUPPORTED 75)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
+      list(APPEND NVCC_ARCHS_SUPPORTED 80)
+    endif()
+    if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.1)
+      list(APPEND NVCC_ARCHS_SUPPORTED 86)
+    endif()
+endif()
diff --git a/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/CMakeLists.txt
new file mode 100644
index 00000000..77641068
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+project(kernel_launch_overhead LANGUAGES CUDA CXX)
+
+include(../cuda_common.cmake)
+
+add_executable(kernel_launch_overhead cuda_kernel_launch.cu)
+set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
+install (TARGETS kernel_launch_overhead RUNTIME DESTINATION .)
diff --git a/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/cuda_kernel_launch.cu b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/cuda_kernel_launch.cu
new file mode 100644
index 00000000..46e4d203
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/cuda_kernel_launch.cu
@@ -0,0 +1,107 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Kernel launch benchmark which will launch one empty kernel and record the cost in event mode and wall mode.
+//   event mode: using cuda event to record the elapsed time of kernel launch on device.
+//   wall mode: using host timer to record the elapsed time kernel launch on both host and device.
+
+#include <algorithm>
+#include <chrono>
+#include <stdio.h>
+#include <string>
+#include <sys/time.h>
+#include <thread>
+
+#include "cuda_runtime.h"
+
+__global__ void EmptyKernel() {}
+
+double test_cuda_kernel_launch_event_time(int num_warmups, int num_steps) {
+    float time = 0.f;
+    double total_time = 0.0;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    for (int i = 0; i < num_warmups; i++) {
+        cudaEventRecord(start, 0);
+        EmptyKernel<<<1, 1>>>();
+        cudaEventRecord(stop, 0);
+        cudaEventSynchronize(stop);
+    }
+
+    for (int i = 0; i < num_steps; i++) {
+        cudaEventRecord(start, 0);
+        EmptyKernel<<<1, 1>>>();
+        cudaEventRecord(stop, 0);
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&time, start, stop);
+        total_time += time;
+    }
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return total_time;
+}
+
+double test_cuda_kernel_launch_wall_time(int num_warmups, int num_steps) {
+    double total_time = 0.0;
+
+    for (int i = 0; i < num_warmups; i++) {
+        EmptyKernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
+
+    struct timeval begin_tv, end_tv;
+    for (int i = 0; i < num_steps; i++) {
+        gettimeofday(&begin_tv, NULL);
+        EmptyKernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+        gettimeofday(&end_tv, NULL);
+        total_time += (((end_tv.tv_sec) * 1000 + (end_tv.tv_usec) / 1000) -
+                       ((begin_tv.tv_sec) * 1000 + (begin_tv.tv_usec) / 1000));
+    }
+
+    return total_time;
+}
+
+char *getCmdOption(char **begin, char **end, const std::string &option) {
+    char **itr = std::find(begin, end, option);
+    if (itr != end && ++itr != end) {
+        return *itr;
+    }
+    return 0;
+}
+
+int main(int argc, char *argv[]) {
+    int num_warmups = 100;
+    int num_steps = 2000000;
+    int interval = 2000;
+
+    if (char *value = getCmdOption(argv, argv + argc, "-w")) {
+        num_warmups = std::stoi(value);
+    }
+
+    if (char *value = getCmdOption(argv, argv + argc, "-n")) {
+        num_steps = std::stoi(value);
+    }
+
+    if (char *value = getCmdOption(argv, argv + argc, "-i")) {
+        interval = std::stoi(value);
+    }
+
+    // Test the kernel launch event time.
+    double event_total_time = test_cuda_kernel_launch_event_time(num_warmups, num_steps);
+    printf("Kernel launch overhead - event time: %3.5f ms \n", event_total_time / num_steps);
+
+    // Sleep for interval milliseconds and run the next test.
+    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
+
+    // Test the kernel launch wall time.
+    double wall_total_time = test_cuda_kernel_launch_wall_time(num_warmups, num_steps);
+    printf("Kernel launch overhead - wall time: %3.5f ms \n", wall_total_time / num_steps);
+
+    return 0;
+}