From 4e9e01cb3c008335a2471c27dbdf7dd5d12e4224 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Sun, 19 Dec 2021 20:54:29 -0800 Subject: [PATCH] Fix SDL warnings in CPU EP (#9975) --- cmake/CMakeLists.txt | 9 +- cmake/EnableVisualStudioCodeAnalysis.props | 12 + cmake/Sdl.ruleset | 268 ++++++++++++++++++ cmake/onnxruntime_common.cmake | 5 - cmake/onnxruntime_graph.cmake | 5 +- cmake/onnxruntime_mlas.cmake | 3 + cmake/onnxruntime_providers.cmake | 4 + cmake/onnxruntime_unittests.cmake | 29 +- include/onnxruntime/core/common/common.h | 8 +- .../onnxruntime/core/common/gsl_suppress.h | 15 + include/onnxruntime/core/common/status.h | 36 +-- .../core/framework/kernel_def_builder.h | 6 +- .../core/framework/kernel_registry.h | 2 +- .../onnxruntime/core/framework/op_kernel.h | 18 +- .../core/framework/op_kernel_info.h | 4 - include/onnxruntime/core/graph/basic_types.h | 5 +- include/onnxruntime/core/graph/graph.h | 30 +- include/onnxruntime/core/graph/node_arg.h | 4 +- .../platform/EigenNonBlockingThreadPool.h | 2 +- .../onnxruntime/core/session/environment.h | 12 +- .../core/session/onnxruntime_c_api.h | 1 + .../core/session/onnxruntime_cxx_api.h | 8 +- onnxruntime/contrib_ops/cpu/activations.h | 2 +- .../cpu/attnlstm/bahdanau_attention.cc | 3 +- .../cpu/attnlstm/deep_cpu_attn_lstm.cc | 1 + .../cpu/attnlstm/uni_dir_attn_lstm.cc | 1 + .../contrib_ops/cpu/bert/attention_cpu_base.h | 1 + .../contrib_ops/cpu/crop_and_resize.cc | 1 + onnxruntime/contrib_ops/cpu/nchwc_ops.cc | 6 +- .../cpu/quantization/qlinear_pool.cc | 12 +- onnxruntime/contrib_ops/cpu/tokenizer.cc | 4 +- .../cpu/transformers/beam_search.cc | 8 +- .../cpu/transformers/beam_search_scorer.cc | 7 +- .../cpu/transformers/gpt_subgraph.cc | 5 +- .../cpu/transformers/logits_processor.cc | 5 +- .../contrib_ops/cpu/transformers/sequences.cc | 5 +- onnxruntime/core/common/threadpool.cc | 19 +- onnxruntime/core/framework/allocator.cc | 12 +- onnxruntime/core/framework/bfc_arena.cc | 13 +- onnxruntime/core/framework/bfc_arena.h | 6 +- onnxruntime/core/framework/callback.cc | 4 +- onnxruntime/core/framework/data_types.cc | 5 +- onnxruntime/core/framework/error_code.cc | 5 +- onnxruntime/core/framework/func_kernel.h | 32 ++- .../core/framework/fuse_nodes_funcs.cc | 4 +- onnxruntime/core/framework/fuse_nodes_funcs.h | 8 +- .../core/framework/graph_partitioner.cc | 22 +- onnxruntime/core/framework/kernel_registry.cc | 6 +- .../core/framework/kernel_registry_manager.cc | 13 +- .../core/framework/kernel_registry_manager.h | 8 +- .../framework/onnxruntime_map_type_info.cc | 4 +- .../onnxruntime_sequence_type_info.cc | 18 +- .../core/framework/onnxruntime_typeinfo.cc | 4 +- .../core/framework/onnxruntime_typeinfo.h | 1 + onnxruntime/core/framework/op_kernel_info.cc | 7 +- onnxruntime/core/framework/run_options.cc | 4 +- onnxruntime/core/framework/session_options.h | 1 + onnxruntime/core/framework/session_state.cc | 6 +- onnxruntime/core/framework/session_state.h | 45 ++- onnxruntime/core/framework/sparse_tensor.cc | 4 +- .../core/framework/tensor_type_and_shape.cc | 12 +- .../core/framework/tensorprotoutils.cc | 12 +- onnxruntime/core/framework/utils.cc | 6 +- onnxruntime/core/graph/graph.cc | 31 +- onnxruntime/core/graph/model.cc | 6 +- onnxruntime/core/graph/model.h | 2 +- onnxruntime/core/mlas/lib/convolve.cpp | 1 + onnxruntime/core/mlas/lib/dgemm.cpp | 1 + .../core/mlas/lib/qgemm_kernel_default.cpp | 4 +- onnxruntime/core/mlas/lib/sgemm.cpp | 1 + .../optimizer/optimizer_execution_frame.cc | 5 +- .../qdq_selector_action_transformer.cc | 24 +- 
onnxruntime/core/platform/windows/env.cc | 71 ++++- .../core/platform/windows/stacktrace.cc | 2 +- onnxruntime/core/providers/common.h | 6 +- .../providers/cpu/activation/activations.h | 7 +- .../core/providers/cpu/controlflow/loop.cc | 30 +- .../core/providers/cpu/controlflow/scan_8.cc | 6 +- .../core/providers/cpu/controlflow/scan_9.cc | 6 +- .../providers/cpu/controlflow/scan_utils.cc | 4 +- .../providers/cpu/controlflow/scan_utils.h | 12 +- .../providers/cpu/cpu_provider_factory.cc | 4 +- .../cpu/element_wise_ranged_transform.h | 2 + .../core/providers/cpu/generator/range.cc | 5 +- onnxruntime/core/providers/cpu/math/clip.h | 4 +- onnxruntime/core/providers/cpu/math/det.cc | 5 + .../einsum_compute_preprocessor.cc | 2 +- .../providers/cpu/math/element_wise_ops.cc | 4 +- .../providers/cpu/math/element_wise_ops.h | 10 +- onnxruntime/core/providers/cpu/ml/ml_common.h | 2 +- .../core/providers/cpu/ml/svmclassifier.cc | 5 + .../core/providers/cpu/nn/batch_norm.h | 8 +- .../core/providers/cpu/nn/batch_norm_helper.h | 10 +- .../core/providers/cpu/nn/conv_attributes.h | 4 +- .../core/providers/cpu/nn/conv_transpose.cc | 2 +- .../cpu/nn/conv_transpose_attributes.h | 2 +- onnxruntime/core/providers/cpu/nn/lrn.cc | 8 +- .../core/providers/cpu/nn/pool_attributes.h | 9 +- onnxruntime/core/providers/cpu/nn/pool_base.h | 6 +- onnxruntime/core/providers/cpu/nn/shrink.cc | 5 +- .../core/providers/cpu/nn/tfidfvectorizer.cc | 6 +- .../object_detection/non_max_suppression.cc | 2 +- .../cpu/object_detection/roialign.cc | 10 +- .../providers/cpu/quantization/qlinearconv.cc | 2 +- .../providers/cpu/reduction/reduction_ops.cc | 15 +- .../providers/cpu/reduction/reduction_ops.h | 11 +- .../core/providers/cpu/rnn/deep_cpu_gru.cc | 12 +- .../core/providers/cpu/rnn/deep_cpu_lstm.cc | 2 +- .../core/providers/cpu/rnn/lstm_base.cc | 5 +- onnxruntime/core/providers/cpu/rnn/rnn.cc | 6 +- .../core/providers/cpu/rnn/rnn_helpers.cc | 32 ++- .../core/providers/cpu/rnn/rnn_helpers.h | 2 +- .../providers/cpu/rnn/uni_directional_lstm.cc | 8 +- .../providers/cpu/sequence/sequence_ops.cc | 4 +- .../core/providers/cpu/tensor/expand.cc | 6 +- .../core/providers/cpu/tensor/gather_nd.cc | 2 +- .../core/providers/cpu/tensor/nonzero_op.cc | 2 +- .../core/providers/cpu/tensor/onehot.cc | 2 +- .../providers/cpu/tensor/reverse_sequence.cc | 8 +- .../core/providers/cpu/tensor/scatter_nd.cc | 2 +- .../providers/cpu/tensor/space_depth_ops.cc | 2 +- .../core/providers/cpu/tensor/split.cc | 2 +- .../core/providers/cpu/tensor/transpose.cc | 2 +- .../core/providers/cpu/tensor/upsample.h | 13 +- onnxruntime/core/providers/cuda/cuda_kernel.h | 2 +- .../src/AbiCustomRegistry.cpp | 5 +- .../src/GraphPartitioner.cpp | 5 +- onnxruntime/core/session/IOBinding.h | 42 +-- .../core/session/abi_session_options.cc | 1 + .../core/session/allocator_adapters.cc | 4 +- onnxruntime/core/session/custom_ops.cc | 5 +- onnxruntime/core/session/environment.cc | 2 +- onnxruntime/core/session/inference_session.cc | 3 +- onnxruntime/core/session/onnxruntime_c_api.cc | 2 + onnxruntime/core/session/ort_env.cc | 31 +- onnxruntime/core/session/ort_env.h | 13 +- .../core/session/provider_bridge_ort.cc | 4 +- onnxruntime/core/util/qmath.h | 8 +- onnxruntime/core/util/thread_utils.cc | 11 +- onnxruntime/python/numpy_helper.h | 12 + .../python/onnxruntime_pybind_mlvalue.cc | 8 +- .../python/onnxruntime_pybind_mlvalue.h | 2 - .../python/onnxruntime_pybind_ortvalue.cc | 1 + .../onnxruntime_pybind_sparse_tensor.cc | 1 + .../python/onnxruntime_pybind_state.cc | 10 +- 
.../python/onnxruntime_pybind_state_common.h | 15 +- .../test/contrib_ops/tensor_op_test.cc | 18 +- onnxruntime/test/eager/ort_invoker_test.cc | 5 +- .../test/framework/allocation_planner_test.cc | 6 +- onnxruntime/test/framework/float_16_test.cc | 2 +- .../test/framework/kernel_registry_test.cc | 5 +- .../framework/local_kernel_registry_test.cc | 10 +- .../test/framework/opaque_kernels_test.cc | 4 +- .../test/framework/parallel_executor_test.cc | 6 +- .../test/framework/session_state_test.cc | 8 +- .../test/framework/sparse_kernels_test.cc | 30 +- onnxruntime/test/mlas/bench/bench_qgemm.cpp | 6 +- .../test/mlas/unittest/test_conv2d_nchwc.h | 8 +- onnxruntime/test/onnx/TestCase.cc | 24 +- onnxruntime/test/onnx/dataitem_request.cc | 2 +- onnxruntime/test/onnx/dataitem_request.h | 5 +- .../test/onnx/microbenchmark/activation.cc | 2 +- onnxruntime/test/onnx/tensorprotoutils.cc | 6 +- onnxruntime/test/onnx/testcase_request.cc | 2 +- onnxruntime/test/onnx/testcase_request.h | 6 +- .../test/optimizer/graph_transform_test.cc | 16 +- onnxruntime/test/perftest/TFModelInfo.cc | 5 +- onnxruntime/test/perftest/TFModelInfo.h | 2 +- .../test/perftest/performance_runner.cc | 5 +- .../cpu/activation/activation_op_test.cc | 30 +- .../providers/cpu/generator/random_test.cc | 64 ++--- .../cpu/math/element_wise_ops_test.cc | 3 +- .../cpu/math/quantize_linear_matmul_test.cc | 6 +- .../cpu/reduction/reduction_ops_test.cc | 18 +- .../test/providers/cpu/tensor/copy_test.cc | 20 +- .../providers/cpu/tensor/split_op_test.cc | 46 +-- .../providers/cpu/tensor/tensor_op_test.cc | 4 +- .../providers/cpu/tensor/unique_op_test.cc | 16 +- .../internal_testing_partitioning_tests.cc | 12 +- .../internal_testing_tests.cc | 14 +- .../my_allocator.cc | 1 + onnxruntime/test/util/test_allocator.cc | 1 + tools/ci_build/build.py | 3 +- .../azure-pipelines/nuget/templates/gpu.yml | 8 +- .../{ => nuget}/templates/win-ci-2019.yml | 10 +- .../azure-pipelines/templates/win-cpu-ci.yml | 41 +-- .../azure-pipelines/templates/win-gpu-ci.yml | 57 ++-- .../azure-pipelines/win-gpu-ci-pipeline.yml | 255 +++-------------- 188 files changed, 1328 insertions(+), 953 deletions(-) create mode 100644 cmake/EnableVisualStudioCodeAnalysis.props create mode 100644 cmake/Sdl.ruleset create mode 100644 include/onnxruntime/core/common/gsl_suppress.h create mode 100644 onnxruntime/python/numpy_helper.h rename tools/ci_build/github/azure-pipelines/{ => nuget}/templates/win-ci-2019.yml (97%) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 59a9bb9688..adfaa4c11f 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1114,7 +1114,6 @@ if (onnxruntime_USE_MIGRAPHX) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_MIGRAPHX=1) list(APPEND ONNXRUNTIME_PROVIDER_NAMES migraphx) endif() - if (onnxruntime_USE_ARMNN) list(APPEND ORT_PROVIDER_FLAGS -DUSE_ARMNN=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_ARMNN=1) @@ -1142,8 +1141,8 @@ function(onnxruntime_set_compile_flags target_name) set_target_properties(${target_name} PROPERTIES VS_GLOBAL_CAExcludePath "${ORT_BINARY_DIR};${ORT_SOURCE_DIR}") if (onnxruntime_ENABLE_STATIC_ANALYSIS) - target_compile_options(${target_name} PRIVATE "$<$>:/analyze:stacksize 131072>") - target_compile_options(${target_name} PRIVATE "$<$>:/analyze:external->") + target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /analyze>" "$<$>:/analyze>") + target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options /analyze:external->" "$<$>:/analyze:external->") endif() 
else() # Enable warning @@ -1190,6 +1189,10 @@ function(onnxruntime_configure_target target_name) target_link_directories(${target_name} PRIVATE ${onnxruntime_LINK_DIRS}) onnxruntime_set_compile_flags(${target_name}) onnxruntime_set_source_file_properties(${target_name}) + #Uncomment the following three lines to reproduce static analysis errors locally + #if(WIN32 AND onnxruntime_ENABLE_STATIC_ANALYSIS) + # set_target_properties(${target_name} PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props) + #endif() target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) if (onnxruntime_ENABLE_LTO) set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) diff --git a/cmake/EnableVisualStudioCodeAnalysis.props b/cmake/EnableVisualStudioCodeAnalysis.props new file mode 100644 index 0000000000..44f93c37bf --- /dev/null +++ b/cmake/EnableVisualStudioCodeAnalysis.props @@ -0,0 +1,12 @@ + + + + $(MSBuildThisFileDirectory)Sdl.ruleset + + $(SolutionDir);$(MSBuildThisFileDirectory) + true + + diff --git a/cmake/Sdl.ruleset b/cmake/Sdl.ruleset new file mode 100644 index 0000000000..2909df90a1 --- /dev/null +++ b/cmake/Sdl.ruleset @@ -0,0 +1,268 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 3ee4a14e4c..9b8b5bf818 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -127,11 +127,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/common DEST set_target_properties(onnxruntime_common PROPERTIES LINKER_LANGUAGE CXX) set_target_properties(onnxruntime_common PROPERTIES FOLDER "ONNXRuntime") -if(WIN32) - # Add Code Analysis properties to enable C++ Core checks. Have to do it via a props file include. - set_target_properties(onnxruntime_common PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props) -endif() - # check if we need to link against librt on Linux include(CheckLibraryExists) include(CheckFunctionExists) diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake index dde1fbd138..db09ae7a36 100644 --- a/cmake/onnxruntime_graph.cmake +++ b/cmake/onnxruntime_graph.cmake @@ -120,8 +120,5 @@ if (WIN32) target_compile_options(onnxruntime_graph PRIVATE /EHsc # exception handling - C++ may throw, extern "C" will not ) - endif() - - # Add Code Analysis properties to enable C++ Core checks. Have to do it via a props file include. 
-  set_target_properties(onnxruntime_graph PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props)
+  endif()
 endif()
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 263f580977..d1ddc06301 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -475,4 +475,7 @@ endforeach()
 set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime")
 if (WIN32)
   target_compile_options(onnxruntime_mlas PRIVATE "/wd6385" "/wd4127")
+  if (onnxruntime_ENABLE_STATIC_ANALYSIS)
+    target_compile_options(onnxruntime_mlas PRIVATE "/analyze:stacksize 131072")
+  endif()
 endif()
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index a2f6d6bc22..90240c5f68 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -454,6 +454,10 @@ if (onnxruntime_USE_DNNL)
   source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_dnnl_cc_srcs})
   onnxruntime_add_shared_library_module(onnxruntime_providers_dnnl ${onnxruntime_providers_dnnl_cc_srcs})
   target_link_directories(onnxruntime_providers_dnnl PRIVATE ${DNNL_LIB_DIR})
+  if (MSVC AND onnxruntime_ENABLE_STATIC_ANALYSIS)
+    # dnnl_convgrad.cc(47,0): Warning C6262: Function uses '38816' bytes of stack: exceeds /analyze:stacksize '16384'. Consider moving some data to heap.
+    target_compile_options(onnxruntime_providers_dnnl PRIVATE "/analyze:stacksize 131072")
+  endif()
   add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
   target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR} ${DNNL_OCL_INCLUDE_DIR})
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 2a9eeca37e..ee3c33dea2 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -81,6 +81,9 @@ function(AddTest)
       # Lot of such things came from gtest
       target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd6326>"
                    "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd6326>")
+      # Raw new and delete. A lot of such things came from googletest.
+      target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26409>"
+                   "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26409>")
     endif()
     target_compile_options(${_UT_TARGET} PRIVATE ${disabled_warnings})
   else()
@@ -681,7 +684,12 @@ AddTest(
     onnx_test_data_proto nlohmann_json::nlohmann_json
   DEPENDS ${all_dependencies}
 )
-if(NOT MSVC)
+if (MSVC)
+  # C26451 means the two integral operands of a binary operator are narrower than the result type they feed.
+  # Widening the two operands first would make the expression more tolerant of integer overflow.
+  # However, this is test code, so we are less concerned.
+  target_compile_options(onnxruntime_test_all PRIVATE "/wd26451")
+else()
   target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")
 endif()
 # the default logger tests conflict with the need to have an overall default logger
@@ -846,6 +854,15 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
     if(WIN32)
       target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd4141>"
                    "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4141>")
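For context on the /analyze:stacksize bumps above: warning C6262 fires when a function's locals exceed the analyzer's stack budget (16 KB by default), and moving the data to the heap is the usual fix. A minimal illustration of both sides, not taken from the ORT sources:

#include <memory>

void StackHeavy() {
  unsigned char buffer[38816] = {};  // ~38 KB of locals: draws C6262 under /analyze
  buffer[0] = 1;
}

void HeapBacked() {
  auto buffer = std::make_unique<unsigned char[]>(38816);  // heap-backed: no C6262
  buffer[0] = 1;
}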
+      # Avoid using new and delete. This is a benchmark program; it is acceptable if it leaks.
+      target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26409>"
+                   "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26409>")
+      target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26400>"
+                   "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26400>")
+      target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26814>"
+                   "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26814>")
+      target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26497>"
+                   "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26497>")
       target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>"
                    "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>")
     endif()
@@ -858,7 +875,11 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
     onnxruntime_add_executable(onnxruntime_mlas_benchmark ${MLAS_BENCH_SOURCE_FILES})
     target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc)
     target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util onnxruntime_framework ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
-    if(NOT WIN32)
+    if(WIN32)
+      target_link_libraries(onnxruntime_mlas_benchmark PRIVATE debug Dbghelp)
+      # Avoid using new and delete. This is a benchmark program; it is acceptable if it leaks.
+      target_compile_options(onnxruntime_mlas_benchmark PRIVATE /wd26409)
+    else()
       target_link_libraries(onnxruntime_mlas_benchmark PRIVATE nsync_cpp ${CMAKE_DL_LIBS})
     endif()
     set_target_properties(onnxruntime_mlas_benchmark PROPERTIES FOLDER "ONNXRuntimeTest")
@@ -1100,6 +1121,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
   )
   onnxruntime_add_executable(onnxruntime_mlas_test ${onnxruntime_mlas_test_src})
   if(MSVC)
+    target_compile_options(onnxruntime_mlas_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26409>"
+                 "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26409>")
     target_compile_options(onnxruntime_mlas_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>"
                  "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>")
     target_compile_options(onnxruntime_mlas_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd6326>"
@@ -1147,6 +1170,8 @@ if(UNIX)
   endif()
 else()
   set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-DEF:${TEST_SRC_DIR}/testdata/custom_op_library/custom_op_library.def")
+  target_compile_options(custom_op_library PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26409>"
+               "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26409>")
 endif()
 set_property(TARGET custom_op_library APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG})
diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h
index 233311596d..6860bd169c 100644
--- a/include/onnxruntime/core/common/common.h
+++ b/include/onnxruntime/core/common/common.h
@@ -35,6 +35,7 @@
 #include "core/common/exceptions.h"
 #include "core/common/make_string.h"
 #include "core/common/status.h"
+#include "core/common/gsl_suppress.h"
 
 namespace onnxruntime {
 
@@ -253,13 +254,6 @@ void LogRuntimeError(uint32_t session_id, const common::Status& status, const ch
   }                                                            \
   } while (0)
 
-// C++ Core Guideline check suppression.
-#if defined(_MSC_VER) && !defined(__NVCC__) && !defined(__clang__)
-#define GSL_SUPPRESS(tag) [[gsl::suppress(tag)]]
-#else
-#define GSL_SUPPRESS(tag)
-#endif
-
 inline long long TimeDiffMicroSeconds(TimePoint start_time) {
   auto end_time = std::chrono::high_resolution_clock::now();
   return std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
diff --git a/include/onnxruntime/core/common/gsl_suppress.h b/include/onnxruntime/core/common/gsl_suppress.h
new file mode 100644
index 0000000000..66702d0424
--- /dev/null
+++ b/include/onnxruntime/core/common/gsl_suppress.h
@@ -0,0 +1,15 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once + +#ifndef GSL_SUPPRESS +#if defined(__clang__) && !defined(__NVCC__) +#define GSL_SUPPRESS(x) [[gsl::suppress("x")]] +#else +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__NVCC__) +#define GSL_SUPPRESS(x) [[gsl::suppress(x)]] +#else +#define GSL_SUPPRESS(x) +#endif // _MSC_VER +#endif // __clang__ +#endif \ No newline at end of file diff --git a/include/onnxruntime/core/common/status.h b/include/onnxruntime/core/common/status.h index f348e7e653..b7c36df2b2 100644 --- a/include/onnxruntime/core/common/status.h +++ b/include/onnxruntime/core/common/status.h @@ -19,7 +19,7 @@ limitations under the License. #ifdef _WIN32 #include #endif - +#include "core/common/gsl_suppress.h" namespace onnxruntime { namespace common { @@ -80,35 +80,34 @@ constexpr const char* StatusCodeToString(StatusCode status) noexcept { #ifdef _WIN32 constexpr HRESULT StatusCodeToHRESULT(StatusCode status) noexcept { - switch (status) - { + switch (status) { case StatusCode::OK: - return S_OK; + return S_OK; case StatusCode::FAIL: - return E_FAIL; + return E_FAIL; case StatusCode::INVALID_ARGUMENT: - return E_INVALIDARG; + return E_INVALIDARG; case StatusCode::NO_SUCHFILE: - return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); + return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); case StatusCode::NO_MODEL: - return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); + return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); case StatusCode::ENGINE_ERROR: - return E_FAIL; + return E_FAIL; case StatusCode::RUNTIME_EXCEPTION: - return E_FAIL; + return E_FAIL; case StatusCode::INVALID_PROTOBUF: - return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT); + return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT); case StatusCode::MODEL_LOADED: - return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR); + return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR); case StatusCode::NOT_IMPLEMENTED: - return E_NOTIMPL; + return E_NOTIMPL; case StatusCode::INVALID_GRAPH: - return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT); + return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT); case StatusCode::EP_FAIL: - return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR); + return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR); default: - return E_FAIL; - } + return E_FAIL; + } } #endif @@ -122,9 +121,10 @@ class [[nodiscard]] Status { Status(StatusCategory category, int code); + GSL_SUPPRESS(r.11) Status(const Status& other) : state_((other.state_ == nullptr) ? nullptr : new State(*other.state_)) {} - + GSL_SUPPRESS(r.11) Status& operator=(const Status& other) { if (state_ != other.state_) { if (other.state_ == nullptr) { diff --git a/include/onnxruntime/core/framework/kernel_def_builder.h b/include/onnxruntime/core/framework/kernel_def_builder.h index a25599be6f..2d35c49ee9 100644 --- a/include/onnxruntime/core/framework/kernel_def_builder.h +++ b/include/onnxruntime/core/framework/kernel_def_builder.h @@ -183,7 +183,7 @@ class KernelDefBuilder { static std::unique_ptr Create() { return std::make_unique(); } explicit KernelDefBuilder() - : kernel_def_(new KernelDef()) {} + : kernel_def_(std::make_unique()) {} KernelDefBuilder& SetName(const std::string& op_name); KernelDefBuilder& SetName(const char* op_name); @@ -274,7 +274,7 @@ class KernelDefBuilder { KernelDefBuilder& Alias(int input_index, int output_index); /** - Apply variadic number of alias mapping from inputs to outputs. + Apply variadic number of alias mapping from inputs to outputs. 
This is effectively applying Alias(i + input_offset, i + output_offset) for i >= 0 */ KernelDefBuilder& VariadicAlias(int input_offset, int output_offset); @@ -290,7 +290,7 @@ class KernelDefBuilder { } /** - Specify that this kernel's output buffers are passed from external, + Specify that this kernel's output buffers are passed from external, i.e. not created or managed by ORT's memory allocator. */ KernelDefBuilder& ExternalOutputs() { diff --git a/include/onnxruntime/core/framework/kernel_registry.h b/include/onnxruntime/core/framework/kernel_registry.h index a9875df58d..1d068e7625 100644 --- a/include/onnxruntime/core/framework/kernel_registry.h +++ b/include/onnxruntime/core/framework/kernel_registry.h @@ -37,7 +37,7 @@ class KernelRegistry { // TODO(Task:132) Make usage of unique_ptr/shared_ptr as out param consistent Status TryCreateKernel(const Node& node, const IExecutionProvider& execution_provider, const std::unordered_map& constant_initialized_tensors, - const OrtValueNameIdxMap& mlvalue_name_idx_map, const FuncManager& funcs_mgr, + const OrtValueNameIdxMap& mlvalue_name_idx_map, FuncManager& funcs_mgr, const DataTransferManager& data_transfer_mgr, std::unique_ptr& op_kernel) const; diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index 434119986e..59cbef7c66 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -125,9 +125,9 @@ class OpKernel { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OpKernel); std::unique_ptr op_kernel_info_; }; - -using KernelCreateFn = std::function; -using KernelCreatePtrFn = std::add_pointer::type; +class FuncManager; +using KernelCreateFn = std::function& out)>; +using KernelCreatePtrFn = std::add_pointer& out)>::type; struct KernelCreateInfo { std::unique_ptr kernel_def; // Owned and stored in the global kernel registry. 
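The kernel-registration macros that follow all track the KernelCreateFn change above: kernel factories now report failure through a Status and hand ownership back through a std::unique_ptr out-parameter instead of returning a raw OpKernel*. A reduced sketch of the before/after shapes, using simplified stand-in types rather than the real ORT declarations:

#include <functional>
#include <memory>

struct Status { static Status OK() { return {}; } };
struct FuncManager {};
struct OpKernelInfo {};
struct OpKernel {};

// Old shape: failure could only surface as an exception or a null pointer.
using OldKernelCreateFn = std::function<OpKernel*(const OpKernelInfo&)>;

// New shape: failure propagates as a Status, ownership is explicit, and the
// FuncManager supplies what OpKernelInfo used to carry for fused kernels.
using NewKernelCreateFn =
    std::function<Status(FuncManager&, const OpKernelInfo&, std::unique_ptr<OpKernel>&)>;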
@@ -197,7 +197,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)(); .SinceVersion(ver) \ .Provider(provider) \ .Build(), \ - static_cast([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \ + static_cast([](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \ } #define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(provider, domain, startver, endver, name) \ @@ -220,7 +220,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)(); .SinceVersion(startver, endver) \ .Provider(provider) \ .Build(), \ - static_cast([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \ + static_cast([](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \ } #define ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(provider, domain, ver, type, name) \ @@ -246,7 +246,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)(); .SinceVersion(ver) \ .Provider(provider) \ .Build(), \ - static_cast([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \ + static_cast([](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \ } #define ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(provider, domain, ver, type1, type2, name) \ @@ -263,7 +263,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)(); .SinceVersion(ver) \ .Provider(provider) \ .Build(), \ - static_cast([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \ + static_cast([](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \ } #define ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(provider, domain, startver, endver, type, name) \ @@ -293,7 +293,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)(); .SinceVersion(startver, endver) \ .Provider(provider) \ .Build(), \ - static_cast([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \ + static_cast([](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \ } #define ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(provider, domain, startver, endver, type1, type2, name) \ @@ -312,7 +312,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)(); .SinceVersion(startver, endver) \ .Provider(provider) \ .Build(), \ - static_cast([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \ + static_cast([](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \ } template diff --git a/include/onnxruntime/core/framework/op_kernel_info.h b/include/onnxruntime/core/framework/op_kernel_info.h index 0259e7e68c..dca4df8192 100644 --- a/include/onnxruntime/core/framework/op_kernel_info.h +++ b/include/onnxruntime/core/framework/op_kernel_info.h @@ -26,7 +26,6 @@ class OpKernelInfo : public OpNodeProtoHelper { const IExecutionProvider& execution_provider, const std::unordered_map& constant_initialized_tensors, const OrtValueNameIdxMap& mlvalue_name_idx_map, - const FuncManager& funcs_mgr, const DataTransferManager& data_transfer_mgr); OpKernelInfo(const OpKernelInfo& other); @@ -45,8 +44,6 @@ class OpKernelInfo : public OpNodeProtoHelper 
{ bool TryGetConstantInput(int input_index, const Tensor** constant_input_value) const; - common::Status GetFusedFuncs(NodeComputeInfo*& compute_info) const; - private: ORT_DISALLOW_MOVE(OpKernelInfo); ORT_DISALLOW_ASSIGNMENT(OpKernelInfo); @@ -58,7 +55,6 @@ class OpKernelInfo : public OpNodeProtoHelper { gsl::not_null execution_provider_; const std::unordered_map& constant_initialized_tensors_; const OrtValueNameIdxMap& ort_value_name_idx_map_; - const FuncManager& funcs_mgr_; const DataTransferManager& data_transfer_mgr_; ProtoHelperNodeContext proto_helper_context_; }; diff --git a/include/onnxruntime/core/graph/basic_types.h b/include/onnxruntime/core/graph/basic_types.h index a79479b488..36984d0405 100644 --- a/include/onnxruntime/core/graph/basic_types.h +++ b/include/onnxruntime/core/graph/basic_types.h @@ -11,6 +11,7 @@ #include #include "core/common/basic_types.h" +#include "core/common/status.h" namespace ONNX_NAMESPACE { class ValueInfoProto; @@ -44,6 +45,6 @@ using IOnnxRuntimeOpSchemaCollectionPtr = std::shared_ptr; +class FuncManager; +using KernelCreateFn = std::function& out)>; } // namespace onnxruntime diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index d54f145d40..3fe38fa7ef 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -401,8 +401,8 @@ class Node { } /** Sets initialized function body for node. This is called right after function body initialization for a node. - * or during function inlining when a nested function is encountered. - */ + * or during function inlining when a nested function is encountered. + */ void SetFunctionBody(Function& func); /** Call the provided function for all explicit inputs, implicit inputs, and outputs of this Node. @@ -500,15 +500,14 @@ class Node { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Relationships); }; - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Node); - // NOTE: This friendship relationship should ONLY be used for calling methods of the Node class and not accessing // the data members directly, so that the Node can maintain its internal invariants. friend class Graph; - Node(NodeIndex index, Graph& graph) : index_(index), graph_(&graph) {} + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Node); + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) void Init(const std::string& name, const std::string& op_type, @@ -648,7 +647,7 @@ class Graph { /** Check if a given name is a sparse initializer's name in the model * we currently convert sparse_initializer field in the model into dense Tensor instances. * However, we sometimes want to check if this initializer was stored as sparse in the model. - */ + */ bool IsSparseInitializer(const std::string& name) const; #endif @@ -978,7 +977,7 @@ class Graph { @remarks As a new Graph instance for the fused nodes is not created, a GraphViewer can be constructed with the IndexedSubGraph information to provide a view of the subgraph. The original nodes are left in place while this is in use. - Call FinalizeFuseSubGraph to remove them once the fused replacement node is fully created. + Call FinalizeFuseSubGraph to remove them once the fused replacement node is fully created. 
*/ Node& BeginFuseSubGraph(const IndexedSubGraph& sub_graph, const std::string& fused_node_name); @@ -1200,9 +1199,6 @@ class Graph { } #endif - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Graph); - // This friendship relationship should only be used to call Graph::Graph and // Graph::LoadGraph All other access should be via the public API. friend class Model; @@ -1243,6 +1239,8 @@ class Graph { const std::vector& model_functions, const logging::Logger& logger); + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Graph); void InitializeStateFromModelFileGraphProto(); // Add node with specified . @@ -1530,16 +1528,16 @@ std::ostream& operator<<(std::ostream& out, const NodeArg& node_arg); // Print Node as, // (operator's name, operator's type, domain, version) : (input0, input1, ...) -> (output0, output1, ...) // For example, -// ("Add_14", Add, "", 7) : ("110": tensor(float),"109": tensor(float),) -> ("111": tensor(float),) +// ("Add_14", Add, "", 7) : ("110": tensor(float),"109": tensor(float),) -> ("111": tensor(float),) std::ostream& operator<<(std::ostream& out, const Node& node); // Print Graph as, for example, // Inputs: // "Input": tensor(float) // Nodes: -// ("add0", Add, "", 7) : ("Input": tensor(float),"Bias": tensor(float),) -> ("add0_out": tensor(float),) -// ("matmul", MatMul, "", 9) : ("add0_out": tensor(float),"matmul_weight": tensor(float),) -> ("matmul_out": tensor(float),) -// ("add1", Add, "", 7) : ("matmul_out": tensor(float),"add_weight": tensor(float),) -> ("add1_out": tensor(float),) -// ("reshape", Reshape, "", 5) : ("add1_out": tensor(float),"concat_out": tensor(int64),) -> ("Result": tensor(float),) +// ("add0", Add, "", 7) : ("Input": tensor(float),"Bias": tensor(float),) -> ("add0_out": tensor(float),) +// ("matmul", MatMul, "", 9) : ("add0_out": tensor(float),"matmul_weight": tensor(float),) -> ("matmul_out": tensor(float),) +// ("add1", Add, "", 7) : ("matmul_out": tensor(float),"add_weight": tensor(float),) -> ("add1_out": tensor(float),) +// ("reshape", Reshape, "", 5) : ("add1_out": tensor(float),"concat_out": tensor(int64),) -> ("Result": tensor(float),) // Outputs: // "Result": tensor(float) // Inputs' and outputs' format is described in document of NodeArg's operator<< above. diff --git a/include/onnxruntime/core/graph/node_arg.h b/include/onnxruntime/core/graph/node_arg.h index ecb5bdc131..80f5dda2dc 100644 --- a/include/onnxruntime/core/graph/node_arg.h +++ b/include/onnxruntime/core/graph/node_arg.h @@ -106,12 +106,12 @@ class NodeArg { Optional inputs are allowed in ONNX and an empty #Name represents a non-existent input argument. */ bool Exists() const noexcept; - private: - ORT_DISALLOW_COPY_AND_ASSIGNMENT(NodeArg); friend class Graph; NodeArg(NodeArgInfo&& node_arg_info); + private: + ORT_DISALLOW_COPY_AND_ASSIGNMENT(NodeArg); #if !defined(ORT_MINIMAL_BUILD) void SetType(const std::string* p_type); void SetType(const ONNX_NAMESPACE::TypeProto& type_proto); diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index 0842400c2c..95bab63ba0 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -786,7 +786,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter // unique_ptr. 
The explicit deleter avoids the Eigen-specific
 // definition of ThreadPoolParallelSection needing to be available in
 // threadpool.h where the user-facing parallel section API is defined.
-
+GSL_SUPPRESS(r.11)
 std::unique_ptr<ThreadPoolParallelSection, void (*)(ThreadPoolParallelSection*)>
 AllocateParallelSection() override {
   return std::unique_ptr<ThreadPoolParallelSection, void (*)(ThreadPoolParallelSection*)>
     (new ThreadPoolParallelSection,
diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h
index f7a4638371..cc511ccad5 100644
--- a/include/onnxruntime/core/session/environment.h
+++ b/include/onnxruntime/core/session/environment.h
@@ -58,31 +58,31 @@ class Environment {
   /**
    * Registers an allocator for sharing between multiple sessions.
    * Return an error if an allocator with the same OrtMemoryInfo is already registered.
-   */
+   */
   Status RegisterAllocator(AllocatorPtr allocator);
 
   /**
    * Creates and registers an allocator for sharing between multiple sessions.
    * Return an error if an allocator with the same OrtMemoryInfo is already registered.
-   */
+   */
   Status CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, const OrtArenaCfg* arena_cfg = nullptr);
 
   /**
    * Returns the list of registered allocators in this env.
-   */
+   */
   const std::vector<AllocatorPtr>& GetRegisteredSharedAllocators() const {
     return shared_allocators_;
   }
 
   /**
    * Removes registered allocator that was previously registered for sharing between multiple sessions.
-   */
+   */
   Status UnregisterAllocator(const OrtMemoryInfo& mem_info);
 
+  Environment() = default;
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Environment);
-
-  Environment() = default;
   Status Initialize(std::unique_ptr<logging::LoggingManager> logging_manager,
                     const OrtThreadingOptions* tp_options = nullptr,
                     bool create_global_thread_pools = false);
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 9384ad8700..8c32416061 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -129,6 +129,7 @@ extern "C" {
 // Used in *.cc files. Almost the same as ORT_API_STATUS, except without ORT_MUST_USE_RESULT and ORT_EXPORT
 #define ORT_API_STATUS_IMPL(NAME, ...)
\ + GSL_SUPPRESS(r .11) \ _Success_(return == 0) _Check_return_ _Ret_maybenull_ OrtStatusPtr ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION #define ORT_CLASS_RELEASE(X) void(ORT_API_CALL * Release##X)(_Frees_ptr_opt_ Ort##X * input) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 048421099b..1358d13072 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -950,8 +950,14 @@ struct CustomOpBase : OrtCustomOp { OrtCustomOp::GetOutputType = [](const OrtCustomOp* this_, size_t index) { return static_cast(this_)->GetOutputType(index); }; OrtCustomOp::KernelCompute = [](void* op_kernel, OrtKernelContext* context) { static_cast(op_kernel)->Compute(context); }; +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 26409) +#endif OrtCustomOp::KernelDestroy = [](void* op_kernel) { delete static_cast(op_kernel); }; - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif OrtCustomOp::GetInputCharacteristic = [](const OrtCustomOp* this_, size_t index) { return static_cast(this_)->GetInputCharacteristic(index); }; OrtCustomOp::GetOutputCharacteristic = [](const OrtCustomOp* this_, size_t index) { return static_cast(this_)->GetOutputCharacteristic(index); }; } diff --git a/onnxruntime/contrib_ops/cpu/activations.h b/onnxruntime/contrib_ops/cpu/activations.h index a156af1405..3bd07bb27b 100644 --- a/onnxruntime/contrib_ops/cpu/activations.h +++ b/onnxruntime/contrib_ops/cpu/activations.h @@ -67,7 +67,7 @@ class Gelu : public OpKernel { concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); int64_t elem_count = input->Shape().Size(); - static const int64_t length_per_task = 4096; // this number comes from FastGelu. + constexpr int64_t length_per_task = 4096; // this number comes from FastGelu. 
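Two localized-suppression idioms recur in the hunks above: the GSL_SUPPRESS attribute on a single declaration, and a warning push/disable/pop scope around a single statement. A compressed sketch of both (illustrative, assuming the new core/common/gsl_suppress.h header is on the include path):

#include "core/common/gsl_suppress.h"

GSL_SUPPRESS(r.11)  // r.11: avoid raw new/delete; ownership intentionally passes to the caller
int* MakeState() { return new int(0); }

#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
#pragma warning(disable : 26409)  // raw delete is unavoidable at a C callback boundary
#endif
void DestroyState(void* p) { delete static_cast<int*>(p); }
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif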
int64_t task_count = (elem_count + length_per_task - 1) / length_per_task; concurrency::ThreadPool::TryBatchParallelFor( tp, static_cast(task_count), diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc index fed268ed9c..6398ff2852 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc @@ -8,8 +8,9 @@ #include using onnxruntime::rnn::detail::Allocate; -//TODO: fix the warnings +// TODO: fix the warnings #if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced #pragma warning(disable : 26451) #endif namespace onnxruntime { diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc index 0a13083d9a..02a09e88db 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc @@ -12,6 +12,7 @@ #include "core/framework/allocator.h" //TODO: fix the warnings #if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced #pragma warning(disable : 26451) #endif namespace onnxruntime { diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc index 9c0246347a..f084e72dcd 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc @@ -16,6 +16,7 @@ #endif //TODO: fix the warnings #if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced #pragma warning(disable : 26451) #endif using namespace onnxruntime::rnn::detail; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index 5ebb515575..db2c93e377 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -12,6 +12,7 @@ //TODO: fix the warnings #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push) +// Chance of arithmetic overflow could be reduced #pragma warning(disable : 26451) #endif namespace onnxruntime { diff --git a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc index f0548f66de..2ed4d32bf2 100644 --- a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc +++ b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "core/providers/cpu/object_detection/roialign.h" //TODO: fix the warnings #if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced #pragma warning(disable : 26451) #endif using namespace onnxruntime::concurrency; diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc index 6439b2c8f4..c88ffd2dce 100644 --- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc +++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc @@ -68,7 +68,7 @@ Status ReorderInput::Compute(OpKernelContext* context) const { if (channels_last_) { int64_t work_index = static_cast(work.start); - int64_t work_remaining = static_cast(work.end - work.start); + int64_t work_remaining = static_cast(work.end) - work.start; while (work_remaining > 0) { const int64_t batch_index = work_index / spatial_size; @@ -87,7 +87,7 @@ Status ReorderInput::Compute(OpKernelContext* context) const { } } else { int64_t work_index = static_cast(work.start) * nchwc_block_size; - int64_t work_remaining = static_cast(work.end - work.start) * nchwc_block_size; + int64_t work_remaining = (static_cast(work.end) - work.start) * nchwc_block_size; while (work_remaining > 0) { const int64_t batch_index = work_index / nchwc_channels; @@ -331,7 +331,7 @@ Status NchwcUpsample::Compute(OpKernelContext* context) const { auto upsample_worker = [&](ptrdiff_t batch) { auto work = concurrency::ThreadPool::PartitionWork(batch, worker_count, total_work); int64_t work_index = static_cast(work.start); - int64_t work_remaining = static_cast(work.end - work.start); + int64_t work_remaining = static_cast(work.end) - work.start; while (work_remaining > 0) { // Limit the current loop iteration to the same source image. diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_pool.cc index e97b654994..f06a80512a 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_pool.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_pool.cc @@ -119,7 +119,7 @@ struct QLinearPoolNhwc1DTask final { int64_t batch = begin / y_image_size; int64_t offset = begin % y_image_size; - for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) { + for (std::ptrdiff_t remains = end - begin; remains > 0; offset = 0, batch++) { if (offset + remains <= y_image_size) { operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains)); remains = 0; @@ -247,7 +247,7 @@ struct QLinearPoolNhwc2DTask final { int64_t batch = begin / y_image_size; int64_t offset = begin % y_image_size; - for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) { + for (int64_t remains = static_cast(end) - begin; remains > 0; offset = 0, batch++) { if (offset + remains <= y_image_size) { operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains)); remains = 0; @@ -268,7 +268,7 @@ struct QLinearPoolNhwc2DTask final { start_pw -= (start_ph * pooled_width); int64_t pool_index = channels * begin; - int64_t remains = end - begin; + std::ptrdiff_t remains = end - begin; std::vector Yh(channels); for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) { @@ -281,7 +281,7 @@ struct QLinearPoolNhwc2DTask final { wstart = std::max(wstart, static_cast(0)); // do the pooling here - float pool_init_value = PoolType::Initialize(); + constexpr float pool_init_value = PoolType::Initialize(); std::fill(Yh.data(), Yh.data() + channels, pool_init_value); for (int64_t h = hstart; h < hend; ++h) { int64_t 
input_index = channels * (h * width + wstart); @@ -415,7 +415,7 @@ struct QLinearPoolNhwc3DTask final { int64_t batch = begin / y_image_size; int64_t offset = begin % y_image_size; - for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) { + for (int64_t remains = static_cast(end) - begin; remains > 0; offset = 0, batch++) { if (offset + remains <= y_image_size) { operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains)); remains = 0; @@ -437,7 +437,7 @@ struct QLinearPoolNhwc3DTask final { int64_t start_pw = start_pd / pooled_depth; start_pd = start_pd - start_pw * pooled_depth; int64_t pool_index = channels * begin; - int64_t remains = end - begin; + int64_t remains = static_cast(end) - begin; std::vector Yh(channels); diff --git a/onnxruntime/contrib_ops/cpu/tokenizer.cc b/onnxruntime/contrib_ops/cpu/tokenizer.cc index d972692950..b26fa11fe2 100644 --- a/onnxruntime/contrib_ops/cpu/tokenizer.cc +++ b/onnxruntime/contrib_ops/cpu/tokenizer.cc @@ -93,7 +93,7 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) { re2::RE2::Options options; options.set_longest_match(true); for (const auto& sep : separators) { - std::unique_ptr regex(new re2::RE2(sep, options)); + std::unique_ptr regex = std::make_unique(sep, options); if (!regex->ok()) { ORT_THROW("Can not digest separators: ", sep, " ", regex->error()); } @@ -104,7 +104,7 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) { assert(!tokenexp.empty()); re2::RE2::Options options; options.set_longest_match(true); - std::unique_ptr regex(new re2::RE2(tokenexp, options)); + std::unique_ptr regex = std::make_unique(tokenexp, options); if (!regex->ok()) { ORT_THROW("Can not digest tokenexp: ", regex->error()); } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc index 1225442192..9925501b1d 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc @@ -30,6 +30,8 @@ #ifdef _MSC_VER #pragma warning(pop) +// Could reduce the chance of arithmetic overflow. TODO: fix it +#pragma warning(disable : 26451) #endif using namespace ONNX_NAMESPACE; @@ -431,10 +433,10 @@ Status BeamSearchImpl::ProcessLogits( Tensor::InitOrtValue(element_type, next_token_scores_shape, next_token_scores.data(), allocator->Info(), next_token_scores_value); const Tensor& input = next_token_scores_value.Get(); - const int axis = 1; + constexpr int axis = 1; const unsigned top_k = static_cast(2 * parameters_->num_beams); - const bool largest = true; - const bool sorted = true; // results returned in sorted order. + constexpr bool largest = true; + constexpr bool sorted = true; // results returned in sorted order. std::unique_ptr topk_scores; std::unique_ptr topk_indices; diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc index bb7aeb989e..d2bb36e0b1 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc @@ -10,7 +10,10 @@ #include "core/providers/cpu/tensor/utils.h" #include "core/providers/cpu/rnn/rnn_helpers.h" #include "beam_search_scorer.h" - +#ifdef _MSC_VER +// Could reduce the chance of arithmetic overflow. 
TODO: fix it +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace contrib { namespace transformers { @@ -126,7 +129,7 @@ void BeamSearchScorer::Initialize(AllocatorPtr& allocator, int sequence_lengt ORT_ENFORCE(next_beam_scores_.empty()); // Make sure this is called only once. size_t batch_beam_size = static_cast(batch_size_ * num_beams_); - const bool no_fill = false; // do not fill values after allocation + constexpr bool no_fill = false; // do not fill values after allocation next_beam_scores_ = Allocate(allocator, batch_beam_size, next_beam_scores_ptr_, no_fill); next_beam_tokens_ = Allocate(allocator, batch_beam_size, next_beam_tokens_ptr_, no_fill); next_beam_indices_ = Allocate(allocator, batch_beam_size, next_beam_indices_ptr_, no_fill); diff --git a/onnxruntime/contrib_ops/cpu/transformers/gpt_subgraph.cc b/onnxruntime/contrib_ops/cpu/transformers/gpt_subgraph.cc index 80825c7e26..db24804232 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/gpt_subgraph.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/gpt_subgraph.cc @@ -21,7 +21,10 @@ #ifdef _MSC_VER #pragma warning(pop) #endif - +#ifdef _MSC_VER +// Could reduce the chance of arithmetic overflow. TODO: fix it +#pragma warning(disable : 26451) +#endif using namespace ONNX_NAMESPACE; using namespace onnxruntime::common; diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc index 10ef8b6f69..ca8e722bf8 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc @@ -1,7 +1,10 @@ #include #include "logits_processor.h" #include "dump_tensor.h" - +#ifdef _MSC_VER +// Could reduce the chance of arithmetic overflow. TODO: fix it +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace contrib { namespace transformers { diff --git a/onnxruntime/contrib_ops/cpu/transformers/sequences.cc b/onnxruntime/contrib_ops/cpu/transformers/sequences.cc index a9c70ef410..87e6aabe95 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/sequences.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/sequences.cc @@ -1,5 +1,8 @@ #include "sequences.h" - +#ifdef _MSC_VER +// Could reduce the chance of arithmetic overflow. TODO: fix it +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace contrib { namespace transformers { diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc index eb709dcadc..8357388884 100644 --- a/onnxruntime/core/common/threadpool.cc +++ b/onnxruntime/core/common/threadpool.cc @@ -36,7 +36,10 @@ limitations under the License. #include #endif #endif - +#if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace concurrency { @@ -66,7 +69,7 @@ void ThreadPoolProfiler::Start() { ThreadPoolProfiler::MainThreadStat& ThreadPoolProfiler::GetMainThreadStat() { static thread_local std::unique_ptr stat; if (!stat) { - stat.reset(new MainThreadStat()); + stat = std::make_unique(); } return *stat; } @@ -336,8 +339,8 @@ class alignas(CACHE_LINE_BYTES) LoopCounter { // Hence, at low thread counts, each of N threads will get its own // shard representing 1/N of the work. 
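The constexpr conversions in the hunks above (axis, largest, sorted, no_fill, length_per_task, and so on) share one rationale: a value known at compile time should be declared constexpr, which guarantees compile-time evaluation and drops the static storage a function-local `static const` implies. A one-function illustration with made-up names:

#include <cstdint>

int64_t TaskCount(int64_t elem_count) {
  constexpr int64_t kLengthPerTask = 4096;  // was: static const int64_t length_per_task = 4096;
  return (elem_count + kLengthPerTask - 1) / kLengthPerTask;
}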
   constexpr static unsigned GetNumShards(uint64_t num_iterations,
-                                         uint64_t d_of_p,
-                                         uint64_t block_size) {
+                                                uint64_t d_of_p,
+                                                uint64_t block_size) {
     unsigned num_shards = 0;
     auto num_blocks = num_iterations / block_size;
     if (num_blocks == 0) {
@@ -376,10 +379,10 @@ ThreadPool::ThreadPool(Env* env,
     int threads_to_create = degree_of_parallelism - 1;
     extended_eigen_threadpool_ = std::make_unique<ThreadPoolTempl<Env> >(name,
-                                                                        threads_to_create,
-                                                                        low_latency_hint,
-                                                                        *env,
-                                                                        thread_options_);
+                                                                         threads_to_create,
+                                                                         low_latency_hint,
+                                                                         *env,
+                                                                         thread_options_);
     underlying_threadpool_ = extended_eigen_threadpool_.get();
   }
 }
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index 98a8bf4d98..c1d3366fdd 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -110,7 +110,10 @@ void CPUAllocator::Free(void* p) {
 }  // namespace onnxruntime
 
 std::ostream& operator<<(std::ostream& out, const OrtMemoryInfo& info) { return (out << info.ToString()); }
-
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push)
+#pragma warning(disable : 26409)
+#endif
 ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtAllocatorType type, int id1,
                     enum OrtMemType mem_type1, _Outptr_ OrtMemoryInfo** out) {
   if (strcmp(name1, onnxruntime::CPU) == 0) {
@@ -126,7 +129,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
   } else if (strcmp(name1, onnxruntime::OpenVINO_GPU) == 0) {
     *out = new OrtMemoryInfo(
         onnxruntime::OpenVINO_GPU, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)),
-       id1, mem_type1);
+        id1, mem_type1);
   } else if (strcmp(name1, onnxruntime::DML) == 0) {
     *out = new OrtMemoryInfo(
         onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)),
@@ -138,7 +141,9 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
 }
 
 ORT_API(void, OrtApis::ReleaseMemoryInfo, _Frees_ptr_opt_ OrtMemoryInfo* p) { delete p; }
-
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(pop)
+#endif
 ORT_API_STATUS_IMPL(OrtApis::MemoryInfoGetName, _In_ const OrtMemoryInfo* ptr, _Out_ const char** out) {
   *out = ptr->name;
   return nullptr;
@@ -164,4 +169,3 @@ ORT_API_STATUS_IMPL(OrtApis::CompareMemoryInfo, _In_ const OrtMemoryInfo* info1,
   *out = (*info1 == *info2) ? 0 : -1;
   return nullptr;
 }
-
diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc
index 57e4fea00b..9bbb2b419f 100644
--- a/onnxruntime/core/framework/bfc_arena.cc
+++ b/onnxruntime/core/framework/bfc_arena.cc
@@ -164,8 +164,19 @@ Status BFCArena::Extend(size_t rounded_bytes) {
   static constexpr float kBackpedalFactor = 0.9f;
 
   // Try allocating less memory.
   while (mem_addr == nullptr) {
+    // kBackpedalFactor is float and bytes is size_t, so bytes * kBackpedalFactor is a float. Casting it back to
+    // size_t, a narrower type, can lose data; that is what C4244 complains about, and the static_cast here is
+    // what suppresses it. C26451 suggests changing kBackpedalFactor to double for better accuracy, but doing so
+    // makes the AMD GPU CI build pipeline fail with an out-of-memory error, so the code is kept as-is and the
+    // warning is disabled instead.
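The comment above names the two diagnostics in play; an illustrative pair of functions, not from the ORT sources:

#include <cstddef>
#include <cstdint>

int64_t WidenedMultiply(int a, int b) {
  return static_cast<int64_t>(a) * b;  // a plain a * b multiplies in 32 bits first and draws C26451
}

size_t Backpedal(size_t bytes) {
  constexpr float kFactor = 0.9f;
  return static_cast<size_t>(bytes * kFactor);  // float -> size_t narrows; the cast marks it intentional (C4244)
}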
+#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 26451) +#endif bytes = RoundedBytes(static_cast(bytes * kBackpedalFactor)); - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif // give up if we can't satisfy the requested size, or we're attempting an allocation of less than 8K. // // the latter protects against an infinite loop that occurs when bytes is less than 2560. at that point the 10% diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h index f38d6b64e0..e664ba50d1 100644 --- a/onnxruntime/core/framework/bfc_arena.h +++ b/onnxruntime/core/framework/bfc_arena.h @@ -211,7 +211,7 @@ class BFCArena : public IAllocator { ORT_ENFORCE(0 == memory_size % kMinAllocationSize); const size_t n_handles = (memory_size + kMinAllocationSize - 1) / kMinAllocationSize; - handles_ = new ChunkHandle[n_handles]; + handles_ = std::make_unique(n_handles); for (size_t i = 0; i < n_handles; i++) { handles_[i] = kInvalidChunkHandle; } @@ -219,7 +219,7 @@ class BFCArena : public IAllocator { AllocationRegion() = default; - ~AllocationRegion() { delete[] handles_; } + ~AllocationRegion() = default; AllocationRegion(AllocationRegion&& other) noexcept { Swap(other); } @@ -266,7 +266,7 @@ class BFCArena : public IAllocator { // Array of size "memory_size / kMinAllocationSize". It is // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle // for the memory allocation represented by "p" - ChunkHandle* handles_ = nullptr; + std::unique_ptr handles_; ORT_DISALLOW_ASSIGNMENT(AllocationRegion); }; diff --git a/onnxruntime/core/framework/callback.cc b/onnxruntime/core/framework/callback.cc index deb4d1e277..7e288b1a66 100644 --- a/onnxruntime/core/framework/callback.cc +++ b/onnxruntime/core/framework/callback.cc @@ -2,7 +2,9 @@ // Licensed under the MIT License. #include "core/framework/callback.h" - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif namespace onnxruntime { void OrtRunCallback(OrtCallback* f) noexcept { if (f == nullptr) return; diff --git a/onnxruntime/core/framework/data_types.cc b/onnxruntime/core/framework/data_types.cc index dc66fba500..90afec10e9 100644 --- a/onnxruntime/core/framework/data_types.cc +++ b/onnxruntime/core/framework/data_types.cc @@ -314,7 +314,10 @@ struct TensorTypeBase::Impl : public data_types_internal::TypeProtoImpl { const ONNX_NAMESPACE::TypeProto* TensorTypeBase::GetTypeProto() const { return impl_->GetProto(); } - +//TODO: Fix the warning +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif TensorTypeBase::TensorTypeBase() : DataTypeImpl{DataTypeImpl::GeneralType::kTensor, sizeof(Tensor)}, impl_(new Impl()) {} diff --git a/onnxruntime/core/framework/error_code.cc b/onnxruntime/core/framework/error_code.cc index 0bcad00233..d1d509fbe0 100644 --- a/onnxruntime/core/framework/error_code.cc +++ b/onnxruntime/core/framework/error_code.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "core/common/gsl_suppress.h" #include "core/session/onnxruntime_c_api.h" #include "core/session/ort_apis.h" #include "core/common/status.h" @@ -66,5 +67,7 @@ ORT_API(OrtErrorCode, OrtApis::GetErrorCode, _In_ const OrtStatus* status) { ORT_API(const char*, OrtApis::GetErrorMessage, _In_ const OrtStatus* status) { return status->msg; } - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif ORT_API(void, OrtApis::ReleaseStatus, _Frees_ptr_opt_ OrtStatus* value) { delete[] reinterpret_cast(value); } diff --git a/onnxruntime/core/framework/func_kernel.h b/onnxruntime/core/framework/func_kernel.h index ece2c24304..9bf7a5bff6 100644 --- a/onnxruntime/core/framework/func_kernel.h +++ b/onnxruntime/core/framework/func_kernel.h @@ -13,21 +13,29 @@ void release_helper_func(void* allocator, void* p); //A kernel that wrapper the ComputeFunction call generated by execution provider when fuse the sub-graph class FunctionKernel : public OpKernel { public: + explicit FunctionKernel(const OpKernelInfo& info, const NodeComputeInfo* compute) : OpKernel(info), compute_info_(compute) {} + //The original design is we load the dll, find the entry point and wrapper it. //Here for quick prototype, we keep the entry pointer in the node. - explicit FunctionKernel(const OpKernelInfo& info) : OpKernel(info) { - num_inputs_ = info.node().InputDefs().size(); - num_outputs_ = info.node().OutputDefs().size(); - auto status = info.GetFusedFuncs(compute_info_); - ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); - if (compute_info_->create_state_func) { + static Status Create(FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr& out) { + const NodeComputeInfo* compute; + ORT_RETURN_IF_ERROR(func_mgr.GetFuncs(info.node().Name(), compute)); + std::unique_ptr funckernel = std::make_unique(info, compute); + funckernel->num_inputs_ = info.node().InputDefs().size(); + funckernel->num_outputs_ = info.node().OutputDefs().size(); + + if (compute->create_state_func) { //TODO: we are only provide host allocate method in compute context. //Do we need to hold the ref-counting here? - host_allocator_ = info.GetAllocator(0, OrtMemType::OrtMemTypeDefault); - ComputeContext context = {allocate_helper_func, release_helper_func, host_allocator_.get(), + funckernel->host_allocator_ = info.GetAllocator(0, OrtMemType::OrtMemTypeDefault); + ComputeContext context = {allocate_helper_func, release_helper_func, funckernel->host_allocator_.get(), info.node().Name().c_str()}; - ORT_ENFORCE(compute_info_->create_state_func(&context, &func_state_) == 0); + int ret = funckernel->compute_info_->create_state_func(&context, &funckernel->func_state_); + if (ret != 0) + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Create state function failed. 
Return value:", ret); } + out = std::move(funckernel); + return Status::OK(); } ~FunctionKernel() override { @@ -38,12 +46,14 @@ class FunctionKernel : public OpKernel { virtual Status Compute(OpKernelContext* context) const override { auto* context_internal = static_cast(context); - return compute_info_->compute_func(func_state_, OrtGetApiBase()->GetApi(ORT_API_VERSION), + const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); + if (api == nullptr) return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "API VERSION ", ORT_API_VERSION, " is invalid."); + return compute_info_->compute_func(func_state_, api, reinterpret_cast(context_internal)); } private: - NodeComputeInfo* compute_info_{nullptr}; + const NodeComputeInfo* const compute_info_; FunctionState func_state_{nullptr}; size_t num_inputs_; size_t num_outputs_; diff --git a/onnxruntime/core/framework/fuse_nodes_funcs.cc b/onnxruntime/core/framework/fuse_nodes_funcs.cc index f4678afbed..d75b2eba16 100644 --- a/onnxruntime/core/framework/fuse_nodes_funcs.cc +++ b/onnxruntime/core/framework/fuse_nodes_funcs.cc @@ -22,7 +22,7 @@ Status FuncManager::AddFuncInfo(const std::string& name, NodeComputeInfo&& compu return Status::OK(); } -Status FuncManager::GetFuncs(const std::string& name, NodeComputeInfo*& compute_info) const { +Status FuncManager::GetFuncs(const std::string& name, const NodeComputeInfo*& compute_info) { auto it = fused_funcs_->find(name); if (it == fused_funcs_->end()) return Status(common::ONNXRUNTIME, common::FAIL, "func info for node: " + name + " not found."); @@ -30,7 +30,7 @@ Status FuncManager::GetFuncs(const std::string& name, NodeComputeInfo*& compute_ if (!it->second.compute_info.compute_func) { //load from path void* handle = nullptr; - ORT_RETURN_IF_ERROR(lib_loader_->LoadExternalLib(it->second.dso_path, &handle)); + ORT_RETURN_IF_ERROR(lib_loader_.LoadExternalLib(it->second.dso_path, &handle)); void* create_func_symbol_handle = nullptr; ORT_RETURN_IF_ERROR(Env::Default().GetSymbolFromLibrary(handle, kCreateStateFuncSymbol + name, diff --git a/onnxruntime/core/framework/fuse_nodes_funcs.h b/onnxruntime/core/framework/fuse_nodes_funcs.h index 665f7d5d22..628880b066 100644 --- a/onnxruntime/core/framework/fuse_nodes_funcs.h +++ b/onnxruntime/core/framework/fuse_nodes_funcs.h @@ -8,15 +8,15 @@ namespace onnxruntime { class FuncManager { public: FuncManager() - : fused_funcs_(std::make_shared >()), - lib_loader_(std::make_unique()) { + : fused_funcs_(std::make_shared >()) { } Status AddFuncInfo(const std::string& name, const std::string& dll_path); Status AddFuncInfo(const std::string& name, NodeComputeInfo&& compute_info); - Status GetFuncs(const std::string& name, NodeComputeInfo*& compute_info) const; + //Do not call AddFuncInfo after this function is called. 
+ Status GetFuncs(const std::string& name, const NodeComputeInfo*& compute_info); size_t NumFuncs() const { return fused_funcs_->size(); } @@ -38,7 +38,7 @@ class FuncManager { // because it's filled in by the time main graph is traversed, // while subgraph session state is created later std::shared_ptr > fused_funcs_; - std::unique_ptr lib_loader_; + ExLibLoader lib_loader_; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(FuncManager); }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index eff7992ce4..1ebccacad4 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -258,10 +258,9 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, bool export_dll, FuncMa KernelDefBuilder builder; BuildFusedKernelDef(builder, *node); ORT_RETURN_IF_ERROR(fused_kernel_registry.Register(builder, - static_cast( - [](const OpKernelInfo& info) -> OpKernel* { - return new FunctionKernel(info); - }))); + [](FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + return FunctionKernel::Create(func_mgr, info, out); + })); } } else { @@ -298,10 +297,9 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, bool export_dll, FuncMa KernelDefBuilder builder; BuildFusedKernelDef(builder, metadef, type); ORT_RETURN_IF_ERROR(fused_kernel_registry.Register(builder, - static_cast( - [](const OpKernelInfo& info) -> OpKernel* { - return new FunctionKernel(info); - }))); + [](FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + return FunctionKernel::Create(func_mgr, info, out); + })); // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one graph.FinalizeFuseSubGraph(indexed_sub_graph, *node); @@ -477,10 +475,10 @@ static Status PartitionOrtFormatModelImpl(Graph& graph, FuncManager& func_mgr, } ORT_RETURN_IF_ERROR(fused_kernel_registry.Register( - KernelCreateInfo(std::move(kernel_def), static_cast( - [](const OpKernelInfo& info) -> OpKernel* { - return new FunctionKernel(info); - })))); + KernelCreateInfo(std::move(kernel_def), + [](FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + return FunctionKernel::Create(func_mgr, info, out); + }))); // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one graph.FinalizeFuseSubGraph(indexed_sub_graph, node); diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc index 07db21c6b2..de6c47e018 100644 --- a/onnxruntime/core/framework/kernel_registry.cc +++ b/onnxruntime/core/framework/kernel_registry.cc @@ -206,7 +206,7 @@ Status KernelRegistry::TryCreateKernel(const Node& node, const IExecutionProvider& execution_provider, const std::unordered_map& constant_initialized_tensors, const OrtValueNameIdxMap& ort_value_name_idx_map, - const FuncManager& funcs_mgr, + FuncManager& funcs_mgr, const DataTransferManager& data_transfer_mgr, /*out*/ std::unique_ptr& op_kernel) const { const KernelCreateInfo* kernel_create_info = nullptr; @@ -216,10 +216,8 @@ Status KernelRegistry::TryCreateKernel(const Node& node, execution_provider, constant_initialized_tensors, ort_value_name_idx_map, - funcs_mgr, data_transfer_mgr); - op_kernel.reset(kernel_create_info->kernel_create_func(kernel_info)); - return Status::OK(); + return kernel_create_info->kernel_create_func(funcs_mgr, kernel_info, op_kernel); } static 
std::string ToString(const std::vector& error_strs) { diff --git a/onnxruntime/core/framework/kernel_registry_manager.cc b/onnxruntime/core/framework/kernel_registry_manager.cc index e2137f072a..2041b0d7ba 100644 --- a/onnxruntime/core/framework/kernel_registry_manager.cc +++ b/onnxruntime/core/framework/kernel_registry_manager.cc @@ -13,18 +13,17 @@ using namespace ONNX_NAMESPACE; using namespace ::onnxruntime::common; namespace onnxruntime { -std::unique_ptr KernelRegistryManager::CreateKernel(const onnxruntime::Node& node, - const IExecutionProvider& execution_provider, - const SessionState& session_state, - const KernelCreateInfo& kernel_create_info) const { +Status KernelRegistryManager::CreateKernel(const onnxruntime::Node& node, + const IExecutionProvider& execution_provider, + SessionState& session_state, + const KernelCreateInfo& kernel_create_info, + std::unique_ptr& out) const { OpKernelInfo kernel_info(node, *kernel_create_info.kernel_def, execution_provider, session_state.GetConstantInitializedTensors(), session_state.GetOrtValueNameIdxMap(), - session_state.GetFuncMgr(), session_state.GetDataTransferMgr()); - // OpKernel is abstract base class so can't use make_unique - return std::unique_ptr(kernel_create_info.kernel_create_func(kernel_info)); + return kernel_create_info.kernel_create_func(session_state.GetMutableFuncMgr(), kernel_info, out); } Status KernelRegistryManager::RegisterKernels(const ExecutionProviders& execution_providers) { diff --git a/onnxruntime/core/framework/kernel_registry_manager.h b/onnxruntime/core/framework/kernel_registry_manager.h index 6a4fb6defc..4d9da148af 100644 --- a/onnxruntime/core/framework/kernel_registry_manager.h +++ b/onnxruntime/core/framework/kernel_registry_manager.h @@ -77,10 +77,10 @@ class KernelRegistryManager { bool SearchKernelRegistriesByHash(HashValue kernel_def_hash, const KernelCreateInfo** kernel_create_info) const; - std::unique_ptr CreateKernel(const onnxruntime::Node& node, - const IExecutionProvider& execution_provider, - const SessionState& session_state, - const KernelCreateInfo& kernel_create_info) const ORT_MUST_USE_RESULT; + Status CreateKernel(const onnxruntime::Node& node, + const IExecutionProvider& execution_provider, + SessionState& session_state, + const KernelCreateInfo& kernel_create_info, std::unique_ptr& out) const; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(KernelRegistryManager); diff --git a/onnxruntime/core/framework/onnxruntime_map_type_info.cc b/onnxruntime/core/framework/onnxruntime_map_type_info.cc index e0b5ffc538..9b18ba6703 100644 --- a/onnxruntime/core/framework/onnxruntime_map_type_info.cc +++ b/onnxruntime/core/framework/onnxruntime_map_type_info.cc @@ -32,7 +32,9 @@ ToONNXTensorElementDataType(ONNX_NAMESPACE::TensorProto_DataType data_type) { default: { return ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; } } } - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif OrtStatus* OrtMapTypeInfo::FromTypeProto(const ONNX_NAMESPACE::TypeProto* type_proto, OrtMapTypeInfo** out) { auto value_case = type_proto->value_case(); if (value_case != ONNX_NAMESPACE::TypeProto::kMapType) diff --git a/onnxruntime/core/framework/onnxruntime_sequence_type_info.cc b/onnxruntime/core/framework/onnxruntime_sequence_type_info.cc index 18427fd5fc..acae583c7a 100644 --- a/onnxruntime/core/framework/onnxruntime_sequence_type_info.cc +++ b/onnxruntime/core/framework/onnxruntime_sequence_type_info.cc @@ -6,21 +6,20 @@ #include "core/session/ort_apis.h" #include 
"core/framework/error_code_helper.h" -OrtSequenceTypeInfo::OrtSequenceTypeInfo(OrtTypeInfo* sequence_key_type) noexcept : - sequence_key_type_(sequence_key_type, &OrtApis::ReleaseTypeInfo) { +OrtSequenceTypeInfo::OrtSequenceTypeInfo(OrtTypeInfo* sequence_key_type) noexcept : sequence_key_type_(sequence_key_type, &OrtApis::ReleaseTypeInfo) { } - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif OrtStatus* OrtSequenceTypeInfo::FromTypeProto(const ONNX_NAMESPACE::TypeProto* type_proto, OrtSequenceTypeInfo** out) { auto value_case = type_proto->value_case(); - if (value_case != ONNX_NAMESPACE::TypeProto::kSequenceType) - { - return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "type_proto is not of type sequence!");; + if (value_case != ONNX_NAMESPACE::TypeProto::kSequenceType) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "type_proto is not of type sequence!"); } auto type_proto_sequence = type_proto->sequence_type(); OrtTypeInfo* sequence_key_type_info = nullptr; - if (auto status = OrtTypeInfo::FromTypeProto(&type_proto_sequence.elem_type(), &sequence_key_type_info)) - { + if (auto status = OrtTypeInfo::FromTypeProto(&type_proto_sequence.elem_type(), &sequence_key_type_info)) { return status; } @@ -30,8 +29,7 @@ OrtStatus* OrtSequenceTypeInfo::FromTypeProto(const ONNX_NAMESPACE::TypeProto* t OrtStatus* OrtSequenceTypeInfo::Clone(OrtSequenceTypeInfo** out) { OrtTypeInfo* sequence_key_type_copy = nullptr; - if (auto status = sequence_key_type_->Clone(&sequence_key_type_copy)) - { + if (auto status = sequence_key_type_->Clone(&sequence_key_type_copy)) { return status; } *out = new OrtSequenceTypeInfo(sequence_key_type_copy); diff --git a/onnxruntime/core/framework/onnxruntime_typeinfo.cc b/onnxruntime/core/framework/onnxruntime_typeinfo.cc index 5a26fe2066..e3a07f84ef 100644 --- a/onnxruntime/core/framework/onnxruntime_typeinfo.cc +++ b/onnxruntime/core/framework/onnxruntime_typeinfo.cc @@ -27,7 +27,9 @@ using onnxruntime::Tensor; using onnxruntime::TensorShape; namespace on = ONNX_NAMESPACE; - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif OrtTypeInfo::OrtTypeInfo(ONNXType type1) noexcept : type(type1) { } diff --git a/onnxruntime/core/framework/onnxruntime_typeinfo.h b/onnxruntime/core/framework/onnxruntime_typeinfo.h index 5b9145d32e..67e270f5be 100644 --- a/onnxruntime/core/framework/onnxruntime_typeinfo.h +++ b/onnxruntime/core/framework/onnxruntime_typeinfo.h @@ -4,6 +4,7 @@ #pragma once #include #include +#include "core/common/gsl_suppress.h" #include "core/session/onnxruntime_c_api.h" namespace onnxruntime { diff --git a/onnxruntime/core/framework/op_kernel_info.cc b/onnxruntime/core/framework/op_kernel_info.cc index 4be25b5fe7..0ae43026e6 100644 --- a/onnxruntime/core/framework/op_kernel_info.cc +++ b/onnxruntime/core/framework/op_kernel_info.cc @@ -13,7 +13,6 @@ OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node, const IExecutionProvider& execution_provider, const std::unordered_map& constant_initialized_tensors, const OrtValueNameIdxMap& ort_value_name_idx_map, - const FuncManager& funcs_mgr, const DataTransferManager& data_transfer_mgr) : OpNodeProtoHelper(&proto_helper_context_), node_(node), @@ -21,13 +20,12 @@ OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node, execution_provider_(&execution_provider), constant_initialized_tensors_(constant_initialized_tensors), ort_value_name_idx_map_(ort_value_name_idx_map), - funcs_mgr_(funcs_mgr), data_transfer_mgr_(data_transfer_mgr), 
proto_helper_context_(node) {} OpKernelInfo::OpKernelInfo(const OpKernelInfo& other) : OpKernelInfo(other.node_, other.kernel_def_, *other.execution_provider_, other.constant_initialized_tensors_, - other.ort_value_name_idx_map_, other.funcs_mgr_, other.data_transfer_mgr_) {} + other.ort_value_name_idx_map_, other.data_transfer_mgr_) {} const OrtMemoryInfo& OpKernelInfo::GetMemoryInfo(int device_id, OrtMemType mem_type) const { AllocatorPtr alloc = GetAllocator(device_id, mem_type); @@ -79,7 +77,4 @@ bool OpKernelInfo::TryGetConstantInput(int input_index, const Tensor** constant_ return true; } -common::Status OpKernelInfo::GetFusedFuncs(NodeComputeInfo*& compute_info) const { - return funcs_mgr_.GetFuncs(node_.Name(), compute_info); -} } // namespace onnxruntime diff --git a/onnxruntime/core/framework/run_options.cc b/onnxruntime/core/framework/run_options.cc index 0a62265515..95c111009c 100644 --- a/onnxruntime/core/framework/run_options.cc +++ b/onnxruntime/core/framework/run_options.cc @@ -5,7 +5,9 @@ #include "core/session/onnxruntime_c_api.h" #include "core/session/ort_apis.h" #include "core/framework/error_code_helper.h" - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif ORT_API_STATUS_IMPL(OrtApis::CreateRunOptions, _Outptr_ OrtRunOptions** out) { API_IMPL_BEGIN *out = new OrtRunOptions(); diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 2f17779238..09e679a657 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -5,6 +5,7 @@ #include #include +#include "core/common/gsl_suppress.h" #include "core/session/onnxruntime_c_api.h" #include "core/optimizer/graph_transformer_level.h" #include "core/util/thread_utils.h" diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 0414d99553..f93b912618 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -164,7 +164,7 @@ Status SessionState::CreateKernels(const KernelRegistryManager& kernel_registry_ max_nodeid = std::max(max_nodeid, node.Index()); } session_kernels_.clear(); - session_kernels_.resize(max_nodeid + 1, nullptr); + session_kernels_.resize(max_nodeid + 1); for (const auto& node : nodes) { // construct and save the kernels const KernelCreateInfo& kci = GetNodeKernelCreateInfo(node.Index()); @@ -173,10 +173,8 @@ Status SessionState::CreateKernels(const KernelRegistryManager& kernel_registry_ onnxruntime::ProviderType exec_provider_name = node.GetExecutionProviderType(); const IExecutionProvider& exec_provider = *execution_providers_.Get(exec_provider_name); - auto op_kernel = kernel_registry_manager.CreateKernel(node, exec_provider, *this, kci); - // assumes vector is already resize()'ed to the number of nodes in the graph - session_kernels_[node.Index()] = op_kernel.release(); + ORT_RETURN_IF_ERROR(kernel_registry_manager.CreateKernel(node, exec_provider, *this, kci, session_kernels_[node.Index()])); } } node_index_info_ = std::make_unique(*graph_viewer_, ort_value_name_idx_map_); diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index bc3be7b01f..b8ec5cb4f8 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -109,9 +109,6 @@ class SessionState { } ~SessionState() { - for (auto* p : session_kernels_) { - delete p; - } for (auto& kvp : 
deleter_for_initialized_tensors_) { kvp.second.f(kvp.second.param); } @@ -124,11 +121,11 @@ class SessionState { // Get kernel for specified node. // It should be called right before graph execution only. const OpKernel* GetKernel(size_t node_id) const { - return (node_id < session_kernels_.size()) ? session_kernels_[node_id] : nullptr; + return (node_id < session_kernels_.size()) ? session_kernels_[node_id].get() : nullptr; } OpKernel* GetMutableKernel(size_t node_id) { - return (node_id < session_kernels_.size()) ? session_kernels_[node_id] : nullptr; + return (node_id < session_kernels_.size()) ? session_kernels_[node_id].get() : nullptr; } const ExecutionProviders& GetExecutionProviders() const noexcept { return execution_providers_; } @@ -144,27 +141,27 @@ class SessionState { const OrtValueNameIdxMap& GetOrtValueNameIdxMap() const noexcept { return ort_value_name_idx_map_; } /** - * Adds an initialized tensor (weight) so that it can be used by the - * execution frame to setup the appropriate OrtValue vectors. - * This function will take a shallow copy of d if d is not NULL. - * If 'constant' is true the tensor value cannot be overridden by an input at runtime. - * If 'sparse' is true the tensor value represents a densified weight that was initially stored in the model - * as sparse tensor. - */ + * Adds an initialized tensor (weight) so that it can be used by the + * execution frame to set up the appropriate OrtValue vectors. + * This function will take a shallow copy of d if d is not NULL. + * If 'constant' is true the tensor value cannot be overridden by an input at runtime. + * If 'sparse' is true the tensor value represents a densified weight that was initially stored in the model + * as a sparse tensor. + */ Status AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d, bool constant, bool sparse); /** - * Gets the map of ort_value_index to initialized tensors (weights) so that it can be used by the - * execution frame to setup the appropriate OrtValue vectors. - * The lifetime of returned OrtValues are limited by this SessionState object. - */ + * Gets the map of ort_value_index to initialized tensors (weights) so that it can be used by the + * execution frame to set up the appropriate OrtValue vectors. + * The lifetime of the returned OrtValues is limited by this SessionState object. + */ const std::unordered_map& GetInitializedTensors() const; /** - * Gets the map of ort_value_index to initialized tensors (e.g. weights) that are constant - * and cannot be overridden at runtime. - * The lifetime of returned OrtValues are limited by this SessionState object. - */ + * Gets the map of ort_value_index to initialized tensors (e.g. weights) that are constant + * and cannot be overridden at runtime. + * The lifetime of the returned OrtValues is limited by this SessionState object. + */ const std::unordered_map& GetConstantInitializedTensors() const; #if !defined(DISABLE_SPARSE_TENSORS) @@ -355,9 +352,9 @@ class SessionState { void CleanInitializedTensorsFromGraph(); /** - * Prepack the constant initialized tensors for better performance. - * The original constant initialized tensors will be removed to save memory. - */ + * Prepack the constant initialized tensors for better performance. + * The original constant initialized tensors will be removed to save memory.
+ */ Status PrepackConstantInitializedTensors(std::unordered_map& constant_initializers_use_count, const std::unordered_map& initializers_to_share_map); @@ -402,7 +399,7 @@ class SessionState { std::unordered_map compiled_kernel_hashes_; // cache of the constructed kernels to avoid spending construction time per executor - std::vector session_kernels_; + std::vector> session_kernels_; Graph& graph_; std::unique_ptr graph_viewer_; // GraphViewer for const access to Graph diff --git a/onnxruntime/core/framework/sparse_tensor.cc b/onnxruntime/core/framework/sparse_tensor.cc index 223670392a..83ada96c05 100644 --- a/onnxruntime/core/framework/sparse_tensor.cc +++ b/onnxruntime/core/framework/sparse_tensor.cc @@ -22,7 +22,7 @@ std::ostream& operator<<(std::ostream& os, SparseFormat flags) { namespace { // Round up size to a multiple of int64 constexpr size_t kIndexAlignment = alignof(int64_t); -inline int64_t Roundup(int64_t size) { +constexpr inline int64_t Roundup(int64_t size) { return ((SafeInt(size) + kIndexAlignment - 1) / kIndexAlignment) * kIndexAlignment; } @@ -31,7 +31,7 @@ inline int64_t Roundup(int64_t size) { /// after data and make sure indices start at int64_t aligned place /// /// -inline int64_t CalculateRequiredBufferSize(int64_t data_size, int64_t indices_size) { +constexpr inline int64_t CalculateRequiredBufferSize(int64_t data_size, int64_t indices_size) { return SafeInt(Roundup(data_size)) + indices_size; } diff --git a/onnxruntime/core/framework/tensor_type_and_shape.cc b/onnxruntime/core/framework/tensor_type_and_shape.cc index e1dafc2f83..5f29fabcec 100644 --- a/onnxruntime/core/framework/tensor_type_and_shape.cc +++ b/onnxruntime/core/framework/tensor_type_and_shape.cc @@ -24,7 +24,9 @@ using onnxruntime::MLFloat16; using onnxruntime::SparseTensor; #endif using onnxruntime::Tensor; - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif ORT_API_STATUS_IMPL(OrtApis::CreateTensorTypeAndShapeInfo, _Outptr_ OrtTensorTypeAndShapeInfo** out) { API_IMPL_BEGIN *out = new OrtTensorTypeAndShapeInfo(); @@ -317,10 +319,10 @@ ORT_API_STATUS_IMPL(OrtApis::GetValueType, _In_ const OrtValue* v, _Out_ ONNXTyp } /** -* Get the type information of an OrtValue -* \param value -* \return The returned value should be freed by OrtReleaseTypeInfo after use -*/ + * Get the type information of an OrtValue + * \param value + * \return The returned value should be freed by OrtReleaseTypeInfo after use + */ ORT_API_STATUS_IMPL(OrtApis::GetTypeInfo, _In_ const OrtValue* v, _Outptr_result_maybenull_ struct OrtTypeInfo** out) { API_IMPL_BEGIN // TODO: This is consistent with the previous implementation but inconsistent with GetValueType which returns diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index ca7416fce0..2b6d2bbed1 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -533,7 +533,9 @@ ORT_API(void, OrtUninitializeBuffer, _In_opt_ void* input, size_t input_len, enu ptr[i].~string(); } } - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif class AutoDelete { public: OrtCallback d{nullptr, nullptr}; @@ -594,7 +596,7 @@ static Status GetFileContent( * @param tensor_proto tensor data in protobuf format * @param tensorp pre-allocated tensor object, where we store the data * @return -*/ + */ Status TensorProtoToTensor(const Env& env, const ORTCHAR_T* model_path, const ONNX_NAMESPACE::TensorProto& 
tensor_proto, Tensor& tensor) { @@ -830,7 +832,7 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n break; } #else - ORT_UNUSED_PARAMETER(model_path); + ORT_UNUSED_PARAMETER(model_path); #endif default: ORT_THROW("Unsupported attribute value type of ", constant_attribute.type(), @@ -1098,7 +1100,6 @@ inline void CopyElement(void* dst, const void* src, int64_t dst_index, reinterpret_cast(dst)[dst_index] = reinterpret_cast(src)[src_index]; } - template static void SetIndices(gsl::span gathered_indices, std::string& raw_indices, @@ -1109,7 +1110,8 @@ static void SetIndices(gsl::span gathered_indices, for (auto src_index : gathered_indices) { ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) { ind_dest[dest_index] = static_cast(src_index); - } else { + } + else { auto* dst = ind_dest + dest_index; T v = static_cast(src_index); memcpy(dst, &v, sizeof(T)); diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 840b0321ac..bf76d3bcbd 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -563,14 +563,14 @@ static common::Status ExecuteGraphImpl(const SessionState& session_state, const logging::Logger& logger, const bool only_execute_path_to_fetches = false) { std::unique_ptr p_exec; if (execution_mode == ExecutionMode::ORT_SEQUENTIAL) { - p_exec = std::unique_ptr(new SequentialExecutor(terminate_flag, only_execute_path_to_fetches)); + p_exec = std::make_unique(terminate_flag, only_execute_path_to_fetches); } else if (execution_mode == ExecutionMode::ORT_PARALLEL) { auto* p_inter_op_thread_pool = session_state.GetInterOpThreadPool(); if (!p_inter_op_thread_pool) { LOGS(logger, WARNING) << "Only one thread was configured for parallel execution. Hence will use sequential execution."; - p_exec = std::unique_ptr(new SequentialExecutor(terminate_flag, only_execute_path_to_fetches)); + p_exec = std::make_unique(terminate_flag, only_execute_path_to_fetches); } else { - p_exec = std::unique_ptr(new ParallelExecutor(session_state, terminate_flag)); + p_exec = std::make_unique(session_state, terminate_flag); } } diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3e81f43850..f854839abe 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -705,7 +705,7 @@ flatbuffers::Offset Node::SaveEdgesToOrtFormat(flatbuffers::FlatB Status Node::LoadFromOrtFormat(const onnxruntime::fbs::Node& fbs_node, Graph& graph, const logging::Logger& logger, std::unique_ptr& node) { - node.reset(new Node(fbs_node.index(), graph)); + node = std::make_unique(fbs_node.index(), graph); return node->LoadFromOrtFormat(fbs_node, logger); } @@ -860,9 +860,9 @@ void Node::CreateSubgraph(const std::string& attr_name) { if (attr != attributes_.cend() && utils::HasGraph(attr->second)) { GraphProto& mutable_graph = *attr->second.mutable_g(); - std::unique_ptr subgraph{new Graph(*graph_, *this, mutable_graph)}; + std::unique_ptr subgraph = std::make_unique(*graph_, *this, mutable_graph); attr_to_subgraph_map_.insert({std::string(attr_name), gsl::not_null{subgraph.get()}}); - subgraphs_.push_back(std::move(subgraph)); + subgraphs_.emplace_back(std::move(subgraph)); } } @@ -3999,7 +3999,7 @@ Status Graph::InlineFunction(Node& node) { // main graph. 
const Graph& subgraph = node.GetFunctionBody()->Body(); auto output_edges = node.GetRelationships().output_edges; - for (auto output_edge : output_edges) { + for (const auto& output_edge : output_edges) { RemoveEdge(node.Index(), output_edge.GetNode().Index(), output_edge.GetSrcArgIndex(), output_edge.GetDstArgIndex()); } @@ -4130,7 +4130,7 @@ void Graph::SetNodeArgType(NodeArg& arg, const ONNX_NAMESPACE::TypeProto& type_p GraphResolveNeeded(true); } -#endif // !defined(ORT_MINIMAL_BUILD) +#endif // !defined(ORT_MINIMAL_BUILD) Graph::~Graph() { // nothing to do, but we put it here so we don't need to fully define types in Graph that are held in unique_ptr @@ -4216,12 +4216,11 @@ Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph, IOnnxRuntimeOpSchemaCollectionPtr schema_registry, #endif const logging::Logger& logger, std::unique_ptr& graph) { - // can't use make_unique as we're calling a private ctor - graph.reset(new Graph(owning_model, domain_to_version, + graph = std::make_unique(owning_model, domain_to_version, #if !defined(ORT_MINIMAL_BUILD) - schema_registry, + schema_registry, #endif - nullptr, nullptr, logger)); + nullptr, nullptr, logger); ORT_RETURN_IF_ERROR(graph->LoadFromOrtFormat(fbs_graph)); @@ -4240,14 +4239,13 @@ Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph, Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph, Graph& parent_graph, const Node& parent_node, const logging::Logger& logger, std::unique_ptr& graph) { - // can't use make_unique as we're calling a private ctor - graph.reset(new Graph(parent_graph.owning_model_, - parent_graph.domain_to_version_, + graph = std::make_unique(parent_graph.owning_model_, + parent_graph.domain_to_version_, #if !defined(ORT_MINIMAL_BUILD) - parent_graph.schema_registry_, + parent_graph.schema_registry_, #endif - &parent_graph, &parent_node, - logger)); + &parent_graph, &parent_node, + logger); return graph->LoadFromOrtFormat(fbs_graph); } @@ -4348,8 +4346,7 @@ common::Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph ORT_RETURN_IF(nullptr == fbs_value_info, "NodeArg is missing. 
Invalid ORT format model."); NodeArgInfo node_arg_info; ORT_RETURN_IF_ERROR(fbs::utils::LoadValueInfoOrtFormat(*fbs_value_info, node_arg_info)); - // NodeArg ctor is private, cannot use make_unique - node_args_[fbs_value_info->name()->str()] = std::unique_ptr(new NodeArg(std::move(node_arg_info))); + node_args_[fbs_value_info->name()->str()] = std::make_unique(std::move(node_arg_info)); } } diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index f04b32d6b6..e9e34c17cd 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -347,7 +347,7 @@ Status Model::Load(const ModelProto& model_proto, auto status = Status::OK(); ORT_TRY { - model.reset(new Model(model_proto, model_path, local_registries, logger)); + model = std::make_unique(model_proto, model_path, local_registries, logger); } ORT_CATCH(const std::exception& ex) { ORT_HANDLE_EXCEPTION([&]() { @@ -386,7 +386,7 @@ Status Model::Load(ModelProto&& model_proto, GSL_SUPPRESS(r .11) auto status = Status::OK(); ORT_TRY { - model.reset(new Model(std::move(model_proto), model_path, local_registries, logger, allow_released_opsets_only)); + model = std::make_unique(std::move(model_proto), model_path, local_registries, logger, allow_released_opsets_only); } ORT_CATCH(const std::exception& ex) { ORT_HANDLE_EXCEPTION([&]() { @@ -730,7 +730,7 @@ common::Status Model::LoadFromOrtFormat(const fbs::Model& fbs_model, #endif const logging::Logger& logger, std::unique_ptr& model) { - model.reset(new Model()); + model = std::make_unique(); // Load the model metadata if (const auto* fbs_metadata_props = fbs_model.metadata_props()) { diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 77cd279284..5425c3ee08 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -272,9 +272,9 @@ class Model { const logging::Logger& logger, std::unique_ptr& model); - private: Model(); + private: // Model data. 
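// Illustrative aside (editor's sketch, not part of the diff): why `Model();`
// moves out of the private section above. std::make_unique performs the
// `new T(...)` itself, inside a library function with no special access, so
// it needs an accessible constructor; `ptr.reset(new T(...))` only needs
// access at the call site. Widget is a hypothetical demo type.
#include <memory>
class Widget {
  Widget() = default;  // private (default class access)
 public:
  static std::unique_ptr<Widget> Create() {
    // OK: `new Widget()` is evaluated inside the class, which has access.
    return std::unique_ptr<Widget>(new Widget());
    // return std::make_unique<Widget>();  // would NOT compile: private ctor
  }
};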
#if !defined(ORT_MINIMAL_BUILD) ONNX_NAMESPACE::ModelProto model_proto_; diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 78eceda4fc..4b5c682384 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1018,6 +1018,7 @@ Return Value: } #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push) +// Chance of arithmetic overflow could be reduced #pragma warning(disable : 26451) #endif void diff --git a/onnxruntime/core/mlas/lib/dgemm.cpp b/onnxruntime/core/mlas/lib/dgemm.cpp index 3688699843..b98d6f8bb2 100644 --- a/onnxruntime/core/mlas/lib/dgemm.cpp +++ b/onnxruntime/core/mlas/lib/dgemm.cpp @@ -805,6 +805,7 @@ Return Value: #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push) +// Chance of arithmetic overflow could be reduced #pragma warning(disable : 26451) #endif void diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp index c968c2aae5..eb8ef4e274 100644 --- a/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp +++ b/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp @@ -34,7 +34,7 @@ constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_QUANT_KERNEL_DEFAULT::Strides; constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_QUANT_KERNEL_DEFAULT::PackedStrides; template<> -MLAS_FORCEINLINE constexpr +MLAS_FORCEINLINE constexpr int32_t MlasGemmQuantFixupZeroPointA( int32_t ZeroPointA, @@ -49,7 +49,7 @@ MlasGemmQuantFixupZeroPointA( } template<> -MLAS_FORCEINLINE +MLAS_FORCEINLINE constexpr int32_t MlasGemmQuantFixupZeroPointB( int32_t ZeroPointB, diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 0360127ca0..62170a2573 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -1554,6 +1554,7 @@ Return Value: } #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push) +// Chance of arithmetic overflow could be reduced #pragma warning(disable : 26451) #endif void diff --git a/onnxruntime/core/optimizer/optimizer_execution_frame.cc b/onnxruntime/core/optimizer/optimizer_execution_frame.cc index 828bfdbf87..77161ed6a8 100644 --- a/onnxruntime/core/optimizer/optimizer_execution_frame.cc +++ b/onnxruntime/core/optimizer/optimizer_execution_frame.cc @@ -41,7 +41,7 @@ OptimizerExecutionFrame::Info::Info(const std::vector& nodes, size_t cpu_tensor_length; ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<0>(tensor_proto, &cpu_tensor_length)); OrtValue ort_value; - std::unique_ptr data(new char[cpu_tensor_length]); + std::unique_ptr data = std::make_unique(cpu_tensor_length); std::unique_ptr p_tensor; ORT_RETURN_IF_ERROR(utils::TensorProtoToMLValue(Env::Default(), model_path.IsEmpty() ? 
nullptr : model_path.ToPathString().c_str(), @@ -103,8 +103,9 @@ OptimizerExecutionFrame::Info::Info(const std::vector& nodes, std::unique_ptr OptimizerExecutionFrame::Info::CreateKernel(const Node* node) const { std::unique_ptr op_kernel; std::shared_ptr kernel_registry = execution_provider_.GetKernelRegistry(); + FuncManager func; auto status = kernel_registry->TryCreateKernel(*node, execution_provider_, initializers_, - ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, + ort_value_name_idx_map_, func, data_transfer_mgr_, op_kernel); // Kernel found in the CPU kernel registry diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index fb1c49e312..b5a90c36a9 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -28,10 +28,10 @@ void DropQDQNodesRules(SelectorsAndActions& qdq_selectors_and_actions) { MoveToSlot(dq, ArgType::kInput, 0, ArgType::kInput, 0), MoveToSlot(q, ArgType::kOutput, 0, ArgType::kOutput, 0)}; - std::unique_ptr action(new MergeIntoTarget(std::move(moves))); + std::unique_ptr action = std::make_unique(std::move(moves)); #if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr selector(new QDQ::DropDQDNodesSelector()); + std::unique_ptr selector = std::make_unique(); qdq_selectors_and_actions.RegisterSelectorAndAction(action_name, SelectorAndAction::OpVersionsMap{{"Gather", {}}, {"Reshape", {}}, @@ -49,10 +49,10 @@ void UnaryOpQDQRules(SelectorsAndActions& qdq_selectors_and_actions, bool is_int // 3 nodes. DQ, target, Q // Replace with internal QLinear version of operator. Delete all original nodes. const std::string action_name{"1DQ"}; - std::unique_ptr action(new QDQ::UnaryReplaceWithQLinear(kMSDomain)); + std::unique_ptr action = std::make_unique(kMSDomain); #if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr selector(new QDQ::UnarySelector(is_int8_allowed)); + std::unique_ptr selector = std::make_unique(is_int8_allowed); qdq_selectors_and_actions.RegisterSelectorAndAction(action_name, SelectorAndAction::OpVersionsMap{{"AveragePool", {}}}, std::move(selector), @@ -67,10 +67,10 @@ void BinaryOpQDQRules(SelectorsAndActions& qdq_selectors_and_actions) { // 4 nodes. 2 x DQ for inputs, target, Q // Replace with internal QLinear version of operator. Delete all original nodes. const std::string action_name{"2DQ"}; - std::unique_ptr action(new QDQ::BinaryReplaceWithQLinear(kMSDomain)); + std::unique_ptr action = std::make_unique(kMSDomain); #if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr selector(new QDQ::BinarySelector()); + std::unique_ptr selector = std::make_unique(); qdq_selectors_and_actions.RegisterSelectorAndAction(action_name, SelectorAndAction::OpVersionsMap{{"Add", {}}, {"Mul", {}}}, @@ -86,10 +86,10 @@ void VariadicOpQDQRules(SelectorsAndActions& qdq_selectors_and_actions) { // 0=variadic DQ nodes 2=target, 3=Q // Replace with QLinear version of operator. Delete all original nodes. 
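// Illustrative aside (editor's sketch, not part of the diff): the pattern
// behind all the replacements in this file. Core Guidelines rule r.11
// (surfaced as warning C26409, and as GSL_SUPPRESS(r .11) elsewhere in this
// patch) flags any explicit `new`, even one immediately owned by a smart
// pointer. Foo is a hypothetical demo type.
#include <memory>
struct Foo { Foo(int, int) {} };
void Demo() {
  auto a = std::unique_ptr<Foo>(new Foo(1, 2));  // flagged: explicit new
  auto b = std::make_unique<Foo>(1, 2);          // preferred, same effect
}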
const std::string action_name{"*DQ"}; - std::unique_ptr action(new QDQ::VariadicReplaceWithQLinear(kMSDomain)); + std::unique_ptr action = std::make_unique(kMSDomain); #if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr selector(new QDQ::VariadicSelector()); + std::unique_ptr selector = std::make_unique(); qdq_selectors_and_actions.RegisterSelectorAndAction(action_name, SelectorAndAction::OpVersionsMap{{"Concat", {}}}, @@ -107,10 +107,10 @@ void ConvQDQRules(SelectorsAndActions& qdq_selectors_and_actions, bool is_int8_a // Replace Conv with QLinearConv // Delete all original nodes const std::string action_name{"Conv"}; - std::unique_ptr action(new QDQ::ConvReplaceWithQLinear()); + std::unique_ptr action = std::make_unique(); #if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr selector(new QDQ::ConvSelector(is_int8_allowed)); + std::unique_ptr selector = std::make_unique(is_int8_allowed); qdq_selectors_and_actions.RegisterSelectorAndAction(action_name, SelectorAndAction::OpVersionsMap{{"Conv", {}}}, @@ -129,10 +129,10 @@ void MatMulQDQRules(SelectorsAndActions& qdq_selectors_and_actions, bool is_int8 // Delete all original nodes. const std::string action_name{"MatMul"}; - std::unique_ptr action(new QDQ::MatMulReplaceWithQLinear()); + std::unique_ptr action = std::make_unique(); #if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr selector(new QDQ::MatMulSelector(is_int8_allowed)); + std::unique_ptr selector = std::make_unique(is_int8_allowed); qdq_selectors_and_actions.RegisterSelectorAndAction(action_name, SelectorAndAction::OpVersionsMap{{"MatMul", {}}}, std::move(selector), diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 01ad7b1521..6e370fffa2 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -54,6 +54,11 @@ class WindowsThread : public EnvThread { unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param); Eigen::ThreadPoolInterface* param; const ThreadOptions& thread_options; + Param(const ORTCHAR_T* name_prefix1, + int index1, + unsigned (*start_address1)(int id, Eigen::ThreadPoolInterface* param), + Eigen::ThreadPoolInterface* param1, + const ThreadOptions& thread_options1) : name_prefix(name_prefix1), index(index1), start_address(start_address1), param(param1), thread_options(thread_options1) {} }; public: @@ -63,15 +68,15 @@ class WindowsThread : public EnvThread { custom_create_thread_fn = thread_options.custom_create_thread_fn; custom_thread_creation_options = thread_options.custom_thread_creation_options; custom_join_thread_fn = thread_options.custom_join_thread_fn; - + std::unique_ptr local_param = std::make_unique(name_prefix, index, start_address, param, thread_options); if (custom_create_thread_fn) { - custom_thread_handle = custom_create_thread_fn(custom_thread_creation_options, (OrtThreadWorkerFn)CustomThreadMain, new Param{name_prefix, index, start_address, param, thread_options}); + custom_thread_handle = custom_create_thread_fn(custom_thread_creation_options, (OrtThreadWorkerFn)CustomThreadMain, local_param.release()); if (!custom_thread_handle) { - ORT_THROW("custom_create_thread_fn returned invalid handle."); + ORT_THROW("custom_create_thread_fn returned invalid handle."); } } else { hThread.reset(reinterpret_cast(_beginthreadex(nullptr, thread_options.stack_size, ThreadMain, - new Param{name_prefix, index, start_address, param, thread_options}, 0, + local_param.release(), 0, &threadID))); } } @@ -142,12 +147,18 @@ class WindowsThread : public EnvThread { 
class WindowsEnv : public Env { public: +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 26409) +#endif EnvThread* CreateThread(_In_opt_z_ const ORTCHAR_T* name_prefix, int index, unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param), Eigen::ThreadPoolInterface* param, const ThreadOptions& thread_options) { return new WindowsThread(name_prefix, index, start_address, param, thread_options); } - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif void SleepForMicroseconds(int64_t micros) const override { Sleep(static_cast(micros) / 1000); } @@ -524,14 +535,32 @@ class WindowsEnv : public Env { } virtual Status LoadDynamicLibrary(const std::string& library_filename, bool /*global_symbols*/, void** handle) const override { + const std::wstring& wlibrary_filename = ToWideString(library_filename); #if WINAPI_FAMILY == WINAPI_FAMILY_PC_APP - *handle = ::LoadPackagedLibrary(ToWideString(library_filename).c_str(), 0); + *handle = ::LoadPackagedLibrary(wlibrary_filename.c_str(), 0); #else - *handle = ::LoadLibraryExA(library_filename.c_str(), nullptr, LOAD_WITH_ALTERED_SEARCH_PATH); + // TODO: in most cases, the path name is a relative path and the behavior of the following line of code is undefined. + *handle = ::LoadLibraryExW(wlibrary_filename.c_str(), nullptr, LOAD_WITH_ALTERED_SEARCH_PATH); #endif if (!*handle) { const auto error_code = GetLastError(); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "LoadLibrary failed with error ", error_code, " \"", std::system_category().message(error_code), "\" when trying to load \"", library_filename, "\""); + LPVOID lpMsgBuf; + FormatMessageW( + FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + error_code, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPWSTR)&lpMsgBuf, + 0, NULL); + std::wostringstream oss; + oss << L"LoadLibrary failed with error " << error_code << L" \"" << (LPWSTR)lpMsgBuf << L"\" when trying to load \"" << wlibrary_filename << L"\""; + std::wstring errmsg = oss.str(); + // TODO: errmsg should be converted to UTF-8 as it will be passed out to the C interface. + common::Status status(common::ONNXRUNTIME, common::FAIL, ToMBString(errmsg)); + LocalFree(lpMsgBuf); + return status; } return Status::OK(); } @@ -548,7 +577,23 @@ class WindowsEnv : public Env { *symbol = ::GetProcAddress(reinterpret_cast(handle), symbol_name.c_str()); if (!*symbol) { const auto error_code = GetLastError(); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to find symbol in library, error code: ", error_code, " - ", std::system_category().message(error_code)); + LPVOID lpMsgBuf; + FormatMessageW( + FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + error_code, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPWSTR)&lpMsgBuf, + 0, NULL); + std::wostringstream oss; + oss << L"Failed to find symbol " << ToWideString(symbol_name) << L" in library, error code: " << error_code << L" \"" << (LPWSTR)lpMsgBuf << L"\""; + std::wstring errmsg = oss.str(); + // TODO: errmsg should be converted to UTF-8 as it will be passed out to the C interface. 
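// Illustrative aside (editor's sketch, not part of the diff): one way to do
// the UTF-8 conversion the TODO above asks for, using the Win32 API. Minimal
// sketch with a hypothetical helper name; error handling is reduced to a
// single guard.
#include <string>
#include <windows.h>
std::string WideToUtf8(const std::wstring& w) {
  // First call sizes the output; a length of -1 includes the terminating NUL.
  int n = WideCharToMultiByte(CP_UTF8, 0, w.c_str(), -1, nullptr, 0, nullptr, nullptr);
  if (n <= 0) return {};
  std::string s(static_cast<size_t>(n), '\0');
  WideCharToMultiByte(CP_UTF8, 0, w.c_str(), -1, &s[0], n, nullptr, nullptr);
  s.resize(s.size() - 1);  // drop the embedded NUL the -1 length produced
  return s;
}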
+ common::Status status(common::ONNXRUNTIME, common::FAIL, ToMBString(errmsg)); + LocalFree(lpMsgBuf); + return status; } return Status::OK(); } @@ -576,11 +621,11 @@ class WindowsEnv : public Env { // Create buffer to hold the result std::string buffer(kBufferSize, '\0'); - //The last argument is the size of the buffer pointed to by the lpBuffer parameter, including the null-terminating character, in characters. - //If the function succeeds, the return value is the number of characters stored in the buffer pointed to by lpBuffer, not including the terminating null character. - //Therefore, If the function succeeds, kBufferSize should be larger than char_count. + // The last argument is the size of the buffer pointed to by the lpBuffer parameter, including the null-terminating character, in characters. + // If the function succeeds, the return value is the number of characters stored in the buffer pointed to by lpBuffer, not including the terminating null character. + // Therefore, if the function succeeds, kBufferSize should be larger than char_count. auto char_count = GetEnvironmentVariableA(var_name.c_str(), buffer.data(), kBufferSize); - + if (kBufferSize > char_count) { buffer.resize(char_count); return buffer; diff --git a/onnxruntime/core/platform/windows/stacktrace.cc b/onnxruntime/core/platform/windows/stacktrace.cc index 53e403a042..4e159b8edd 100644 --- a/onnxruntime/core/platform/windows/stacktrace.cc +++ b/onnxruntime/core/platform/windows/stacktrace.cc @@ -104,7 +104,7 @@ std::vector CaptureStackTrace::Trace() const { stacktrace.reserve(num_frames); // hide CaptureStackTrace::Trace and GetStackTrace so the output starts with the 'real' location - const int frames_to_skip = 2; + constexpr int frames_to_skip = 2; // we generally want to skip the first two frames, but if something weird is going on (e.g.
code coverage is // running) and we only have 1 or 2 frames, output them so there's at least something that may be meaningful diff --git a/onnxruntime/core/providers/common.h b/onnxruntime/core/providers/common.h index 0b9179c0b4..22949ed297 100644 --- a/onnxruntime/core/providers/common.h +++ b/onnxruntime/core/providers/common.h @@ -118,9 +118,9 @@ inline Status ComputePad(const int64_t in_dim, return Status::OK(); } -inline int64_t ComputeOutputShape(const int64_t in_dim, - const int64_t stride, const int64_t kernel, const int64_t dilation, - const int64_t pad_head, const int64_t pad_tail) { +constexpr inline int64_t ComputeOutputShape(const int64_t in_dim, + const int64_t stride, const int64_t kernel, const int64_t dilation, + const int64_t pad_head, const int64_t pad_tail) { const int64_t dkernel = dilation * (kernel - 1) + 1; return static_cast(static_cast(in_dim + pad_head + pad_tail - dkernel) / stride + 1); } diff --git a/onnxruntime/core/providers/cpu/activation/activations.h b/onnxruntime/core/providers/cpu/activation/activations.h index 75a73a818d..9b5fd84929 100644 --- a/onnxruntime/core/providers/cpu/activation/activations.h +++ b/onnxruntime/core/providers/cpu/activation/activations.h @@ -84,6 +84,7 @@ struct Softplus : public ElementWiseRangedTransform { Status Init(const onnxruntime::NodeAttributes&) { return Status::OK(); } + GSL_SUPPRESS(r .11) ElementWiseRangedTransform* Copy() const { using T1 = typename std::remove_pointer::type; using T2 = typename std::remove_const::type; @@ -106,6 +107,7 @@ struct Relu : public ElementWiseRangedTransform { Status Init(const onnxruntime::NodeAttributes&) { return Status::OK(); } + GSL_SUPPRESS(r .11) ElementWiseRangedTransform* Copy() const { // replace it with a macro. why this? using T1 = typename std::remove_pointer::type; using T2 = typename std::remove_const::type; //redundant? 
@@ -128,6 +130,7 @@ struct Sigmoid : public ElementWiseRangedTransform { Status Init(const onnxruntime::NodeAttributes&) { return Status::OK(); } + GSL_SUPPRESS(r .11) ElementWiseRangedTransform* Copy() const { using T1 = typename std::remove_pointer::type; using T2 = typename std::remove_const::type; @@ -153,6 +156,7 @@ struct Softsign : public ElementWiseRangedTransform { Status Init(const onnxruntime::NodeAttributes&) { return Status::OK(); } + GSL_SUPPRESS(r .11) ElementWiseRangedTransform* Copy() const { using T1 = typename std::remove_pointer::type; using T2 = typename std::remove_const::type; @@ -175,6 +179,7 @@ struct Tanh : public ElementWiseRangedTransform { Status Init(const onnxruntime::NodeAttributes&) { return Status::OK(); } + GSL_SUPPRESS(r .11) ElementWiseRangedTransform* Copy() const { using T1 = typename std::remove_pointer::type; using T2 = typename std::remove_const::type; @@ -192,6 +197,7 @@ struct Tanh : public ElementWiseRangedTransform { ym = xm.tanh(); } }; + template <> void Tanh::operator()(std::ptrdiff_t first, std::ptrdiff_t last) const; @@ -226,7 +232,6 @@ struct Selu : public ElementWiseRangedTransform { ym = (T)gamma * (xm.cwiseMax(0.0f) + ((T)alpha * (xm.array().exp() - 1.0f)).cwiseMin(0.0f)); } }; - } // namespace functors DEFINE_ELE_KERNEL(Celu); diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.cc b/onnxruntime/core/providers/cpu/controlflow/loop.cc index b844d79206..fac00d83b0 100644 --- a/onnxruntime/core/providers/cpu/controlflow/loop.cc +++ b/onnxruntime/core/providers/cpu/controlflow/loop.cc @@ -142,7 +142,7 @@ Loop::Info::Info(const onnxruntime::Node& node, const GraphViewer& subgraph_in) const auto& node_input_types = node.InputDefs(); loop_carried_vars_types.reserve(num_subgraph_inputs); for (int i = 0; i < num_loop_carried_vars; ++i) { - loop_carried_vars_types.push_back(node_input_types[i + 2]->TypeAsProto()); + loop_carried_vars_types.push_back(node_input_types[static_cast(i) + 2]->TypeAsProto()); } auto& subgraph_inputs = subgraph.GetInputs(); @@ -280,7 +280,7 @@ common::Status Loop::SetupSubgraphExecutionInfo(const SessionState& session_stat // the Loop inputs are matched to subgraph feeds based on order. // we first need the names of the Loop inputs to determine what device they are available on std::vector feed_names; - feed_names.reserve(info_->num_subgraph_inputs + info_->num_implicit_inputs); + feed_names.reserve(static_cast(info_->num_subgraph_inputs) + info_->num_implicit_inputs); // iter_num and cond subgraph inputs - created by the LoopImpl::Initialize so the name doesn't matter // as we skip them when we call FindDevicesForValues, and default them to always being on CPU. 
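// Illustrative aside (editor's sketch, not part of the diff): the
// static_cast(i) + 2 pattern used throughout these control-flow changes.
// Widening the int operand first (to size_t or ptrdiff_t; the cast's target
// type is elided in this rendering of the patch) makes the index arithmetic
// itself happen in 64 bits, which is what C26451 asks for.
#include <cstddef>
#include <vector>
int At(const std::vector<int>& v, int i) {
  // return v[i + 2];                          // i + 2 evaluated in 32-bit int first
  return v[static_cast<std::size_t>(i) + 2];   // addition performed in size_t
}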
@@ -291,7 +291,7 @@ common::Status Loop::SetupSubgraphExecutionInfo(const SessionState& session_stat const auto& loop_inputs = node.InputDefs(); for (int i = 0; i < info_->num_loop_carried_vars; ++i) { // + 2 to skip 'M' and 'cond' Loop inputs - feed_names.push_back(loop_inputs[i + 2]->Name()); + feed_names.push_back(loop_inputs[static_cast(i) + 2]->Name()); } for (auto& entry : node.ImplicitInputDefs()) { @@ -306,7 +306,7 @@ common::Status Loop::SetupSubgraphExecutionInfo(const SessionState& session_stat // now update the feed names to use the subgraph input names for the loop carried vars so that we can determine // what device the subgraph needs them on - for (int i = 0; i < info_->num_loop_carried_vars; ++i) { + for (ptrdiff_t i = 0; i < info_->num_loop_carried_vars; ++i) { // +2 for both to skip the iter_num and cond values feed_names[i + 2] = info_->subgraph_input_names[i + 2]; } @@ -329,7 +329,7 @@ common::Status Loop::SetupSubgraphExecutionInfo(const SessionState& session_stat // Loop state variables need to be where we can feed them in to the next iteration, so set the fetch location // to match the feed location. - for (int i = 0; i < info_->num_loop_carried_vars; ++i) { + for (ptrdiff_t i = 0; i < info_->num_loop_carried_vars; ++i) { // +2 for both to skip the iter_num and cond input values const auto& alloc_info = utils::FindMemoryInfoForValue(session_state, loop_inputs[i + 2]->Name()); fetch_locations.push_back(&alloc_info); @@ -421,13 +421,13 @@ Status LoopImpl::Initialize() { iter_num_mlvalue_ = MakeScalarMLValue(cpu_allocator, 0, iter_num_rank != 0); condition_mlvalue_ = MakeScalarMLValue(cpu_allocator, condition_, condition_rank != 0); - loop_output_tensors_.resize(info_.num_outputs - info_.num_loop_carried_vars); + loop_output_tensors_.resize(static_cast(info_.num_outputs) - info_.num_loop_carried_vars); return status; } void LoopImpl::CreateInitialFeeds(std::vector& feeds) { - feeds.reserve(info_.num_subgraph_inputs + info_.num_implicit_inputs); + feeds.reserve(static_cast(info_.num_subgraph_inputs) + info_.num_implicit_inputs); // This ordering is the same as used in SetupSubgraphExecutionInfo feeds.push_back(iter_num_mlvalue_); @@ -450,12 +450,12 @@ void LoopImpl::SaveOutputsAndUpdateFeeds(const std::vector& last_outpu // next_input: iter_num, cond, loop_vars. iter_num is re-used // simple copy for cond and loop carried vars. 
   // simple copy for cond and loop carried vars. start at 1 to skip iter_num in input
-  for (int i = 1; i < info_.num_subgraph_inputs; ++i) {
+  for (ptrdiff_t i = 1; i < info_.num_subgraph_inputs; ++i) {
     next_inputs[i] = last_outputs[i - 1];
   }

   // save loop outputs as we have to concatenate at the end
-  for (int j = info_.num_loop_carried_vars; j < info_.num_outputs; ++j) {
+  for (ptrdiff_t j = info_.num_loop_carried_vars; j < info_.num_outputs; ++j) {
     ORT_ENFORCE(last_outputs[j + 1].IsTensor(), "All scan outputs MUST be tensors");
     loop_output_tensors_[j - info_.num_loop_carried_vars].push_back(last_outputs[j + 1]);  // skip 'cond' in output
   }
@@ -570,13 +570,13 @@ Status LoopImpl::Execute(const FeedsFetchesManager& ffm) {
   if (iter_num_value != 0) {
     for (int i = 0; i < info_.num_loop_carried_vars; ++i) {
       // need to allocate Loop output and copy OrtValue from fetches
-      ORT_RETURN_IF_ERROR(copy_mlvalue_to_output(fetches[i + 1], i, iter_num_value, *info_.loop_carried_vars_types[i]));  // skip cond
+      ORT_RETURN_IF_ERROR(copy_mlvalue_to_output(fetches[static_cast<size_t>(i) + 1], i, iter_num_value, *info_.loop_carried_vars_types[static_cast<size_t>(i)]));  // skip cond
     }

     for (int i = info_.num_loop_carried_vars; i < info_.num_outputs; ++i) {
       // add last output
-      auto& per_iteration_outputs = loop_output_tensors_[i - info_.num_loop_carried_vars];
-      per_iteration_outputs.push_back(fetches[i + 1]);  // skip cond
+      auto& per_iteration_outputs = loop_output_tensors_[static_cast<size_t>(i) - info_.num_loop_carried_vars];
+      per_iteration_outputs.push_back(fetches[static_cast<size_t>(i) + 1]);  // skip cond

       ORT_RETURN_IF_ERROR(ConcatenateLoopOutput(per_iteration_outputs, i));
     }
@@ -584,7 +584,7 @@ Status LoopImpl::Execute(const FeedsFetchesManager& ffm) {
     // no iterations.
     // copy input loop carried vars to output.
     for (int i = 0; i < info_.num_loop_carried_vars; ++i) {
-      ORT_RETURN_IF_ERROR(copy_mlvalue_to_output(feeds[i + 2], i, iter_num_value, *info_.loop_carried_vars_types[i]));  // skip iter# and cond
+      ORT_RETURN_IF_ERROR(copy_mlvalue_to_output(feeds[static_cast<size_t>(i) + 2], i, iter_num_value, *info_.loop_carried_vars_types[i]));  // skip iter# and cond
     }

     // create empty outputs for loop outputs using the subgraph output shapes for the rank
@@ -592,11 +592,11 @@ Status LoopImpl::Execute(const FeedsFetchesManager& ffm) {
     for (int i = info_.num_loop_carried_vars; i < info_.num_outputs; ++i) {
       // get shape from subgraph output if possible to attempt to have the correct rank
-      auto* graph_output = graph_outputs.at(i + 1);  // + 1 as first subgraph output is condition value
+      auto* graph_output = graph_outputs.at(static_cast<size_t>(i) + 1);  // + 1 as first subgraph output is condition value
       auto* graph_output_shape = graph_output->Shape();

       std::vector<int64_t> output_dims;
-      output_dims.reserve((graph_output_shape ? graph_output_shape->dim_size() : 0) + 1);
+      output_dims.reserve(static_cast<size_t>(graph_output_shape ? graph_output_shape->dim_size() : 0) + 1);
       output_dims.push_back(0);  // num iterations is first dim

       if (graph_output_shape) {
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_8.cc b/onnxruntime/core/providers/cpu/controlflow/scan_8.cc
index b5765bf5ca..53e7ae0e69 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_8.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_8.cc
@@ -161,7 +161,7 @@ Status Scan<8>::SetupSubgraphExecutionInfo(const SessionState& session_state,
   const auto& node = Node();
   info_ = std::make_unique<Scan<8>::Info>(node, subgraph_session_state.GetGraphViewer(),
-                                          static_cast(num_scan_inputs_));
+                                          static_cast(num_scan_inputs_));
   auto status = scan::detail::CreateFeedsFetchesManager(node, *info_, session_state, subgraph_session_state,
                                                         /* is_v8 */ true, feeds_fetches_manager_);
@@ -396,13 +396,13 @@ Status Scan8Impl::Execute(const FeedsFetchesManager& ffm) {

   // Setup input OrtValue streams
   std::vector<OrtValueTensorSlicer<const OrtValue>::Iterator> scan_input_stream_iterators;
-  scan_input_stream_iterators.reserve(info_.num_variadic_inputs - info_.num_loop_state_variables);
+  scan_input_stream_iterators.reserve(static_cast<size_t>(info_.num_variadic_inputs) - info_.num_loop_state_variables);

   for (int i = info_.num_loop_state_variables, end = info_.num_variadic_inputs; i < end; ++i) {
     const auto& ort_value = GetSubgraphInputMLValue(context_, i);

     // forward
-    if (directions_[i - info_.num_loop_state_variables] == static_cast<int64_t>(ScanDirection::kForward)) {
+    if (directions_[static_cast<size_t>(i) - info_.num_loop_state_variables] == static_cast<int64_t>(ScanDirection::kForward)) {
       // the iterator is self contained, so we don't need to keep the OrtValueTensorSlicer instance around
       scan_input_stream_iterators.push_back(device_helpers_.create_const_slicer_func(ort_value, 1, b).begin());
     } else {  // reverse
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_9.cc b/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
index 001c2d9ac8..c387a3da4c 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
@@ -205,7 +205,7 @@ Status Scan<9>::SetupSubgraphExecutionInfo(const SessionState& session_state,
   const auto& node = Node();
   info_ = std::make_unique<Scan<9>::Info>(node, subgraph_session_state.GetGraphViewer(),
-                                          static_cast(num_scan_inputs_));
+                                          static_cast(num_scan_inputs_));
   auto status = scan::detail::CreateFeedsFetchesManager(node, *info_, session_state, subgraph_session_state,
                                                         /* is_v8 */ false, feeds_fetches_manager_);
@@ -281,7 +281,7 @@ Status ScanImpl::ValidateSubgraphInput(int start_input, int end_input,
                            " Expected ", min_dims_required, " dimensions or more but input had shape of ", input_shape);

-    auto seq_len_dim = input_axes_[i - info_.num_loop_state_variables];
+    auto seq_len_dim = input_axes_[static_cast<size_t>(i) - info_.num_loop_state_variables];
     auto this_seq_len = input_shape[seq_len_dim];

     if (sequence_len_ < 0) {
@@ -432,7 +432,7 @@ Status ScanImpl::Execute(const FeedsFetchesManager& ffm) {

   // Setup input OrtValue streams
   std::vector<OrtValueTensorSlicer<const OrtValue>::Iterator> scan_input_stream_iterators;
-  scan_input_stream_iterators.reserve(info_.num_inputs - info_.num_loop_state_variables);
+  scan_input_stream_iterators.reserve(static_cast<size_t>(info_.num_inputs) - info_.num_loop_state_variables);

   for (int i = 0, end = info_.num_scan_inputs; i < end; ++i) {
     const auto& ort_value = inputs_[i];
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
index 49a5cb19ec..e5b25b56a4 100644
---
a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc +++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc @@ -142,7 +142,7 @@ Status CreateFeedsFetchesManager(const Node& node, // we need the names of the Scan inputs to determine what device they are available on, // so first create a list using those value std::vector feed_names; - feed_names.reserve(info.num_variadic_inputs + info.num_implicit_inputs); + feed_names.reserve(static_cast(info.num_variadic_inputs) + info.num_implicit_inputs); const auto& scan_inputs = node.InputDefs(); int start = is_v8 ? 1 : 0; // skip sequence_lens for v8 @@ -217,7 +217,7 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses feeds[input] = loop_state_variables[input].Input(); } else { // add sliced input - auto& iterator = scan_input_stream_iterators[input - num_loop_state_variables]; + auto& iterator = scan_input_stream_iterators[static_cast(input) - num_loop_state_variables]; feeds[input] = *iterator; ++iterator; diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h index 6e36fe2f28..4eee03b6f2 100644 --- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h +++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h @@ -52,7 +52,7 @@ class LoopStateVariable { const OrtValue original_value_; OrtValue final_value_; - /* we use original_value and final_value once, + /* we use original_value and final_value once, and alternate between a_ and b_ as input/output for each iteration to avoid copies Iteration Input Output @@ -88,8 +88,8 @@ class OutputIterator { ScanDirection direction = ScanDirection::kForward, bool temporary = false, MLDataType data_type = nullptr) { - iterator.reset(new OutputIterator(context, output_index, is_loop_state_var, is_v8, final_shape, - create_slicer_func, zero_data_func, direction, temporary, data_type)); + iterator = std::make_unique(context, output_index, is_loop_state_var, is_v8, final_shape, + create_slicer_func, zero_data_func, direction, temporary, data_type); return iterator->Initialize(); } @@ -115,8 +115,7 @@ class OutputIterator { ORT_ENFORCE(final_output_mlvalue_, "Attempt to retrieve final output before it was set."); return *final_output_mlvalue_; } - - private: + //std::unique_ptr needs to access this function. OutputIterator(OpKernelContextInternal& context, int output_index, bool is_loop_state_var, @@ -128,6 +127,7 @@ class OutputIterator { bool temporary, MLDataType data_type); + private: Status Initialize(); Status AllocateFinalBuffer(); @@ -201,7 +201,7 @@ void CalculateTransposedShapeForInput(const TensorShape& original_shape, int64_t /** Calculate the transpose permutations and shape by shifting the chosen axis FROM the first dimension. -e.g. if shape is {4, 2, 3} and axis 2 is chosen, dimension 0 will move to dimension 2, +e.g. 
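
The OutputIterator::Create hunk above swaps `iterator.reset(new ...)` for `std::make_unique`, and that is also why the constructor moves out of the private section: `std::make_unique` constructs the object itself and, unlike a member function calling `new`, has no access to private constructors. A reduced sketch of the constraint, using an illustrative class rather than the real OutputIterator:

    #include <memory>

    class OutputIter {
     public:
      static std::unique_ptr<OutputIter> Create(int index) {
        // before: iterator.reset(new OutputIter(index));  // flagged by C26409
        return std::make_unique<OutputIter>(index);        // needs a reachable ctor
      }

      // If this constructor were private, Create() could still write
      // 'new OutputIter(index)' (members may use private ctors), but
      // std::make_unique could not, since it is not a friend of the class.
      explicit OutputIter(int index) : index_(index) {}

     private:
      int index_;
    };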
if shape is {4, 2, 3} and axis 2 is chosen, dimension 0 will move to dimension 2, the permutations will be {1, 2, 0} and output shape will be {2, 3, 4} */ void CalculateTransposedShapeForOutput(const TensorShape& original_shape, int64_t axis, diff --git a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc index a2dff68a39..00222f5642 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc +++ b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc @@ -37,7 +37,9 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CPU, _In_ OrtSessio options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_CPU(use_arena)); return nullptr; } - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif ORT_API_STATUS_IMPL(OrtApis::CreateCpuMemoryInfo, enum OrtAllocatorType type, enum OrtMemType mem_type, _Outptr_ OrtMemoryInfo** out) { *out = new OrtMemoryInfo(onnxruntime::CPU, type, OrtDevice(), 0, mem_type); diff --git a/onnxruntime/core/providers/cpu/element_wise_ranged_transform.h b/onnxruntime/core/providers/cpu/element_wise_ranged_transform.h index b7fe209203..f6cfc64216 100644 --- a/onnxruntime/core/providers/cpu/element_wise_ranged_transform.h +++ b/onnxruntime/core/providers/cpu/element_wise_ranged_transform.h @@ -56,6 +56,7 @@ ElementWiseRangedTransform::~ElementWiseRangedTransform() { Status Init(const onnxruntime::NodeAttributes& attributes) { \ return (GetFloatParam(#X, attributes, X)); \ } \ + GSL_SUPPRESS(r .11) \ ElementWiseRangedTransform* Copy() const final { \ using T1 = typename std::remove_pointer::type; \ using T2 = typename std::remove_const::type; \ @@ -70,6 +71,7 @@ ElementWiseRangedTransform::~ElementWiseRangedTransform() { ORT_RETURN_IF_ERROR(GetFloatParam(#Y, attributes, Y)); \ return Status::OK(); \ } \ + GSL_SUPPRESS(r .11) \ ElementWiseRangedTransform* Copy() const final { \ using T1 = typename std::remove_pointer::type; \ using T2 = typename std::remove_const::type; \ diff --git a/onnxruntime/core/providers/cpu/generator/range.cc b/onnxruntime/core/providers/cpu/generator/range.cc index 05ef9f508a..97903fd074 100644 --- a/onnxruntime/core/providers/cpu/generator/range.cc +++ b/onnxruntime/core/providers/cpu/generator/range.cc @@ -6,7 +6,10 @@ #include #include "core/providers/op_kernel_type_control.h" - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace op_kernel_type_control { diff --git a/onnxruntime/core/providers/cpu/math/clip.h b/onnxruntime/core/providers/cpu/math/clip.h index cdbe86ee66..938f41f1f2 100644 --- a/onnxruntime/core/providers/cpu/math/clip.h +++ b/onnxruntime/core/providers/cpu/math/clip.h @@ -16,8 +16,8 @@ template class Clip_6Base { public: explicit Clip_6Base(const OpKernelInfo& info) { - auto min_val = std::numeric_limits::lowest(); - auto max_val = std::numeric_limits::max(); + constexpr auto min_val = std::numeric_limits::lowest(); + constexpr auto max_val = std::numeric_limits::max(); info.GetAttrOrDefault("min", &min_, min_val); info.GetAttrOrDefault("max", &max_, max_val); ORT_ENFORCE(min_ <= max_); diff --git a/onnxruntime/core/providers/cpu/math/det.cc b/onnxruntime/core/providers/cpu/math/det.cc index c7b2feffe4..996e8505f0 100644 --- a/onnxruntime/core/providers/cpu/math/det.cc +++ b/onnxruntime/core/providers/cpu/math/det.cc @@ -3,6 +3,11 @@ #include "core/providers/cpu/math/det.h" #include 
"core/util/math_cpuonly.h" +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif using namespace onnxruntime::common; diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc index 88ef7738b2..ab6944b011 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc @@ -114,7 +114,7 @@ Status EinsumComputePreprocessor::ProcessSubscripts() { // Example for the following line of code // Subscript "...ij" for an input of rank 6 // num_of_ellipsis_dims = 6 - 5 + 3 = 4 - int64_t current_num_of_ellipsis_dims = rank - subscript.length() + 3; + int64_t current_num_of_ellipsis_dims = static_cast(rank) - subscript.length() + 3; if (current_num_of_ellipsis_dims < 0) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Einsum subscripts string contains too many subscript labels when compared to the rank of the input"); diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc index 3403023608..5224c7fef5 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc @@ -1794,8 +1794,8 @@ void UntypedBroadcastTwo(OpKernelContext& context, const ProcessBroadcastSpanFun concurrency::ThreadPool::TryParallelFor( tp, output_size / span_size, - TensorOpCost{static_cast(input_broadcaster.Input0ElementSize() * span_size), - static_cast(output_tensor.DataType()->Size() * span_size), + TensorOpCost{static_cast(input_broadcaster.Input0ElementSize()) * span_size, + static_cast(output_tensor.DataType()->Size()) * span_size, unit_cost * span_size}, [span_size, &const_input_broadcaster, &output_tensor, &funcs, user_data](std::ptrdiff_t first_span, std::ptrdiff_t last_span) { diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.h b/onnxruntime/core/providers/cpu/math/element_wise_ops.h index b95738c878..a94d6b25c3 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.h +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.h @@ -10,7 +10,12 @@ namespace onnxruntime { namespace functors { - +// TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +// Do not use raw new/delete. 
+#pragma warning(disable : 26409) +#endif template struct Log final : public ElementWiseRangedTransform { Status Init(const onnxruntime::NodeAttributes) { @@ -982,4 +987,7 @@ struct TensorAllocator { private: AllocatorPtr allocator_; }; +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/ml/ml_common.h b/onnxruntime/core/providers/cpu/ml/ml_common.h index e2fb9139e1..e40b2a563f 100644 --- a/onnxruntime/core/providers/cpu/ml/ml_common.h +++ b/onnxruntime/core/providers/cpu/ml/ml_common.h @@ -241,7 +241,7 @@ static inline void multiclass_probability(int64_t classcount, } } -static const float ml_sqrt2 = 1.41421356f; +static constexpr float ml_sqrt2 = 1.41421356f; static inline float ComputeLogistic(float val) { float v = 1 / (1 + std::exp(-std::abs(val))); diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc index e9bd36e846..070d0aec73 100644 --- a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc +++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc @@ -3,6 +3,11 @@ #include "core/providers/cpu/ml/svmclassifier.h" #include "core/platform/threadpool.h" +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace ml { diff --git a/onnxruntime/core/providers/cpu/nn/batch_norm.h b/onnxruntime/core/providers/cpu/nn/batch_norm.h index fc109dff76..bb8400094b 100644 --- a/onnxruntime/core/providers/cpu/nn/batch_norm.h +++ b/onnxruntime/core/providers/cpu/nn/batch_norm.h @@ -58,7 +58,10 @@ class BatchNorm : public OpKernel { #endif } } - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 26451) +#endif Status Compute(OpKernelContext* p_op_kernel_context) const override { const auto* X = p_op_kernel_context->Input(0); const auto* scale = p_op_kernel_context->Input(1); @@ -206,4 +209,7 @@ class BatchNorm : public OpKernel { const bool is_spatial_; int64_t is_train_; }; +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h index 85e555ef23..49afe6503c 100644 --- a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h @@ -8,7 +8,12 @@ #include "core/framework/tensor.h" #endif #include - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif namespace onnxruntime { class BatchNormHelper { public: @@ -124,3 +129,6 @@ class BatchNormHelper { } }; } // namespace onnxruntime +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif \ No newline at end of file diff --git a/onnxruntime/core/providers/cpu/nn/conv_attributes.h b/onnxruntime/core/providers/cpu/nn/conv_attributes.h index 7fff54137d..393a5ea992 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/conv_attributes.h @@ -243,7 +243,7 @@ struct ConvAttributes { } post_slicing_needed = true; - slice_axes.push_back(dim + 2); + slice_axes.push_back(static_cast(dim) + 2); slice_starts.push_back(excess_output_head); slice_ends.push_back(excess_output_head + output_dim_size); // we may 
modify this below output_shape_with_revised_pads.push_back(excess_output_head + output_dim_size); // we may modify this below @@ -273,7 +273,7 @@ struct ConvAttributes { // Head has not been over-padded. Only tail pads need to be modified. post_slicing_needed = true; - slice_axes.push_back(dim + 2); + slice_axes.push_back(static_cast(dim) + 2); slice_starts.push_back(0); slice_ends.push_back(output_dim_size - revised_dim_size); } diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc index d7e39c16b6..1e2f4f60d3 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc @@ -76,7 +76,7 @@ Status ConvTranspose::PrePack(const Tensor& tensor, int input_idx, Alloca transposed_filter_ = BufferUniquePtr(packed_filter_data, BufferDeleter(alloc)); for (int64_t group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) { - MlasTranspose(tensor.Data() + (N * K * group_id), + MlasTranspose(tensor.Data() + (group_id * N * K), ((float*)packed_filter_data) + (group_id * packed_elements_per_group), K, N); } diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h b/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h index 9ded92912f..621a522344 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h @@ -50,7 +50,7 @@ struct ConvTransposeAttributes : public ConvAttributes { const TensorShape& F_Shape = (filter_shape != nullptr) ? *filter_shape : F->Shape(); const Tensor* Pads = dynamic_padding ? context->Input(2) : nullptr; const Tensor* B = has_bias ? (dynamic_padding ? context->Input(3) : context->Input(2)) : nullptr; - const TensorShape& input_shape = X->Shape().Slice(2); + TensorShape input_shape = X->Shape().Slice(2); const int64_t num_input_channels = X->Shape()[1]; const int64_t N = X->Shape()[0]; diff --git a/onnxruntime/core/providers/cpu/nn/lrn.cc b/onnxruntime/core/providers/cpu/nn/lrn.cc index a3d133358f..5111a1479e 100644 --- a/onnxruntime/core/providers/cpu/nn/lrn.cc +++ b/onnxruntime/core/providers/cpu/nn/lrn.cc @@ -21,7 +21,11 @@ #include "core/common/safeint.h" #include "core/util/math.h" #include "core/util/math_cpuonly.h" - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace functors { @@ -74,7 +78,7 @@ Status LRN::Compute(OpKernelContext* context) const { auto* scale_data = static_cast(scale_buffer.get()); math::Set(Xsize, bias_, scale_data, &CPUMathUtil::Instance()); - const size_t padded_square_size = (C + size_ - 1) * H * W; + const size_t padded_square_size = (static_cast(C) + size_ - 1) * H * W; auto psdata = alloc->Alloc(SafeInt(sizeof(float)) * padded_square_size); BufferUniquePtr padded_square_buffer(psdata, BufferDeleter(alloc)); auto* padded_square_data = static_cast(padded_square_buffer.get()); diff --git a/onnxruntime/core/providers/cpu/nn/pool_attributes.h b/onnxruntime/core/providers/cpu/nn/pool_attributes.h index 11c11b6b6c..87f07e55f7 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/pool_attributes.h @@ -168,7 +168,11 @@ struct PoolAttributes { *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head + *pad_tail, dilation); } } - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +// Chance of 
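
The lrn.cc hunk above also shows the second overflow strategy used in this patch: where a plain cast merely widens, `SafeInt` (from core/common/safeint.h) actually checks, turning a wrapped multiplication in an allocation size into an exception instead of a too-small buffer. A hand-rolled sketch of the same idea; `CheckedMul` is illustrative, not ORT's API:

    #include <cstddef>
    #include <limits>
    #include <stdexcept>

    size_t CheckedMul(size_t a, size_t b) {
      // mirror of what SafeInt<size_t>::operator* does: detect wraparound
      if (b != 0 && a > std::numeric_limits<size_t>::max() / b) {
        throw std::overflow_error("allocation size overflows size_t");
      }
      return a * b;
    }

    // usage: request CheckedMul(sizeof(float), padded_square_size) bytes
    // rather than multiplying unchecked before calling the allocator.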
arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif int64_t ComputeOutputSize(int64_t in_size, int64_t stride, int64_t kernel, @@ -180,6 +184,9 @@ struct PoolAttributes { return static_cast( std::ceil(static_cast(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1)); } +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/pool_base.h b/onnxruntime/core/providers/cpu/nn/pool_base.h index d03754eec9..00dd1b1520 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_base.h +++ b/onnxruntime/core/providers/cpu/nn/pool_base.h @@ -36,7 +36,7 @@ class PoolProcessContext { class AveragePool { public: - static float Initialize() { + constexpr static float Initialize() { return 0.0; } @@ -59,7 +59,7 @@ class MaxPool; template <> class MaxPool<1 /*START_VERSION*/> { public: - static float Initialize() { + constexpr static float Initialize() { return std::numeric_limits::lowest(); } @@ -84,7 +84,7 @@ class MaxPool<8 /*START_VERSION*/> { class LpPool { public: - static float Initialize() { + constexpr static float Initialize() { return 0.0f; } diff --git a/onnxruntime/core/providers/cpu/nn/shrink.cc b/onnxruntime/core/providers/cpu/nn/shrink.cc index 6272eb409f..0c336da99e 100644 --- a/onnxruntime/core/providers/cpu/nn/shrink.cc +++ b/onnxruntime/core/providers/cpu/nn/shrink.cc @@ -31,7 +31,10 @@ ONNX_CPU_OPERATOR_KERNEL( BuildKernelDefConstraintsFromTypeList(), BuildKernelDefConstraintsFromTypeList()), Shrink); - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26451) +#endif namespace shrink_internal { template inline T ShrinkCore(const T& val, float bias, float lambd) { diff --git a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc index d6677a482c..464d027202 100644 --- a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc +++ b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc @@ -125,7 +125,7 @@ struct TfIdfVectorizer::Impl { // Contains output indexes // represents ngram_indexes output gsl::span ngram_indexes_; - gsl::span weights_; + gsl::span weights_; // This map contains references to pool_string_ entries // of pool_strings attribute @@ -145,13 +145,13 @@ struct TfIdfVectorizer::Impl { assert(ngram_id != 0); --ngram_id; assert(ngram_id < ngram_indexes_.size()); - auto output_idx = row_num * output_size_ + ngram_indexes_[ngram_id]; + auto output_idx = static_cast(row_num) * output_size_ + ngram_indexes_[ngram_id]; assert(static_cast(output_idx) < frequencies.size()); ++frequencies[output_idx]; } }; -TfIdfVectorizer::TfIdfVectorizer(const OpKernelInfo& info) : OpKernel(info), impl_(new Impl) { +TfIdfVectorizer::TfIdfVectorizer(const OpKernelInfo& info) : OpKernel(info), impl_(std::make_unique()) { std::string mode; Status status = info.GetAttr("mode", &mode); ORT_ENFORCE(status.IsOK(), "mode is required"); diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc index bc6b7e3543..ddc4432eba 100644 --- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc @@ -194,7 +194,7 @@ Status NonMaxSuppression::Compute(OpKernelContext* ctx) const { } //for class_index } //for batch_index - const auto last_dim = 3; + constexpr auto last_dim = 3; const auto num_selected = 
selected_indices.size(); Tensor* output = ctx->Output(0, {static_cast(num_selected), last_dim}); ORT_ENFORCE(output != nullptr); diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc index ed9f56d050..f316da7cb6 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc @@ -49,7 +49,11 @@ struct PreCalc { T w3; T w4; }; - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif template static void PreCalcForBilinearInterpolate(const int64_t height, const int64_t width, const int64_t pooled_height, const int64_t pooled_width, const int64_t iy_upper, const int64_t ix_upper, @@ -235,8 +239,8 @@ void RoiAlignForward(const TensorShape& output_shape, const T* bottom_data, floa } // namespace Status CheckROIAlignValidInput(const Tensor* X_ptr, const Tensor* rois_ptr, const Tensor* batch_indices_ptr) { - const int64_t EXPECTED_NUM_ROI_DIMS = 2; - const int64_t EXPECTED_SECOND_ROI_DIM = 4; + constexpr int64_t EXPECTED_NUM_ROI_DIMS = 2; + constexpr int64_t EXPECTED_SECOND_ROI_DIM = 4; if (!X_ptr) { return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null input X ptr"); } diff --git a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc index 27b23781b3..2b81637f7f 100644 --- a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc +++ b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc @@ -619,7 +619,7 @@ Status QLinearConv::Compute(OpKernelContext* context) const { auto conv_worker = [&](ptrdiff_t batch) { auto work = concurrency::ThreadPool::PartitionWork(batch, thread_count, static_cast(output_image_size)); int64_t output_start = static_cast(work.start); - int64_t output_count = static_cast(work.end - work.start); + int64_t output_count = static_cast(work.end) - work.start; ActType const** worker_indirection_buffer = nullptr; if (indirection_buffer) { diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc index c8af3ffff3..5a8c659c96 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc @@ -3,7 +3,10 @@ #include "core/providers/cpu/reduction/reduction_ops.h" #include "core/providers/common.h" - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26451) +#endif using namespace std; namespace onnxruntime { @@ -219,10 +222,6 @@ bool operator!=(FastReduceKind a, FastReduceKind b) { return static_cast(a) != static_cast(b); } -bool IsFastReduceKindAvailable(FastReduceKind scenario, FastReduceKind available) { - return (static_cast(scenario) & static_cast(available)) > 0; -} - bool ResultsNoTransposePrepareForReduce::equal(gsl::span local_input_shape, gsl::span local_reduced_axes) { if (gsl::make_span(input_shape) != local_input_shape) @@ -267,12 +266,6 @@ void ReduceAggregatorBase::FastReduceKRK(const Tensor&, const std::vector(n_row * n_col * element_size), - static_cast(n_row * element_size), - static_cast(n_row * n_col * element_size * n_ops)}; -} - void NoTransposePrepareForReduce(const TensorShape& new_input_shape, gsl::span reduced_axes, ResultsNoTransposePrepareForReduce& results) { diff --git 
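
reduction_ops.cc drops two function definitions here because the companion header change (next hunk) turns them into `constexpr` functions defined in the header; a `constexpr` function is implicitly inline, so one definition serves every includer, and callers can evaluate it at compile time. A reduced sketch of the pattern with a stand-in enum (the real FastReduceKind values live in reduction_ops.h):

    #include <cstdint>

    enum class FastReduceKind : uint8_t { kNone = 0, kK = 1, kR = 2 };

    // previously declared in the header and defined in the .cc; now a single
    // constexpr definition that is usable in constant expressions
    constexpr bool IsAvailable(FastReduceKind scenario, FastReduceKind available) {
      return (static_cast<uint8_t>(scenario) & static_cast<uint8_t>(available)) > 0;
    }

    static_assert(IsAvailable(FastReduceKind::kK, FastReduceKind::kK),
                  "evaluated entirely at compile time");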
a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h index 9f064fa345..8acce4524d 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h @@ -34,10 +34,15 @@ bool operator==(FastReduceKind a, FastReduceKind b); bool operator!=(FastReduceKind a, FastReduceKind b); -bool IsFastReduceKindAvailable(FastReduceKind scenario, FastReduceKind available); - +constexpr bool IsFastReduceKindAvailable(FastReduceKind scenario, FastReduceKind available) { + return (static_cast(scenario) & static_cast(available)) > 0; +} /* Evaluate the cost of parallelized FastReduce implementations. */ -TensorOpCost ParallelReduceFastCost(int64_t n_row, int64_t n_col, int64_t element_size, int n_ops); +constexpr TensorOpCost ParallelReduceFastCost(int64_t n_row, int64_t n_col, int64_t element_size, int n_ops) { + return TensorOpCost{static_cast(n_row * n_col * element_size), + static_cast(n_row * element_size), + static_cast(n_row * n_col * element_size * n_ops)}; +} /** This only improves reduce function when reduced axes are contiguous: diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc index 8f4f2aaa32..e53dbb4cf0 100644 --- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc +++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc @@ -14,7 +14,11 @@ #ifdef _MSC_VER #pragma warning(pop) #endif - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif /* ONNX_OPERATOR_SCHEMA(GRU) .SetDoc(R"DOC( @@ -644,7 +648,7 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, for (int r = 0; r < batch_size_; r++) { const T* p_bias_r = use_bias_ ? SafeRawConstPointer(batched_bias_WRr_local + r * hidden_size_, batched_bias_WRr_local_end, hidden_size_) - : nullptr; + : nullptr; // initialize p_rt with input to calculate rt. outputZRH_ has Xt*(Wr^T) + Ht-1*(Rr^T). T* p_rt = SafeRawPointer(outputZRH_, out_added_offset + r * hidden_size_x3 + hidden_size_, hidden_size_); @@ -738,7 +742,7 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, const T* p_bias_z = use_bias_ ? 
SafeRawConstPointer(batched_bias_WRz_local, batched_bias_WRz_local_end, hidden_size_) - : nullptr; + : nullptr; // initialize p_zt with Xt*(Wz^T) + Ht-1*(Rz^T), which is most of the input to calculate zt: T* p_zt = SafeRawPointer(outputZRH_, out_added_offset + r * hidden_size_x3, hidden_size_); @@ -788,7 +792,7 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, prev_Ht = output; prev_Ht_end = output_end; } - } // End parallel section + } // End parallel section // copy last output to final_hidden_state for (int i = 0; i < batch_size_; i++) { diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc index a711a59bd1..b602adb32e 100644 --- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc +++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc @@ -185,7 +185,7 @@ Status DeepCpuLstmOp::TryPackWeights(const Tensor& weights, PackedWeights& packe const size_t N = static_cast(shape[1]); const size_t K = static_cast(shape[2]); - if ((shape[0] != num_directions_) || (N != static_cast(hidden_size_ * 4))) { + if ((shape[0] != num_directions_) || (N != static_cast(hidden_size_) * 4)) { return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/rnn/lstm_base.cc b/onnxruntime/core/providers/cpu/rnn/lstm_base.cc index 44d785b62b..cfabe3fb45 100644 --- a/onnxruntime/core/providers/cpu/rnn/lstm_base.cc +++ b/onnxruntime/core/providers/cpu/rnn/lstm_base.cc @@ -3,7 +3,10 @@ #include "lstm_base.h" #include "uni_directional_lstm.h" - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26451) +#endif namespace onnxruntime { using namespace rnn::detail; diff --git a/onnxruntime/core/providers/cpu/rnn/rnn.cc b/onnxruntime/core/providers/cpu/rnn/rnn.cc index 1f0e0610da..a51129ceac 100644 --- a/onnxruntime/core/providers/cpu/rnn/rnn.cc +++ b/onnxruntime/core/providers/cpu/rnn/rnn.cc @@ -9,7 +9,11 @@ #include "core/providers/cpu/rnn/rnn_helpers.h" #include "core/util/math.h" #include "core/util/math_cpuonly.h" - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif namespace onnxruntime { ONNX_CPU_OPERATOR_VERSIONED_KERNEL( RNN, diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.cc b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.cc index a7c20b9ed6..ece449095c 100644 --- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.cc +++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.cc @@ -17,7 +17,10 @@ #include "core/providers/cpu/rnn/rnn_activation_functors.h" #include "core/util/math.h" #include "core/util/math_cpuonly.h" - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace rnn { namespace detail { @@ -311,21 +314,21 @@ void ComputeGemm(const int M, namespace deepcpu { -const float alpha_1 = 4.89352455891786e-03f; -const float alpha_3 = 6.37261928875436e-04f; -const float alpha_5 = 1.48572235717979e-05f; -const float alpha_7 = 5.12229709037114e-08f; -const float alpha_9 = -8.60467152213735e-11f; -const float alpha_11 = 2.00018790482477e-13f; -const float alpha_13 = -2.76076847742355e-16f; +constexpr float alpha_1 = 4.89352455891786e-03f; +constexpr float alpha_3 = 6.37261928875436e-04f; +constexpr float alpha_5 = 1.48572235717979e-05f; +constexpr float alpha_7 = 5.12229709037114e-08f; +constexpr float alpha_9 = -8.60467152213735e-11f; +constexpr float alpha_11 = 
2.00018790482477e-13f; +constexpr float alpha_13 = -2.76076847742355e-16f; -const float beta_0 = 4.89352518554385e-03f; -const float beta_2 = 2.26843463243900e-03f; -const float beta_4 = 1.18534705686654e-04f; -const float beta_6 = 1.19825839466702e-06f; +constexpr float beta_0 = 4.89352518554385e-03f; +constexpr float beta_2 = 2.26843463243900e-03f; +constexpr float beta_4 = 1.18534705686654e-04f; +constexpr float beta_6 = 1.19825839466702e-06f; -const float sigmoid_bound = 20.0f; -const float tanh_bound = 10.0f; +constexpr float sigmoid_bound = 20.0f; +constexpr float tanh_bound = 10.0f; #if defined(__GNUC__) && !defined(__wasm__) #define restrict __restrict__ @@ -918,4 +921,3 @@ GruOutputGateFuncPtr GruOutputGateFuncByName(const std::string& func) { } // namespace detail } // namespace rnn } // namespace onnxruntime - diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h index e23d516350..c94cdf14ab 100644 --- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h +++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h @@ -110,7 +110,7 @@ void ReverseSequence(gsl::span inputs, for (int i = 0; i < batch_size; i++) { int seq_len = sequence_lengths[i]; - for (int j = 0; j < seq_len; j++) { + for (ptrdiff_t j = 0; j < seq_len; j++) { gsl::span src = inputs.subspan(j * batch_size * input_size + i * input_size, input_size); gsl::span dest = inputs_reverse.subspan(num_directions * (seq_len - j - 1) * batch_size * input_size + i * input_size, input_size); diff --git a/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc b/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc index 715aa931a3..605ebacaee 100644 --- a/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc +++ b/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc @@ -4,7 +4,11 @@ #include "uni_directional_lstm.h" #include "core/platform/threadpool.h" - +//TODO: fix the warnings +#if defined(_MSC_VER) && !defined(__clang__) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif namespace onnxruntime { namespace lstm { @@ -82,7 +86,7 @@ UniDirectionalLstm::UniDirectionalLstm( template void UniDirectionalLstm::AllocateBuffers() { // allocate and fill with zeroes - const bool fill = true; + constexpr bool fill = true; hidden0_ = Allocate(allocator_, hidden_size_, hidden0_ptr_, fill); internal_memory_prev_ = Allocate(allocator_, hidden_size_, internal_memory_prev_ptr_, fill); batched_hidden0_ = Allocate(allocator_, batch_size_ * hidden_size_, batched_hidden0_ptr_); diff --git a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc index 412c6d6aa0..9645028745 100644 --- a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc +++ b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc @@ -69,7 +69,7 @@ static int64_t GetSeqIdx(const Tensor& idx_tensor) { return seq_idx; } -bool ValidateSeqIdx(int64_t input_seq_idx, int64_t seq_size) { +constexpr bool ValidateSeqIdx(int64_t input_seq_idx, int64_t seq_size) { bool retval = false; if (input_seq_idx < 0) { retval = input_seq_idx <= -1 && input_seq_idx >= -seq_size; @@ -536,7 +536,7 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu copy_data(src, dst, count); }); - input_offset += split_size * after_dims_excluding_split; // offset by the N data we used in this iteration + input_offset += static_cast(split_size) * after_dims_excluding_split; // offset by the N data we used in this 
iteration

   // if keep_dims = 0, reshape the tensor by dropping the dimension corresponding to 'axis'
   if (use_keep_dims && keepdims_ == 0) {
diff --git a/onnxruntime/core/providers/cpu/tensor/expand.cc b/onnxruntime/core/providers/cpu/tensor/expand.cc
index 085214f290..466bb83ad1 100644
--- a/onnxruntime/core/providers/cpu/tensor/expand.cc
+++ b/onnxruntime/core/providers/cpu/tensor/expand.cc
@@ -78,9 +78,9 @@ Status Expand::Compute(OpKernelContext* context) const {
     return Status::OK();
   }

-  std::unique_ptr<int64_t[]> input_dim_group{new int64_t[max_dims_size]};
-  std::unique_ptr<int64_t[]> output_dim_group{new int64_t[max_dims_size]};
-  std::unique_ptr<int64_t[]> expand_dim_size{new int64_t[max_dims_size]};
+  std::unique_ptr<int64_t[]> input_dim_group = std::make_unique<int64_t[]>(max_dims_size);
+  std::unique_ptr<int64_t[]> output_dim_group = std::make_unique<int64_t[]>(max_dims_size);
+  std::unique_ptr<int64_t[]> expand_dim_size = std::make_unique<int64_t[]>(max_dims_size);
   auto dim_group_start = max_dims_size;
   for (int64_t input_dims_iter = input_dims_size - 1,
diff --git a/onnxruntime/core/providers/cpu/tensor/gather_nd.cc b/onnxruntime/core/providers/cpu/tensor/gather_nd.cc
index cec467e066..4e75de493f 100644
--- a/onnxruntime/core/providers/cpu/tensor/gather_nd.cc
+++ b/onnxruntime/core/providers/cpu/tensor/gather_nd.cc
@@ -100,7 +100,7 @@ Status GatherNDBase::PrepareForCompute(const TensorShape& input_shape, const Ten
       relative_slice_offset += index * sizes_from_slice_dims[dim_idx];
     }

-    p.slice_offsets[slice_idx] = input_base_offset + relative_slice_offset;
+    p.slice_offsets[slice_idx] = static_cast<uint64_t>(input_base_offset) + relative_slice_offset;
   };

   concurrency::ThreadPool::TryParallelFor(
diff --git a/onnxruntime/core/providers/cpu/tensor/nonzero_op.cc b/onnxruntime/core/providers/cpu/tensor/nonzero_op.cc
index 0b09f213b6..9fc0aeb2b9 100644
--- a/onnxruntime/core/providers/cpu/tensor/nonzero_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/nonzero_op.cc
@@ -77,7 +77,7 @@ Status NonZero<T>::Compute(OpKernelContext* context) const {
   // as we iterate the entries, increment the coordinate for the current entry
   // e.g.
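
The expand.cc hunk above replaces the raw array `new` with the array form of `std::make_unique`, the C26409-clean equivalent. One behavioral nuance worth knowing: `std::make_unique<T[]>(n)` value-initializes (zeroes) the elements, while `new T[n]` leaves scalars uninitialized. Sketch:

    #include <cstdint>
    #include <memory>

    void Demo(size_t max_dims_size) {
      // before: std::unique_ptr<int64_t[]> dims{new int64_t[max_dims_size]};
      // after: no raw new, and every element starts out as 0
      std::unique_ptr<int64_t[]> dims = std::make_unique<int64_t[]>(max_dims_size);
      if (max_dims_size > 0) dims[0] = 1;
    }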
if shape is {2,2}, we start with 0,0 increment to 0,1 increment to 1,0 and finally 1,1 auto increment_coordinate = [&coordinate, &coordinate_size, &X_shape]() { - for (int64_t idx = coordinate_size - 1; idx >= 0; --idx) { + for (Eigen::Index idx = coordinate_size - 1; idx >= 0; --idx) { int64_t& cur_coord = coordinate[idx]; if (cur_coord != X_shape[idx] - 1) { ++cur_coord; diff --git a/onnxruntime/core/providers/cpu/tensor/onehot.cc b/onnxruntime/core/providers/cpu/tensor/onehot.cc index cd621f968d..42e20284c5 100644 --- a/onnxruntime/core/providers/cpu/tensor/onehot.cc +++ b/onnxruntime/core/providers/cpu/tensor/onehot.cc @@ -94,7 +94,7 @@ Status PrepareOutputShape(const Tensor* indices, const int64_t depth_val, const output_shape = indices_shape.GetDimsAsVector(); // output rank is always 1 more than the input rank as a new dimension is added to the input shape - const auto output_rank = static_cast(indices_num_dims + 1); + const auto output_rank = static_cast(indices_num_dims) + 1; auto true_axis = HandleNegativeAxis(axis, output_rank); diff --git a/onnxruntime/core/providers/cpu/tensor/reverse_sequence.cc b/onnxruntime/core/providers/cpu/tensor/reverse_sequence.cc index 493ff62b1d..a76f82fb39 100644 --- a/onnxruntime/core/providers/cpu/tensor/reverse_sequence.cc +++ b/onnxruntime/core/providers/cpu/tensor/reverse_sequence.cc @@ -79,7 +79,7 @@ Status ReverseSequenceOp::Compute(OpKernelContext* context) const { return status; } -static int64_t TimeMajorInputOffset(const int64_t max_seq_len, +constexpr static int64_t TimeMajorInputOffset(const int64_t max_seq_len, const int64_t batch_size, const int64_t input_size, const int64_t batch_num, @@ -88,7 +88,7 @@ static int64_t TimeMajorInputOffset(const int64_t max_seq_len, return seq_num * batch_size * input_size + batch_num * input_size; } -static int64_t BatchMajorInputOffset(const int64_t max_seq_len, +constexpr static int64_t BatchMajorInputOffset(const int64_t max_seq_len, const int64_t batch_size, const int64_t input_size, const int64_t batch_num, @@ -97,7 +97,7 @@ static int64_t BatchMajorInputOffset(const int64_t max_seq_len, return batch_num * max_seq_len * input_size + seq_num * input_size; } -static int64_t TimeMajorOutputOffset(const int64_t max_seq_len, +constexpr static int64_t TimeMajorOutputOffset(const int64_t max_seq_len, const int64_t batch_size, const int64_t input_size, const int64_t batch_num, @@ -107,7 +107,7 @@ static int64_t TimeMajorOutputOffset(const int64_t max_seq_len, return (seq_len - seq_num - 1) * batch_size * input_size + batch_num * input_size; } -static int64_t BatchMajorOutputOffset(const int64_t max_seq_len, +constexpr static int64_t BatchMajorOutputOffset(const int64_t max_seq_len, const int64_t batch_size, const int64_t input_size, const int64_t batch_num, diff --git a/onnxruntime/core/providers/cpu/tensor/scatter_nd.cc b/onnxruntime/core/providers/cpu/tensor/scatter_nd.cc index b5a19fd44c..3fb3956cdc 100644 --- a/onnxruntime/core/providers/cpu/tensor/scatter_nd.cc +++ b/onnxruntime/core/providers/cpu/tensor/scatter_nd.cc @@ -64,7 +64,7 @@ Status ScatterNDBase::ValidateShapes(const TensorShape& input_shape, // Validate rank of update tensor // Per spec, the rank of the update tensor should be: // (Rank of input tensor) + (Rank of indices tensor) -1 - last_indice_dimension - if (update_rank != (input_rank + indice_rank - 1 - static_cast(last_indice_dimension))) { + if (update_rank != (input_rank + indice_rank - 1 - static_cast(last_indice_dimension))) { return true; } diff --git 
a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc index e565c2467b..a62bad2178 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc @@ -62,7 +62,7 @@ ONNX_CPU_OPERATOR_KERNEL( // intermediate tensor shapes are: // (batch, blocksize, blocksize, input_depth / (blocksize * blocksize), input_height, input_width) for DepthToSpace // (batch, input_depth, input_height / blocksize, blocksize, input_width / blocksize, blocksize) for SpaceToDepth -const int IntermediateTensorRank = 6; +constexpr int IntermediateTensorRank = 6; template using EigenTensorMap = Eigen::TensorMap, Eigen::Aligned>; diff --git a/onnxruntime/core/providers/cpu/tensor/split.cc b/onnxruntime/core/providers/cpu/tensor/split.cc index 74d8be1fce..345c9d2be3 100644 --- a/onnxruntime/core/providers/cpu/tensor/split.cc +++ b/onnxruntime/core/providers/cpu/tensor/split.cc @@ -197,7 +197,7 @@ Status Split::ComputeImpl(OpKernelContext& context, const Tensor& input) const { copy_data(src, dst, count); }); - input_offset += split_size * after_dims_excluding_split; // offset by the N data we used in this iteration + input_offset += static_cast(split_size) * after_dims_excluding_split; // offset by the N data we used in this iteration } return Status::OK(); diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.cc b/onnxruntime/core/providers/cpu/tensor/transpose.cc index 16d0f5c746..de6a1b33cf 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/cpu/tensor/transpose.cc @@ -78,7 +78,7 @@ static void IncrementIndexAndComputeOffsetSetup(MultiIndex& mindex, size_t num_a for (size_t i = 0; i < num_axes; ++i) { if (target_dims[i] == 1) continue; - mindex.InitAxis(naxes, 0, static_cast(target_dims[i]), stride[i] * element_size); + mindex.InitAxis(naxes, 0, static_cast(target_dims[i]), stride[i] * static_cast(element_size)); ++naxes; } ORT_ENFORCE(naxes > 0, "Method IncrementIndexAndComputeOffset assumes this value is strictly positive."); diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.h b/onnxruntime/core/providers/cpu/tensor/upsample.h index 3d998187e8..dd9dc489de 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.h +++ b/onnxruntime/core/providers/cpu/tensor/upsample.h @@ -7,7 +7,11 @@ #include "core/framework/op_kernel.h" #endif #include - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +// Chance of arithmetic overflow could be reduced +#pragma warning(disable : 26451) +#endif namespace onnxruntime { constexpr const char* UpsampleModeNN = "nearest"; @@ -16,9 +20,9 @@ constexpr const char* UpsampleModeCubic = "cubic"; // In case of cubic mode the grid used to calculate the interpolation value // is a 4x4 matrix -const size_t CubicModeGridLength = 4; +constexpr size_t CubicModeGridLength = 4; -using GetNearestPixelFunc = int64_t(*)(float, bool); +using GetNearestPixelFunc = int64_t (*)(float, bool); using GetOriginalCoordinateFunc = float (*)(float, float, float, float, float, float); enum UpsampleMode { @@ -375,3 +379,6 @@ class Upsample : public UpsampleBase, public OpKernel { }; } // namespace onnxruntime +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif \ No newline at end of file diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h index 8268e40292..cb3232ffd5 100644 --- a/onnxruntime/core/providers/cuda/cuda_kernel.h +++ 
b/onnxruntime/core/providers/cuda/cuda_kernel.h @@ -30,7 +30,7 @@ class CudaKernel : public OpKernel { if (s.IsOK()) { auto err = cudaGetLastError(); if (err != cudaSuccess) { - s = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA error ", cudaGetErrorName(err), ":", cudaGetErrorString(err)); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA error ", cudaGetErrorName(err), ":", cudaGetErrorString(err)); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp index 02733453f7..04d1e6859f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp @@ -450,9 +450,9 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel( constantCpuInputCapture, shapeInferrerCapture, defaultAttributesCapture - ](const onnxruntime::OpKernelInfo& info) -> onnxruntime::OpKernel* + ](onnxruntime::FuncManager&, const onnxruntime::OpKernelInfo& info, std::unique_ptr& out) -> onnxruntime::common::Status { - return new AbiOpKernel( + out = std::make_unique( kernelFactoryCapture.Get(), info, requiresInputShapesAtCreation, @@ -461,6 +461,7 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel( constantCpuInputCapture, shapeInferrerCapture.Get(), &defaultAttributesCapture); + return Status::OK(); }; onnxruntime::KernelCreateInfo create_info(builder.Build(), lotusKernelCreateFn); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp index cac81590ed..ba65508d2b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp @@ -681,9 +681,10 @@ namespace Dml printf("\n"); #endif - auto fused_kernel_func = [partitionNodePropsMap, transferredInitializerMap](const onnxruntime::OpKernelInfo& info) mutable ->onnxruntime::OpKernel* + auto fused_kernel_func = [partitionNodePropsMap, transferredInitializerMap](onnxruntime::FuncManager& func_mgr, const onnxruntime::OpKernelInfo& info, std::unique_ptr& out) mutable ->onnxruntime::Status { - return CreateFusedGraphKernel(info, partitionNodePropsMap, *transferredInitializerMap); + out.reset(CreateFusedGraphKernel(info, partitionNodePropsMap, *transferredInitializerMap)); + return Status::OK(); }; // build the kernel definition on the fly, and register it to the fused_kernel_regisitry. diff --git a/onnxruntime/core/session/IOBinding.h b/onnxruntime/core/session/IOBinding.h index 2b97dbeb22..61f7557f03 100644 --- a/onnxruntime/core/session/IOBinding.h +++ b/onnxruntime/core/session/IOBinding.h @@ -50,30 +50,30 @@ class IOBinding { common::Status BindInput(const std::string& name, const OrtValue& ml_value); /** - * If the BindInput calls are async this function acts as a barrier to ensure all inputs are fully copied - * before you call the Run() method. There is no point calling Run() if you're inputs are not ready at the - * desired location. - * This is a blocking call and is a wrapper over IExecutionProvider::Sync(). - * Call InferenceSession::Run() only after calling this method or else you'll end up wasting cycles inside Run(). - */ + * If the BindInput calls are async this function acts as a barrier to ensure all inputs are fully copied + * before you call the Run() method. 
There is no point calling Run() if your inputs are not ready at the
-    * desired location.
-    * This is a blocking call and is a wrapper over IExecutionProvider::Sync().
-    * Call InferenceSession::Run() only after calling this method or else you'll end up wasting cycles inside Run().
-    */
+   * If the BindInput calls are async this function acts as a barrier to ensure all inputs are fully copied
+   * before you call the Run() method. There is no point calling Run() if your inputs are not ready at the
+   * desired location.
+   * This is a blocking call and is a wrapper over IExecutionProvider::Sync().
+   * Call InferenceSession::Run() only after calling this method or else you'll end up wasting cycles inside Run().
+   */
   common::Status SynchronizeInputs();
   common::Status SynchronizeOutputs();

   /**
-    * Bind an output name to a provided pre-allocated OrtValue.
-    */
+   * Bind an output name to a provided pre-allocated OrtValue.
+   */
   common::Status BindOutput(const std::string& name, const OrtValue& ml_value);

   /**
-    * Bind an output name to a device.
-    *
-    * @param device Device to allocate the output on. Default is CPU.
-    */
+   * Bind an output name to a device.
+   *
+   * @param device Device to allocate the output on. Default is CPU.
+   */
   common::Status BindOutput(const std::string& name, OrtDevice device = {});

   /**
-    * This simply collects the outputs obtained after calling Run() inside the @param outputs.
-    */
+   * This simply collects the outputs obtained after calling Run() inside the @param outputs.
+   */
   const std::vector<std::string>& GetOutputNames() const;
   const std::vector<OrtValue>& GetOutputs() const;
   std::vector<OrtValue>& GetOutputs();
@@ -82,22 +82,22 @@ class IOBinding {
   const std::vector<OrtValue>& GetInputs() const;

   /**
-    * Get a CPU allocator from provider for async copy later if the provider supports that
-    * If it doesn't support that, return the default allocator from CPU provider
-    * \return a nonnull pointer
-    */
+   * Get a CPU allocator from provider for async copy later if the provider supports that
+   * If it doesn't support that, return the default allocator from CPU provider
+   * \return a nonnull pointer
+   */
   AllocatorPtr GetCPUAllocator(int id, onnxruntime::ProviderType provider_type) const;

   /**
-    * clear inputs or outputs. IOBinding is stateful. There are cases we need to reset its state.
-    */
+   * clear inputs or outputs. IOBinding is stateful. There are cases we need to reset its state.
+   */
   void ClearOutputs();
   void ClearInputs();

+  IOBinding(const SessionState& session_state);
 private:
   friend InferenceSession;

-  IOBinding(const SessionState& session_state);
   const SessionState& session_state_;
   std::vector<std::string> feed_names_;
   std::vector<OrtValue> feeds_;
diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc
index e5cb9fa176..cb721b65f8 100644
--- a/onnxruntime/core/session/abi_session_options.cc
+++ b/onnxruntime/core/session/abi_session_options.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
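
The IOBinding hunk above, the two DML hunks earlier, and the custom_ops.cc hunk just below all adapt to the same interface change: kernel- and binding-object creation no longer returns a raw, owning pointer; failure is reported through Status and ownership is passed out through a std::unique_ptr parameter. A sketch with stand-in types, not the real ORT declarations (which also take a FuncManager and an OpKernelInfo):

    #include <memory>

    struct Status { static Status OK() { return {}; } };
    struct OpKernel { virtual ~OpKernel() = default; };
    struct MyKernel final : OpKernel {};

    // new-style creation function: no raw owning pointer escapes, and error
    // propagation does not depend on returning nullptr
    using KernelCreateFn = Status (*)(std::unique_ptr<OpKernel>& out);

    Status CreateMyKernel(std::unique_ptr<OpKernel>& out) {
      out = std::make_unique<MyKernel>();
      return Status::OK();
    }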
#include "core/graph/onnx_protobuf.h" +#include "core/common/gsl_suppress.h" #include "core/session/onnxruntime_c_api.h" #include "core/session/ort_apis.h" #include "core/framework/error_code_helper.h" diff --git a/onnxruntime/core/session/allocator_adapters.cc b/onnxruntime/core/session/allocator_adapters.cc index a3a8cb273a..6dd495e578 100644 --- a/onnxruntime/core/session/allocator_adapters.cc +++ b/onnxruntime/core/session/allocator_adapters.cc @@ -47,7 +47,9 @@ void IAllocatorImplWrappingOrtAllocator::Free(void* p) { } } // namespace onnxruntime - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif ORT_API_STATUS_IMPL(OrtApis::CreateAllocator, const OrtSession* sess, const OrtMemoryInfo* mem_info, _Outptr_ OrtAllocator** out) { API_IMPL_BEGIN diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 4d6b8b382b..6510888995 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -258,8 +258,9 @@ common::Status CreateCustomRegistry(const std::vector& op_do def_builder.Provider(onnxruntime::kCpuExecutionProvider); } - KernelCreateFn kernel_create_fn = [op](const OpKernelInfo& info) -> OpKernel* { - return new CustomOpKernel(info, *op); + KernelCreateFn kernel_create_fn = [op](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { + out = std::make_unique(info, *op); + return Status::OK(); }; KernelCreateInfo create_info(def_builder.Build(), kernel_create_fn); diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index 563159ca93..ba479a1b80 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -50,7 +50,7 @@ Status Environment::Create(std::unique_ptr logging_mana std::unique_ptr& environment, const OrtThreadingOptions* tp_options, bool create_global_thread_pools) { - environment = std::unique_ptr(new Environment()); + environment = std::make_unique(); auto status = environment->Initialize(std::move(logging_manager), tp_options, create_global_thread_pools); return status; } diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index b525224b79..4641e2da89 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1982,8 +1982,7 @@ common::Status InferenceSession::NewIOBinding(std::unique_ptr* io_bin } } - // private constructor, can't use make_unique - *io_binding = std::unique_ptr(new IOBinding(*session_state_)); + *io_binding = std::make_unique(*session_state_); return Status::OK(); } diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 74880c7ba3..086c7236f3 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2131,6 +2131,8 @@ ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfgV2, _In_reads_(num_keys) const char* API_IMPL_END } +//Allow using raw new/delete because this is for C. 
+GSL_SUPPRESS(r .11)
 ORT_API(void, OrtApis::ReleaseArenaCfg, _Frees_ptr_opt_ OrtArenaCfg* ptr) {
   delete ptr;
 }
diff --git a/onnxruntime/core/session/ort_env.cc b/onnxruntime/core/session/ort_env.cc
index 834bbbeae9..dd10f0eb68 100644
--- a/onnxruntime/core/session/ort_env.cc
+++ b/onnxruntime/core/session/ort_env.cc
@@ -16,7 +16,7 @@
 using namespace onnxruntime;
 using namespace onnxruntime::logging;

-OrtEnv* OrtEnv::p_instance_ = nullptr;
+std::unique_ptr<OrtEnv> OrtEnv::p_instance_;
 int OrtEnv::ref_count_ = 0;
 onnxruntime::OrtMutex OrtEnv::m_;

@@ -52,19 +52,19 @@ OrtEnv* OrtEnv::GetInstance(const OrtEnv::LoggingManagerConstructionInfo& lm_inf
     if (lm_info.logging_function) {
       std::unique_ptr<ISink> logger = std::make_unique<LoggingWrapper>(lm_info.logging_function,
                                                                        lm_info.logger_param);
-      lmgr.reset(new LoggingManager(std::move(logger),
-                                    static_cast<Severity>(lm_info.default_warning_level),
-                                    false,
-                                    LoggingManager::InstanceType::Default,
-                                    &name));
+      lmgr = std::make_unique<LoggingManager>(std::move(logger),
+                                              static_cast<Severity>(lm_info.default_warning_level),
+                                              false,
+                                              LoggingManager::InstanceType::Default,
+                                              &name);
     } else {
       auto sink = MakePlatformDefaultLogSink();

-      lmgr.reset(new LoggingManager(std::move(sink),
-                                    static_cast<Severity>(lm_info.default_warning_level),
-                                    false,
-                                    LoggingManager::InstanceType::Default,
-                                    &name));
+      lmgr = std::make_unique<LoggingManager>(std::move(sink),
+                                              static_cast<Severity>(lm_info.default_warning_level),
+                                              false,
+                                              LoggingManager::InstanceType::Default,
+                                              &name);
     }

     std::unique_ptr<Environment> env;
     if (!tp_options) {
@@ -75,11 +75,11 @@ OrtEnv* OrtEnv::GetInstance(const OrtEnv::LoggingManagerConstructionInfo& lm_inf
     if (!status.IsOK()) {
       return nullptr;
     }
-    p_instance_ = new OrtEnv(std::move(env));
+    p_instance_ = std::make_unique<OrtEnv>(std::move(env));
   }

   ++ref_count_;
-  return p_instance_;
+  return p_instance_.get();
 }

 void OrtEnv::Release(OrtEnv* env_ptr) {
@@ -87,11 +87,10 @@ void OrtEnv::Release(OrtEnv* env_ptr) {
     return;
   }
   std::lock_guard<OrtMutex> lock(m_);
-  ORT_ENFORCE(env_ptr == p_instance_);  // sanity check
+  ORT_ENFORCE(env_ptr == p_instance_.get());  // sanity check
   --ref_count_;
   if (ref_count_ == 0) {
-    delete p_instance_;
-    p_instance_ = nullptr;
+    p_instance_.reset();
   }
 }
diff --git a/onnxruntime/core/session/ort_env.h b/onnxruntime/core/session/ort_env.h
index 8d85c1a4b3..da852b9425 100644
--- a/onnxruntime/core/session/ort_env.h
+++ b/onnxruntime/core/session/ort_env.h
@@ -59,30 +59,29 @@ struct OrtEnv {
   /**
    * Registers an allocator for sharing between multiple sessions.
    * Returns an error if an allocator with the same OrtMemoryInfo is already registered.
-    */
+   */
   onnxruntime::common::Status RegisterAllocator(onnxruntime::AllocatorPtr allocator);

   /**
    * Creates and registers an allocator for sharing between multiple sessions.
    * Return an error if an allocator with the same OrtMemoryInfo is already registered.
-    */
+   */
   onnxruntime::common::Status CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info,
                                                          const OrtArenaCfg* arena_cfg = nullptr);

   /**
    * Removes registered allocator that was previously registered for sharing between multiple sessions.
- */ + */ onnxruntime::common::Status UnregisterAllocator(const OrtMemoryInfo& mem_info); + OrtEnv(std::unique_ptr value); + ~OrtEnv(); private: - static OrtEnv* p_instance_; + static std::unique_ptr p_instance_; static onnxruntime::OrtMutex m_; static int ref_count_; std::unique_ptr value_; - OrtEnv(std::unique_ptr value1); - ~OrtEnv(); - ORT_DISALLOW_COPY_AND_ASSIGNMENT(OrtEnv); }; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index bafb803e11..cb76556779 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -140,7 +140,9 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { Node::EdgeConstIterator v_; }; - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif // wrapped = The internal object is exposed as an opaque pointer, so we wrap it in a class that forwards every call to the real calls. No members are ever directly accessed // direct = Same implementation is used for shared providers & core code, but some of the methods need to be routed through here to make the linker happy struct ProviderHostImpl : ProviderHost { diff --git a/onnxruntime/core/util/qmath.h b/onnxruntime/core/util/qmath.h index 9c272cde54..34c68ea4ff 100644 --- a/onnxruntime/core/util/qmath.h +++ b/onnxruntime/core/util/qmath.h @@ -59,12 +59,12 @@ void GetQuantizationParameter(const float* data, int64_t num_of_elements, float& block_size = num_of_elements; } - for (int i = 0; i < num_blocks; i++) { + for (int i = 0; i < num_blocks; i++) { aggregate[i].min = std::numeric_limits::max(); aggregate[i].max = std::numeric_limits::lowest(); } - const TensorOpCost unit_cost{static_cast(block_size * sizeof(float)), 2.0, static_cast(block_size)}; + const TensorOpCost unit_cost{static_cast(block_size) * sizeof(float), 2.0, static_cast(block_size)}; concurrency::ThreadPool::TryParallelFor(thread_pool, num_blocks, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { auto begin_idx = begin * block_size; auto end_idx = std::min(std::ptrdiff_t(num_of_elements), end * block_size); @@ -105,8 +105,8 @@ void GetQuantizationParameter(const float* data, int64_t num_of_elements, float& } /** - * @brief Run MlasQuantizeLinear in parallel, with provided thread pool -*/ + * @brief Run MlasQuantizeLinear in parallel, with provided thread pool + */ template void ParQuantizeLinear(const float* Input, OutputType* Output, diff --git a/onnxruntime/core/util/thread_utils.cc b/onnxruntime/core/util/thread_utils.cc index 4446c2f7a4..312076de28 100644 --- a/onnxruntime/core/util/thread_utils.cc +++ b/onnxruntime/core/util/thread_utils.cc @@ -37,20 +37,23 @@ CreateThreadPoolHelper(Env* env, OrtThreadPoolParams options) { } return std::make_unique(env, to, options.name, options.thread_pool_size, - options.allow_spinning); + options.allow_spinning); } std::unique_ptr CreateThreadPool(Env* env, OrtThreadPoolParams options, ThreadPoolType tpool_type) { -// If openmp is enabled we don't want to create any additional threadpools for sequential execution. -// However, parallel execution relies on the existence of a separate threadpool. Hence we allow eigen threadpools -// to be created for parallel execution. + // If openmp is enabled we don't want to create any additional threadpools for sequential execution. + // However, parallel execution relies on the existence of a separate threadpool. Hence we allow eigen threadpools + // to be created for parallel execution. 
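The unit_cost change in the qmath.h hunk above is a behavioral fix, not a formatting one: in the old code the product block_size * sizeof(float) is evaluated in integer arithmetic and only the finished result is cast, so a large block_size can overflow before the widening happens (the pattern MSVC analysis flags as warning 26451). Casting one operand first makes the whole multiplication happen in double. The int64_t shape products in test_conv2d_nchwc.h further down get the same widen-before-multiply treatment. A reduced sketch, with CostBytes as an illustrative name:

    #include <cstddef>

    double CostBytes(std::ptrdiff_t block_size) {
      // Flagged: the product is formed in integer arithmetic and may already
      // have overflowed by the time it is converted.
      //   double bytes = static_cast<double>(block_size * sizeof(float));

      // Fixed: widen one operand first so the multiply itself is done in double.
      return static_cast<double>(block_size) * sizeof(float);
    }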
ORT_UNUSED_PARAMETER(tpool_type); return CreateThreadPoolHelper(env, options); } } // namespace concurrency } // namespace onnxruntime +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif namespace OrtApis { ORT_API_STATUS_IMPL(CreateThreadingOptions, _Outptr_ OrtThreadingOptions** out) { *out = new OrtThreadingOptions(); diff --git a/onnxruntime/python/numpy_helper.h b/onnxruntime/python/numpy_helper.h new file mode 100644 index 0000000000..d9165c3d31 --- /dev/null +++ b/onnxruntime/python/numpy_helper.h @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +namespace onnxruntime { +namespace python { +constexpr bool IsNumericNumpyType(int npy_type) { + return npy_type < NPY_OBJECT || npy_type == NPY_HALF; +} +} // namespace python +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index f169d7f67d..480fdcef04 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -9,6 +9,7 @@ #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #define PY_ARRAY_UNIQUE_SYMBOL onnxruntime_python_ARRAY_API #include +#include "python/numpy_helper.h" #include "core/graph/graph.h" #include "core/framework/tensor_shape.h" @@ -41,10 +42,6 @@ bool IsNumpyArray(py::object& obj) { return PyObjectCheck_NumpyArray(obj.ptr()); } -bool IsNumericNumpyType(int npy_type) { - return npy_type < NPY_OBJECT || npy_type == NPY_HALF; -} - int GetNumpyArrayType(const py::object& obj) { return PyArray_TYPE(reinterpret_cast(obj.ptr())); } @@ -253,8 +250,7 @@ MLDataType NumpyTypeToOnnxRuntimeType(int numpy_type) { {NPY_UINT, DataTypeImpl::GetType()}, {NPY_LONGLONG, DataTypeImpl::GetType()}, {NPY_ULONGLONG, DataTypeImpl::GetType()}, - {NPY_OBJECT, DataTypeImpl::GetType()} - }; + {NPY_OBJECT, DataTypeImpl::GetType()}}; const auto it = type_map.find(numpy_type); if (it == type_map.end()) { throw std::runtime_error("No corresponding Numpy type for Tensor Type."); diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h index eb1b43a55d..d253dfdcb1 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h @@ -23,8 +23,6 @@ namespace python { extern const char* PYTHON_ORTVALUE_OBJECT_NAME; extern const char* PYTHON_ORTVALUE_NATIVE_OBJECT_ATTR; -bool IsNumericNumpyType(int npy_type); - bool IsNumericNumpyArray(const pybind11::object& py_object); bool IsNumpyArray(pybind11::object& obj); diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index a4e7523ecf..f37539906f 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -9,6 +9,7 @@ #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #define PY_ARRAY_UNIQUE_SYMBOL onnxruntime_python_ARRAY_API #include +#include "python/numpy_helper.h" #include "core/framework/ort_value.h" #include "core/framework/tensor.h" diff --git a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc index f8f51d257d..5e2839ed5f 100644 --- a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc +++ b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc @@ -9,6 +9,7 @@ #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #define 
PY_ARRAY_UNIQUE_SYMBOL onnxruntime_python_ARRAY_API #include +#include "python/numpy_helper.h" #include "core/framework/tensor_shape.h" #include "core/framework/tensor.h" diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 2ed442df75..2c3a194ede 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -202,15 +202,15 @@ py::object GetPyObjectFromSparseTensor(size_t pos, const OrtValue& ort_value, co if (!data_transfer_manager) { LOGS(logger, WARNING) << "Returned OrtValue with sparse tensor at position: " << pos << " is on GPU but no data_transfer_manager provided." << " Returned it will have its data on GPU, you can copy it using numpy_array_to_cpu()"; - py_sparse_tensor.reset(new PySparseTensor(ort_value)); + py_sparse_tensor = std::make_unique(ort_value); } else { auto dst_sparse_tensor = std::make_unique(src_sparse_tensor.DataType(), src_sparse_tensor.DenseShape(), GetAllocator()); auto status = src_sparse_tensor.Copy(*data_transfer_manager, 0, *dst_sparse_tensor); OrtPybindThrowIfError(status); - py_sparse_tensor.reset(new PySparseTensor(std::move(dst_sparse_tensor))); + py_sparse_tensor = std::make_unique(std::move(dst_sparse_tensor)); } } else { - py_sparse_tensor.reset(new PySparseTensor(ort_value)); + py_sparse_tensor = std::make_unique(ort_value); } py::object result = py::cast(py_sparse_tensor.get(), py::return_value_policy::take_ownership); @@ -1301,7 +1301,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") }, "node shape (assuming the node holds a tensor)"); - py::class_(m, "SessionObjectInitializer"); + py::class_ sessionObjectInitializer(m, "SessionObjectInitializer"); py::class_(m, "InferenceSession", R"pbdoc(This is the main class used to run a model.)pbdoc") // In Python3, a Python bytes object will be passed to C++ functions that accept std::string or char* // without any conversion. So this init method can be used for model file path (string) and model content (bytes) @@ -1542,7 +1542,7 @@ void InitializeEnv() { InitArray(); Env::Default().GetTelemetryProvider().SetLanguageProjection(OrtLanguageProjection::ORT_PROJECTION_PYTHON); OrtPybindThrowIfError(Environment::Create(std::make_unique( - std::unique_ptr{new CLogSink{}}, + std::make_unique(), Severity::kWARNING, false, LoggingManager::InstanceType::Default, &SessionObjectInitializer::default_logger_id), session_env)); diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h index 0218c2c5f1..db4eda1259 100644 --- a/onnxruntime/python/onnxruntime_pybind_state_common.h +++ b/onnxruntime/python/onnxruntime_pybind_state_common.h @@ -379,9 +379,9 @@ class PySparseTensor { std::unique_ptr AsOrtValue() const; private: - // instance_ represents data that comes as input. Thus we depend on numpy - // arrays that own the underlying memory to stay around. We store copies - // of py::objects for those arrays in backing_storage_ as an extra ref-count. + // instance_ represents data that comes as input. Thus we depend on numpy + //arrays that own the underlying memory to stay around. We store copies + //of py::objects for those arrays in backing_storage_ as an extra ref-count. // If we have and are able to copy from the OrtValue returned by run() to CPU, then this owns the data // and backing_storage_ is empty. 
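Giving the py::class_ temporary in onnxruntime_pybind_state.cc a name (sessionObjectInitializer) addresses the analyzer's objection to constructing and immediately discarding an object with a nontrivial constructor (the ES.84 family of checks). Here the constructor's side effect, registering the binding, is the whole point, so naming the object keeps the registration while making its lifetime explicit. The classic bug the rule guards against looks like the following sketch, which is analogous but not pybind code:

    #include <mutex>

    std::mutex g_mutex;

    void Critical() {
      // Flagged: this temporary locks the mutex and unlocks it again at the
      // end of the statement, leaving the code below unprotected.
      //   std::lock_guard<std::mutex>{g_mutex};

      std::lock_guard<std::mutex> guard(g_mutex);  // named: held until scope exit
      // ... protected work ...
    }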
@@ -393,6 +393,11 @@ class PySparseTensor { }; #endif // !defined(DISABLE_SPARSE_TENSORS) +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +//You can attempt to make 'onnxruntime::python::SessionObjectInitializer::Get' constexpr +#pragma warning(disable : 26497) +#endif class SessionObjectInitializer { public: typedef const PySessionOptions& Arg1; @@ -413,7 +418,9 @@ class SessionObjectInitializer { return SessionObjectInitializer(); } }; - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif Environment& GetEnv(); // Initialize an InferenceSession. diff --git a/onnxruntime/test/contrib_ops/tensor_op_test.cc b/onnxruntime/test/contrib_ops/tensor_op_test.cc index 2d4c836782..a72d2d3d83 100644 --- a/onnxruntime/test/contrib_ops/tensor_op_test.cc +++ b/onnxruntime/test/contrib_ops/tensor_op_test.cc @@ -14,7 +14,7 @@ namespace test { using ExpectResult = OpTester::ExpectResult; TEST(CropContribOpTest, CropBorderOnly) { - const int N = 2, C = 1, H = 3, W = 4; + constexpr int N = 2, C = 1, H = 3, W = 4; std::vector X = {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 3.0f, 4.0f, 5.0f, 3.0f, 4.0f, 5.0f, 6.0f, @@ -37,7 +37,7 @@ TEST(CropContribOpTest, CropBorderOnly) { } TEST(CropContribOpTest, CropBorderAndScale) { - const int N = 2, C = 1, H = 3, W = 4; + constexpr int N = 2, C = 1, H = 3, W = 4; std::vector X = {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 3.0f, 4.0f, 5.0f, 3.0f, 4.0f, 5.0f, 6.0f, @@ -65,7 +65,7 @@ TEST(CropContribOpTest, CropBorderAndScale) { } TEST(ImageScalerContribOpTest, ImageScalerTest) { - const int64_t N = 1, C = 2, H = 2, W = 2; + constexpr int64_t N = 1, C = 2, H = 2, W = 2; std::vector X = { 1.0f, 3.0f, 3.0f, 5.0f, @@ -92,9 +92,9 @@ TEST(ImageScalerContribOpTest, ImageScalerTest) { } void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normalize_variance) { - const int64_t N = 2, C = 2, H = 2, W = 3; - int64_t one = 1; - int64_t zero = 0; + constexpr int64_t N = 2, C = 2, H = 2, W = 3; + constexpr int64_t one = 1; + constexpr int64_t zero = 0; std::vector X = {3.0f, -3.0f, -1.0f, 1.0f, 2.0f, -1.0f, @@ -118,9 +118,9 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz } void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) { - const int64_t N = 2, C = 2, H = 2, W = 3; - int64_t one = 1; - int64_t zero = 0; + constexpr int64_t N = 2, C = 2, H = 2, W = 3; + constexpr int64_t one = 1; + constexpr int64_t zero = 0; std::vector N1C1 = {3.0f, -3.0f, -1.0f, 1.0f, 2.0f, -1.0f}; diff --git a/onnxruntime/test/eager/ort_invoker_test.cc b/onnxruntime/test/eager/ort_invoker_test.cc index d5b0c02d09..ed2efccd76 100644 --- a/onnxruntime/test/eager/ort_invoker_test.cc +++ b/onnxruntime/test/eager/ort_invoker_test.cc @@ -87,8 +87,9 @@ class TestKernel final : public OpKernel { } }; -OpKernel* CreateTestKernel(const OpKernelInfo& info) { - return new TestKernel(info); +Status CreateTestKernel(FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) { + out = std::make_unique(info); + return Status::OK(); } TEST(InvokerTest, CustomOp) { diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 379ef1d4e4..78b598dcf1 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -230,13 +230,13 @@ class PlannerTest : public ::testing::Test { ASSERT_NE(ep, nullptr); auto info = std::make_unique( *p_node, kernel_def, *ep, state_->GetInitializedTensors(), 
state_->GetOrtValueNameIdxMap(), - state_->GetFuncMgr(), state_->GetDataTransferMgr()); + state_->GetDataTransferMgr()); op_kernel_infos_.push_back(std::move(info)); if (!KernelRegistry::HasImplementationOf(*reg, *p_node, onnxruntime::kCpuExecutionProvider)) { auto st = reg->Register( KernelCreateInfo(std::make_unique(kernel_def), - [](const OpKernelInfo& info) -> OpKernel* { return new DummyOpKernel(info); })); + [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique(info); return Status::OK(); })); ORT_ENFORCE(st.IsOK(), st.ErrorMessage()); } @@ -575,7 +575,7 @@ TEST_F(PlannerTest, LocationPlanningForPassThroughExplicitAndImplicitSubgraphInp /* Inputs: iter_num, cond_in, loop carried state variables. iter_num_in cond_in [loop_state_var] (unused) | | - [Identity] [If] + [Identity] [If] | | cond_out loop_state_var_out */ diff --git a/onnxruntime/test/framework/float_16_test.cc b/onnxruntime/test/framework/float_16_test.cc index cf453d099e..f8bdb78660 100644 --- a/onnxruntime/test/framework/float_16_test.cc +++ b/onnxruntime/test/framework/float_16_test.cc @@ -145,7 +145,7 @@ TEST(Float16_Tests, Mul_16_Test) { auto def = MulFP16KernelDef(); //Register a foo kernel which is doing Add, but bind to Mul. - KernelCreateFn kernel_create_fn = [](const OpKernelInfo& info) -> OpKernel* { return new MulFP16Kernel(info); }; + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique(info); return Status::OK(); }; EXPECT_TRUE(registry->RegisterCustomKernel(def, kernel_create_fn).IsOK()); EXPECT_TRUE(session_object.Load(MUL_MODEL_URI).IsOK()); diff --git a/onnxruntime/test/framework/kernel_registry_test.cc b/onnxruntime/test/framework/kernel_registry_test.cc index be9dd2a260..e2490b6ff4 100644 --- a/onnxruntime/test/framework/kernel_registry_test.cc +++ b/onnxruntime/test/framework/kernel_registry_test.cc @@ -26,8 +26,9 @@ class FakeKernel final : public OpKernel { } }; -OpKernel* CreateFakeKernel(const OpKernelInfo& info) { - return new FakeKernel(info); +Status CreateFakeKernel(FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) { + out = std::make_unique(info); + return Status::OK(); } } // namespace diff --git a/onnxruntime/test/framework/local_kernel_registry_test.cc b/onnxruntime/test/framework/local_kernel_registry_test.cc index 1cfdf97311..67e65fee60 100644 --- a/onnxruntime/test/framework/local_kernel_registry_test.cc +++ b/onnxruntime/test/framework/local_kernel_registry_test.cc @@ -95,8 +95,9 @@ KernelDefBuilder FooKernelDef(const char* schema_name) { return def; } -OpKernel* CreateFooKernel(const OpKernelInfo& kernel_info) { - return new FooKernel(kernel_info); +Status CreateFooKernel(FuncManager&, const OpKernelInfo& kernel_info, std::unique_ptr& out) { + out = std::make_unique>(kernel_info); + return Status::OK(); } // kernel with optional outputs @@ -181,8 +182,9 @@ class OptionalOpKernel : public OpKernel { } }; -OpKernel* CreateOptionalOpKernel(const OpKernelInfo& kernel_info) { - return new OptionalOpKernel(kernel_info); +Status CreateOptionalOpKernel(FuncManager&, const OpKernelInfo& kernel_info, std::unique_ptr& out) { + out = std::make_unique>(kernel_info); + return Status::OK(); } static const std::string MUL_MODEL_URI = "testdata/mul_1.onnx"; diff --git a/onnxruntime/test/framework/opaque_kernels_test.cc b/onnxruntime/test/framework/opaque_kernels_test.cc index 632f68deb0..a65f9a19a8 100644 --- a/onnxruntime/test/framework/opaque_kernels_test.cc +++ 
b/onnxruntime/test/framework/opaque_kernels_test.cc @@ -291,9 +291,9 @@ TEST_F(OpaqueTypeTests, RunModel) { EXPECT_TRUE(registry->RegisterOpSet(schemas, onnxruntime::kMLDomain, 8, 9).IsOK()); // Register our kernels here auto ctor_def = ConstructSparseTensorDef(); - EXPECT_TRUE(registry->RegisterCustomKernel(ctor_def, [](const OpKernelInfo& info) { return new ConstructSparseTensor(info); }).IsOK()); + EXPECT_TRUE(registry->RegisterCustomKernel(ctor_def, [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) { out = std::make_unique(info); return Status::OK(); }).IsOK()); auto shape_def = ConstructFetchSparseShape(); - EXPECT_TRUE(registry->RegisterCustomKernel(shape_def, [](const OpKernelInfo& info) { return new FetchSparseTensorShape(info); }).IsOK()); + EXPECT_TRUE(registry->RegisterCustomKernel(shape_def, [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) { out = std::make_unique(info); return Status::OK(); }).IsOK()); IOnnxRuntimeOpSchemaRegistryList custom_schema_registries_ = {registry->GetOpschemaRegistry()}; std::unordered_map domain_to_version = {{onnxruntime::kMLDomain, 8}}; diff --git a/onnxruntime/test/framework/parallel_executor_test.cc b/onnxruntime/test/framework/parallel_executor_test.cc index 49c7800c1b..8dcaad10c5 100644 --- a/onnxruntime/test/framework/parallel_executor_test.cc +++ b/onnxruntime/test/framework/parallel_executor_test.cc @@ -82,7 +82,7 @@ TEST(ParallelExecutor, TestStatusPropagation) { std::vector schemas{TestOp::OpSchema()}; Status status; ASSERT_TRUE((status = registry->RegisterOpSet(schemas, TestOp::OpDomain, 10, 11)).IsOK()) << status; - KernelCreateFn kernel_create_fn = [](const OpKernelInfo& info) { return new typename TestOp::OpKernelImpl(info); }; + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) { out = std::make_unique(info); return Status::OK(); }; auto kernel_def = TestOp::KernelDef(); ASSERT_TRUE((status = registry->RegisterCustomKernel(kernel_def, kernel_create_fn)).IsOK()) << status; @@ -125,7 +125,7 @@ TEST_P(ParallelExecutorThreadPoolTest, TestNullInterOpThreadPool) { std::vector schemas{TestOp::OpSchema()}; Status status; ASSERT_TRUE((status = registry->RegisterOpSet(schemas, TestOp::OpDomain, 10, 11)).IsOK()) << status; - KernelCreateFn kernel_create_fn = [](const OpKernelInfo& info) { return new typename TestOp::OpKernelImpl(info); }; + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) { out = std::make_unique(info); return Status::OK(); }; auto kernel_def = TestOp::KernelDef(); ASSERT_TRUE((status = registry->RegisterCustomKernel(kernel_def, kernel_create_fn)).IsOK()) << status; @@ -144,6 +144,6 @@ TEST_P(ParallelExecutorThreadPoolTest, TestNullInterOpThreadPool) { } INSTANTIATE_TEST_SUITE_P(ParallelExecutorThreadPoolTests, ParallelExecutorThreadPoolTest, - testing::Values(1, 0)); + testing::Values(1, 0)); } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index c6d49e7356..344f563fab 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -76,7 +76,7 @@ TEST_P(SessionStateAddGetKernelTest, AddGetKernelTest) { auto kernel_def = KernelDefBuilder().SetName("Variable").Provider(kCpuExecutionProvider).SinceVersion(1, 10).Build(); OpKernelInfo p_info(node, *kernel_def, *cpu_execution_provider, s.GetConstantInitializedTensors(), - 
s.GetOrtValueNameIdxMap(), s.GetFuncMgr(), s.GetDataTransferMgr()); + s.GetOrtValueNameIdxMap(), s.GetDataTransferMgr()); unique_ptr p_kernel; p_kernel.reset(new TestOpKernel(p_info)); size_t orig_num_outputs = p_kernel->Node().OutputDefs().size(); @@ -88,7 +88,7 @@ TEST_P(SessionStateAddGetKernelTest, AddGetKernelTest) { node.SetExecutionProviderType(kCpuExecutionProvider); std::shared_ptr kernel_registry = std::make_shared(); ASSERT_STATUS_OK(kernel_registry->Register(KernelCreateInfo( - std::move(kernel_def), [](const OpKernelInfo& info) -> OpKernel* { return new TestOpKernel(info); }))); + std::move(kernel_def), [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique(info); return Status::OK(); }))); kernel_registry_manager.RegisterKernelRegistry(kernel_registry); ASSERT_STATUS_OK(s.FinalizeSessionState(ORT_TSTR(""), kernel_registry_manager)); @@ -511,7 +511,7 @@ TEST_P(SessionStatePrepackingTest, PrePackingTest) { auto kernel_def = KernelDefBuilder().SetName("PrePackingTest").Provider(kCpuExecutionProvider).SinceVersion(1).Build(); ASSERT_STATUS_OK(kernel_registry->Register( KernelCreateInfo(std::move(kernel_def), - [](const OpKernelInfo& info) -> OpKernel* { return new PrePackingTestOpKernel(info); }))); + [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique(info); return Status::OK(); }))); kernel_registry_manager.RegisterKernelRegistry(kernel_registry); PlaceAllNodesToCPUEP(model.MainGraph()); @@ -553,7 +553,7 @@ TEST(SessionStateTest, SharedInitalizersWithPrePackingTest) { auto kernel_def = KernelDefBuilder().SetName("PrePackingTest").Provider(kCpuExecutionProvider).SinceVersion(1).Build(); ASSERT_STATUS_OK(kernel_registry->Register( KernelCreateInfo(std::move(kernel_def), - [](const OpKernelInfo& info) -> OpKernel* { return new PrePackingTestOpKernel(info); }))); + [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { out = std::make_unique(info); return Status::OK(); }))); kernel_registry_manager.RegisterKernelRegistry(kernel_registry); // Part 1: Pre-packing enabled + no shared initializers = no pre-packed weights caching diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index 3bba5d1a9d..30d1232a7b 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -95,8 +95,8 @@ This operator constructs a sparse tensor from three tensors that provide a COO } /** - * @brief An implementation of the SparseFromCOO op. - */ + * @brief An implementation of the SparseFromCOO op. 
+ */ class OpKernelImpl final : public OpKernel { public: OpKernelImpl(const OpKernelInfo& info) : OpKernel{info} {} @@ -264,7 +264,7 @@ struct SparseToValues { return schema; } - // A kernel implementation of SparseToValues + // A kernel implementation of SparseToValues class OpKernelImpl final : public OpKernel { public: OpKernelImpl(const OpKernelInfo& info) : OpKernel{info} {} @@ -333,7 +333,7 @@ class SparseTensorTests : public testing::Test { .SetDomain(onnxruntime::kMLDomain) .SinceVersion(10) .Provider(onnxruntime::kCpuExecutionProvider); - KernelCreateFn kernel_create_fn = [](const OpKernelInfo& info) { return new typename Op::OpKernelImpl(info); }; + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) { out = std::make_unique(info); return Status::OK(); }; EXPECT_TRUE(registry2->RegisterCustomKernel(kernel_def_builder, kernel_create_fn).IsOK()); }; register_actions.push_back(register_kernel); @@ -698,9 +698,10 @@ struct InsertIndices { std::vector indices_data; insert_indices_data(indices_1D, values_size, shape_size, indices_data, indices_tp); indices_tp.set_data_type(utils::ToTensorProtoElementType()); - ORT_IF_CONSTEXPR (sizeof(T) == sizeof(int8_t)) { + ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) { indices_tp.mutable_raw_data()->assign(reinterpret_cast(indices_data.data()), indices_data.size()); - } else { + } + else { // Conversion on the fly to the target data type std::vector indices(indices_data.cbegin(), indices_data.cend()); indices_tp.mutable_raw_data()->assign(reinterpret_cast(indices.data()), indices.size() * sizeof(T)); @@ -823,8 +824,7 @@ static void TestConversion( TensorProto_DataType_INT8, TensorProto_DataType_INT16, TensorProto_DataType_INT32, - TensorProto_DataType_INT64 - }; + TensorProto_DataType_INT64}; for (auto dt : indices_types) { TestConversion(true, dt, inserter, checker); @@ -1037,7 +1037,7 @@ TensorProto CreateDenseTensor(size_t indices_start, std::vector& expected_values, std::vector& expected_indicies) { TensorProto result; std::vector values = CreateSparseValues(indices_start); - auto ind_start = static_cast(indices_start); + auto ind_start = static_cast(indices_start); expected_indicies = {ind_start, ind_start + 1}; for (const auto& ind : expected_indicies) { expected_values.push_back(values[ind]); @@ -1110,11 +1110,11 @@ void RawSparseDataChecker(gsl::span expected_bfloat, template static void TestDenseToSparseConversionValues(size_t indices_start, - std::function& values, TensorProto& tp)> inserter, - std::function expected, - gsl::span expected_indicies, - const SparseTensorProto& actual)> - checker) { + std::function& values, TensorProto& tp)> inserter, + std::function expected, + gsl::span expected_indicies, + const SparseTensorProto& actual)> + checker) { std::vector expected_values; std::vector expected_indicies; // Path is required for loading external data @@ -1862,7 +1862,7 @@ TEST(SparseTensorConversionTests, BlockSparse) { indices_span.cbegin(), indices_span.cend())); } } -#endif // !defined(DISABLE_SPARSE_TENSORS) +#endif // !defined(DISABLE_SPARSE_TENSORS) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/mlas/bench/bench_qgemm.cpp b/onnxruntime/test/mlas/bench/bench_qgemm.cpp index d2bf9a57e6..1d21c8732b 100644 --- a/onnxruntime/test/mlas/bench/bench_qgemm.cpp +++ b/onnxruntime/test/mlas/bench/bench_qgemm.cpp @@ -13,9 +13,9 @@ static const std::vector qgemm_arg_names = {"M", "N", "K", "Batch", "Threads"}; void QGEMM(benchmark::State& state, bool pack_b, bool 
a_is_signed) { - const bool b_is_signed = true; - const uint8_t a_zero_point = 29; - const uint8_t b_zero_point = 179; + constexpr bool b_is_signed = true; + constexpr uint8_t a_zero_point = 29; + constexpr uint8_t b_zero_point = 179; if (state.range(0) <= 0) throw std::invalid_argument("M must greater than 0!"); if (state.range(1) <= 0) throw std::invalid_argument("N must greater than 0!"); diff --git a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h index 439ebb027e..e516fa8a0b 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h +++ b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h @@ -31,9 +31,9 @@ class MlasNchwcConv2DTest : public MlasConv2DTest { const float* Filter, const float* Bias, float* Output) override { - int64_t InputShape[] = {int64_t(BatchCount), int64_t(GroupCount * InputChannels), int64_t(InputHeight), int64_t(InputWidth)}; - int64_t FilterShape[] = {int64_t(GroupCount * FilterCount), int64_t(InputChannels), int64_t(KernelHeight), int64_t(KernelWidth)}; - int64_t OutputShape[] = {int64_t(BatchCount), int64_t(GroupCount * FilterCount), int64_t(OutputHeight), int64_t(OutputWidth)}; + int64_t InputShape[] = {int64_t(BatchCount), int64_t(GroupCount) * int64_t(InputChannels), int64_t(InputHeight), int64_t(InputWidth)}; + int64_t FilterShape[] = {int64_t(GroupCount) * int64_t(FilterCount), int64_t(InputChannels), int64_t(KernelHeight), int64_t(KernelWidth)}; + int64_t OutputShape[] = {int64_t(BatchCount), int64_t(GroupCount) * int64_t(FilterCount), int64_t(OutputHeight), int64_t(OutputWidth)}; int64_t KernelShape[] = {int64_t(KernelHeight), int64_t(KernelWidth)}; int64_t DilationShape[] = {int64_t(DilationHeight), int64_t(DilationWidth)}; @@ -149,7 +149,7 @@ class MlasNchwcConv2DTest : public MlasConv2DTest { public: static const char* GetTestSuiteName(void) { - static const std::string suite_name(Threaded? "Conv2dNchwc_Threaded" : "Conv2dNchwc_SingleThread"); + static const std::string suite_name(Threaded ? 
"Conv2dNchwc_Threaded" : "Conv2dNchwc_SingleThread"); return suite_name.c_str(); } diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 0b1b16d301..f4fb214066 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -265,22 +265,22 @@ void LoopDataFile(int test_data_pb_fd, bool is_input, const TestModelInfo& model #if !defined(ORT_MINIMAL_BUILD) std::unique_ptr TestModelInfo::LoadOnnxModel(_In_ const PATH_CHAR_TYPE* model_url) { - return std::unique_ptr(new OnnxModelInfo(model_url)); + return std::make_unique(model_url); } #endif std::unique_ptr TestModelInfo::LoadOrtModel(_In_ const PATH_CHAR_TYPE* model_url) { - return std::unique_ptr(new OnnxModelInfo(model_url, true)); + return std::make_unique(model_url, true); } /** - * test_case_dir must have contents of: - * model.onnx - * ???/input_??.pb - * ???/output_??.pb - * ???/input_??.pb - * ???/output_??.pb - */ + * test_case_dir must have contents of: + * model.onnx + * ???/input_??.pb + * ???/output_??.pb + * ???/input_??.pb + * ???/output_??.pb + */ class OnnxTestCase : public ITestCase { private: std::string test_case_name_; @@ -352,9 +352,9 @@ std::unique_ptr CreateOnnxTestCase(const std::string& test_case_name, std::unique_ptr model, double default_per_sample_tolerance, double default_relative_per_sample_tolerance) { - return std::unique_ptr(new OnnxTestCase(test_case_name, std::move(model), - default_per_sample_tolerance, - default_relative_per_sample_tolerance)); + return std::make_unique(test_case_name, std::move(model), + default_per_sample_tolerance, + default_relative_per_sample_tolerance); } void OnnxTestCase::GetPerSampleTolerance(double* value) const { diff --git a/onnxruntime/test/onnx/dataitem_request.cc b/onnxruntime/test/onnx/dataitem_request.cc index 41e81baa13..abe8111a29 100644 --- a/onnxruntime/test/onnx/dataitem_request.cc +++ b/onnxruntime/test/onnx/dataitem_request.cc @@ -39,7 +39,7 @@ void DataTaskRequestContext::Request(const Callback& cb, concurrency::ThreadPool const ITestCase& c, Ort::Session& session, OrtAllocator* allocator, size_t task_id) { assert(cb); - std::unique_ptr self(new DataTaskRequestContext(cb, c, session, allocator, task_id)); + std::unique_ptr self = std::make_unique(cb, c, session, allocator, task_id); CallableFactory f(self.get()); auto runnable = f.GetCallable<&DataTaskRequestContext::RunAsync>(); onnxruntime::concurrency::ThreadPool::Schedule(tp, [runnable]() { runnable.Invoke(); }); diff --git a/onnxruntime/test/onnx/dataitem_request.h b/onnxruntime/test/onnx/dataitem_request.h index 417b800e5c..96a0e64cd8 100644 --- a/onnxruntime/test/onnx/dataitem_request.h +++ b/onnxruntime/test/onnx/dataitem_request.h @@ -8,7 +8,6 @@ #include "core/common/common.h" #include "core/platform/env_time.h" - class ITestCase; struct OrtAllocator; @@ -40,7 +39,7 @@ class DataTaskRequestContext { /// this task id /// execution result and elapsed time static std::pair Run(const ITestCase& c, ::Ort::Session& session, - OrtAllocator* allocator, size_t task_id); + OrtAllocator* allocator, size_t task_id); /// /// Schedules a data task to run on a threadpool. 
The function @@ -68,7 +67,6 @@ class DataTaskRequestContext { return spent_time_; } - private: DataTaskRequestContext(const Callback& cb, const ITestCase& test_case, ::Ort::Session& session, OrtAllocator* allocator, size_t task_id) @@ -80,6 +78,7 @@ class DataTaskRequestContext { SetTimeSpecToZero(&spent_time_); } + private: void RunAsync(); std::pair RunImpl(); diff --git a/onnxruntime/test/onnx/microbenchmark/activation.cc b/onnxruntime/test/onnx/microbenchmark/activation.cc index d0b417b84e..d763e89219 100644 --- a/onnxruntime/test/onnx/microbenchmark/activation.cc +++ b/onnxruntime/test/onnx/microbenchmark/activation.cc @@ -69,7 +69,7 @@ struct KernelAndDef { .SetDomain(domain) .TypeConstraint("T", DataTypeImpl::GetTensorType()) .Build(); - OpKernelInfo info(main_node, *out.def, *out.a, {}, {}, {}, {}); + OpKernelInfo info(main_node, *out.def, *out.a, {}, {}, {}); out.kernel = std::make_unique(info); return out; } diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc index 4cff3880d5..902ee79d1b 100644 --- a/onnxruntime/test/onnx/tensorprotoutils.cc +++ b/onnxruntime/test/onnx/tensorprotoutils.cc @@ -69,7 +69,7 @@ static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length ORT_CXX_API_THROW(MakeString("UnpackTensor: the pre-allocated size does not match the raw data size, expected ", expected_size_in_bytes, ", got ", raw_data_length), OrtErrorCode::ORT_FAIL); - if constexpr(endian::native != endian::little) { + if constexpr (endian::native != endian::little) { ORT_CXX_API_THROW("UnpackTensorWithRawData only handles little-endian native byte order for now.", OrtErrorCode::ORT_NOT_IMPLEMENTED); } @@ -298,7 +298,9 @@ OrtStatus* OrtInitializeBufferForTensor(void* input, size_t input_len, } ORT_API(void, OrtUninitializeBuffer, _In_opt_ void* input, size_t input_len, enum ONNXTensorElementDataType type); - +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 26409) +#endif static void UnInitTensor(void* param) noexcept { UnInitializeParam* p = reinterpret_cast(param); OrtUninitializeBuffer(p->preallocated, p->preallocated_size, p->ele_type); diff --git a/onnxruntime/test/onnx/testcase_request.cc b/onnxruntime/test/onnx/testcase_request.cc index ccb64e7131..bc6429905b 100644 --- a/onnxruntime/test/onnx/testcase_request.cc +++ b/onnxruntime/test/onnx/testcase_request.cc @@ -85,7 +85,7 @@ void TestCaseRequestContext::Request(const Callback& cb, PThreadPool tpool, concurrent_runs = 1; } - std::unique_ptr self(new TestCaseRequestContext(cb, tpool, c, env, session_opts, test_case_id)); + std::unique_ptr self = std::make_unique(cb, tpool, c, env, session_opts, test_case_id); CallableFactory f(self.get()); auto runnable = f.GetCallable<&TestCaseRequestContext::RunAsync>(); onnxruntime::concurrency::ThreadPool::Schedule(tpool, [runnable, concurrent_runs]() { runnable.Invoke(concurrent_runs); }); diff --git a/onnxruntime/test/onnx/testcase_request.h b/onnxruntime/test/onnx/testcase_request.h index 75d02db3d6..4d53a0867d 100644 --- a/onnxruntime/test/onnx/testcase_request.h +++ b/onnxruntime/test/onnx/testcase_request.h @@ -78,11 +78,10 @@ class TestCaseRequestContext { /// The impact is mitigated by the fact that __Ctor is still private. 
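Several of the make_unique conversions force a visibility change as well: std::make_unique constructs the object itself, so unlike a member function calling new it cannot reach a private constructor. That is why the patch removed the "private constructor, can't use make_unique" workaround in inference_session.cc, made OrtEnv's constructor public, and here shifts the private: label below the constructors of DataTaskRequestContext above and TestCaseRequestContext just below; construction stays effectively controlled because callers still go through the static factory methods. A minimal sketch of the shape, with Task and Create as illustrative names:

    #include <memory>

    class Task {
     public:
      explicit Task(int id) : id_(id) {}  // visible so make_unique can call it

      static std::unique_ptr<Task> Create(int id) {
        return std::make_unique<Task>(id);
      }

     private:
      int id_;
    };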
~TestCaseRequestContext() = default; - private: - TestCaseRequestContext(const Callback& cb, PThreadPool tp, const ITestCase& test_case, Ort::Env& env, const Ort::SessionOptions& session_opts, size_t test_case_id); + private: bool SetupSession(); std::shared_ptr GetResult() const { @@ -119,6 +118,5 @@ class TestCaseRequestContext { mutable bool finished_ = false; }; -} +} //namespace test } // namespace onnxruntime - diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index af7f5739f7..abc080a31a 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -1339,7 +1339,7 @@ TEST_F(GraphTransformationTests, GemmTransposeFusionOutput) { ASSERT_TRUE(new_input_defs[1]->Name() == "A"); } -// ((A')'B')' = BA' +// ((A')'B')' = BA' TEST_F(GraphTransformationTests, GemmTransposeFusionInputOutput) { auto model_uri = MODEL_FOLDER "fusion/gemm_transpose_inputs_output_transposed.onnx"; std::shared_ptr p_model; @@ -1368,7 +1368,7 @@ TEST_F(GraphTransformationTests, GemmTransposeFusionInputOutput) { ASSERT_TRUE(new_input_defs[1]->Name() == "A"); } -// (A'(B'))' = BA +// (A'(B'))' = BA TEST_F(GraphTransformationTests, GemmTransposeFusionInputOutput2) { auto model_uri = MODEL_FOLDER "fusion/gemm_transpose_inputs_output_transposed_2.onnx"; std::shared_ptr p_model; @@ -1497,8 +1497,8 @@ TEST_F(GraphTransformationTests, GemmSumFusionInternalNodes) { ASSERT_EQ(op_to_count["Identity"], 4); ASSERT_EQ(graph.NumberOfNodes(), 5); - for(Node &node : graph.Nodes()) { - if(node.OpType() == "Gemm") { + for (Node& node : graph.Nodes()) { + if (node.OpType() == "Gemm") { ASSERT_FALSE(static_cast(node.GetAttributes().at("transA").i())); ASSERT_FALSE(static_cast(node.GetAttributes().at("transB").i())); ASSERT_EQ(node.GetAttributes().at("alpha").f(), 1.0); @@ -4339,8 +4339,8 @@ TEST_F(GraphTransformationTests, ComputationReductionTransformer_GatherND_E2E) { values_unsqueezed_masked_lm_positions); ASSERT_TRUE(expected_ort_values.size() == actual_ort_values.size()); - const double per_sample_tolerance = 1e-4; - const double relative_per_sample_tolerance = 1e-4; + constexpr double per_sample_tolerance = 1e-4; + constexpr double relative_per_sample_tolerance = 1e-4; for (size_t i = 0; i < expected_ort_values.size(); i++) { auto ret = CompareOrtValue(actual_ort_values[i], expected_ort_values[i], per_sample_tolerance, relative_per_sample_tolerance, false); @@ -4411,7 +4411,7 @@ TEST_F(GraphTransformationTests, MatMulScaleFusionFusableModels) { EXPECT_EQ(transformed_op_counts["com.microsoft.FusedMatMul"], 1); // check combined scale, individual scales should all have the same value - const float scale_value = 3.0f; + constexpr float scale_value = 3.0f; const int num_scales = original_op_counts["Mul"] + original_op_counts["Div"] + original_op_counts["com.microsoft.FusedMatMul"]; @@ -4694,7 +4694,7 @@ TEST_F(GraphTransformationTests, PropagateCastOpsTests) { // Create a temporary directory, which will be deleted automatically, to save/load the transformed models. 
TemporaryDirectory temp_dir{ORT_TSTR("propagate_casts_test_output_dir")}; for (PropagateCastOpsTestSpecs test_case : test_cases) { - for (auto scenario : test_case.casts_count_map) { + for (const auto& scenario : test_case.casts_count_map) { Strategy strategy = scenario.first.first; int level = scenario.first.second; int expected_casts_count = scenario.second; diff --git a/onnxruntime/test/perftest/TFModelInfo.cc b/onnxruntime/test/perftest/TFModelInfo.cc index 8b161faf90..82f5359545 100644 --- a/onnxruntime/test/perftest/TFModelInfo.cc +++ b/onnxruntime/test/perftest/TFModelInfo.cc @@ -8,8 +8,7 @@ #include std::unique_ptr TFModelInfo::Create(_In_ const PATH_CHAR_TYPE* model_url) { - auto* model_info = new TFModelInfo{}; - std::unique_ptr ret(model_info); + std::unique_ptr model_info = std::make_unique(); model_info->model_url_ = model_url; std::basic_string meta_file_path = model_url; @@ -50,7 +49,7 @@ std::unique_ptr TFModelInfo::Create(_In_ const PATH_CHAR_TYPE* mo } } - return ret; + return model_info; } int TFModelInfo::GetInputCount() const { return static_cast(input_names_.size()); } diff --git a/onnxruntime/test/perftest/TFModelInfo.h b/onnxruntime/test/perftest/TFModelInfo.h index bf7f871a5a..2ca60010e3 100644 --- a/onnxruntime/test/perftest/TFModelInfo.h +++ b/onnxruntime/test/perftest/TFModelInfo.h @@ -22,9 +22,9 @@ class TFModelInfo : public TestModelInfo { ~TFModelInfo() override = default; static std::unique_ptr Create(_In_ const PATH_CHAR_TYPE* model_url); + TFModelInfo() = default; private: - TFModelInfo() = default; std::basic_string model_url_; std::vector input_names_; std::vector output_names_; diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc index 8536085611..f789ed5745 100644 --- a/onnxruntime/test/perftest/performance_runner.cc +++ b/onnxruntime/test/perftest/performance_runner.cc @@ -35,7 +35,7 @@ static std::once_flag default_pool_init; Eigen::ThreadPoolInterface* GetDefaultThreadPool(const onnxruntime::Env& env) { std::call_once(default_pool_init, [&env] { int core_num = env.GetNumCpuCores(); - default_pool.reset(new DefaultThreadPoolType(core_num)); + default_pool = std::make_unique(core_num); }); return default_pool.get(); } @@ -258,8 +258,7 @@ static std::unique_ptr CreateSession(Ort::Env& env, std::random_dev const PerformanceTestConfig& performance_test_config_, const TestModelInfo& test_model_info) { if (CompareCString(performance_test_config_.backend.c_str(), ORT_TSTR("ort")) == 0) { - return std::unique_ptr( - new OnnxRuntimeTestSession(env, rd, performance_test_config_, test_model_info)); + return std::make_unique(env, rd, performance_test_config_, test_model_info); } #ifdef HAVE_TENSORFLOW if (CompareCString(performance_test_config_.backend.c_str(), ORT_TSTR("tf")) == 0) { diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index fb6ae200ab..a530bbb8c8 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -105,17 +105,19 @@ TEST_F(ActivationOpTest, Relu) { TestActivationOp("Relu", input_values, [](float x) { return std::max(x, 0.0f); }); - TestActivationOp("Relu", - input_values_double, - [](double x) { return std::max(x, 0.0); }, - {}, - /*is_tensorrt_supported=*/ false); - TestActivationOp("Relu", - input_values_int8, - [](int8_t x) { return std::max(x, static_cast(0)); }, - {}, - /*is_tensorrt_supported=*/ 
false, - /*opset_version= */ 14); + TestActivationOp( + "Relu", + input_values_double, + [](double x) { return std::max(x, 0.0); }, + {}, + /*is_tensorrt_supported=*/false); + TestActivationOp( + "Relu", + input_values_int8, + [](int8_t x) { return std::max(x, static_cast(0)); }, + {}, + /*is_tensorrt_supported=*/false, + /*opset_version= */ 14); } TEST_F(ActivationOpTest, Elu) { @@ -217,9 +219,9 @@ TEST_F(ActivationOpTest, PRelu_MultiChannel) { std::vector inputs{1.0f, 2.0f, -4.0f, 3.0f, 0.0f, 5.0f, -9.0f, 8.0f}; std::vector slopes{1.0f, -2.0f}; std::vector outputs; - const int64_t num_images = 2; - const int64_t num_channels = 2; - const int64_t num_pixels = 2; + constexpr int64_t num_images = 2; + constexpr int64_t num_channels = 2; + constexpr int64_t num_pixels = 2; for (unsigned i = 0; i < inputs.size(); i++) outputs.push_back(formula(inputs[i], slopes[i / num_pixels % num_channels])); diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc index e1be1e28f9..a59e19f2f1 100644 --- a/onnxruntime/test/providers/cpu/generator/random_test.cc +++ b/onnxruntime/test/providers/cpu/generator/random_test.cc @@ -15,9 +15,9 @@ TEST(Random, RandomNormal2DDouble) { std::vector dims{20, 50}; - float scale = 10.f; - float mean = 0.f; - float seed = 123.f; + constexpr float scale = 10.f; + constexpr float mean = 0.f; + constexpr float seed = 123.f; test.AddAttribute("scale", scale); test.AddAttribute("mean", mean); @@ -44,9 +44,9 @@ void RunRandomNormalLike3DFloat(bool infer_dtype = false) { std::vector dims{2, 2, 3}; - float scale = 10.f; - float mean = 0.f; - float seed = 123.f; + constexpr float scale = 10.f; + constexpr float mean = 0.f; + constexpr float seed = 123.f; test.AddAttribute("scale", scale); test.AddAttribute("mean", mean); @@ -79,7 +79,7 @@ TEST(Random, RandomNormalLike3DDouble) { } TEST(Random, RandomNormalLikeInferDType) { - const bool infer_dtype = true; + constexpr bool infer_dtype = true; RunRandomNormalLike3DFloat(infer_dtype); } @@ -88,9 +88,9 @@ TEST(Random, RandomUniform1DFloat) { std::vector dims{10}; - float low = 0.f; - float high = 100.f; - float seed = 123.f; + constexpr float low = 0.f; + constexpr float high = 100.f; + constexpr float seed = 123.f; test.AddAttribute("low", low); test.AddAttribute("high", high); @@ -116,9 +116,9 @@ void RunRandomUniformLikeTest(bool infer_dtype = false) { std::vector dims{2, 6}; - float low = 0.f; - float high = 100.f; - float seed = 123.f; + constexpr float low = 0.f; + constexpr float high = 100.f; + constexpr float seed = 123.f; test.AddAttribute("low", low); test.AddAttribute("high", high); @@ -149,12 +149,12 @@ TEST(Random, RandomUniformLike2DDouble) { } TEST(Random, RandomUniformLikeInferDType) { - const bool infer_dtype = true; + constexpr bool infer_dtype = true; RunRandomUniformLikeTest(infer_dtype); } TEST(Random, InvalidDType) { - float seed = 123.f; + constexpr float seed = 123.f; std::vector dims{1, 4}; std::vector input{0, 0, 0, 0}; @@ -179,8 +179,8 @@ TEST(Random, InvalidDType) { { OpTester test("RandomUniform"); - float low = 0.f; - float high = 100.f; + constexpr float low = 0.f; + constexpr float high = 100.f; test.AddAttribute("low", low); test.AddAttribute("high", high); @@ -195,8 +195,8 @@ TEST(Random, InvalidDType) { { OpTester test("RandomNormalLike"); - float scale = 10.f; - float mean = 0.f; + constexpr float scale = 10.f; + constexpr float mean = 0.f; test.AddAttribute("scale", scale); test.AddAttribute("mean", mean); @@ -211,8 +211,8 @@ 
TEST(Random, InvalidDType) { { OpTester test("RandomUniformLike"); - float low = 0.f; - float high = 100.f; + constexpr float low = 0.f; + constexpr float high = 100.f; test.AddAttribute("low", low); test.AddAttribute("high", high); @@ -234,10 +234,10 @@ for verification. TEST(Random, MultinomialGoodCase) { OpTester test("Multinomial"); - const int64_t num_samples = 10; - const float seed = 1618.f; - const int batch_size = 2; - const int num_classes = 3; + constexpr int64_t num_samples = 10; + constexpr float seed = 1618.f; + constexpr int batch_size = 2; + constexpr int num_classes = 3; const std::vector input_dims{batch_size, num_classes}; std::vector input(TensorShape(input_dims).Size()); @@ -264,9 +264,9 @@ TEST(Random, MultinomialGoodCase) { TEST(Random, MultinomialDefaultDType) { auto run_test = [](int num_run_calls, const std::vector& expected_output) { OpTester test("Multinomial"); - const int64_t num_samples = 10; - const int batch_size = 2; - const float seed = 1618.f; + constexpr int64_t num_samples = 10; + constexpr int batch_size = 2; + constexpr float seed = 1618.f; const std::vector input_dims{2, 3}; std::vector input(TensorShape(input_dims).Size()); @@ -307,10 +307,10 @@ TEST(Random, MultinomialDefaultDType) { TEST(Random, MultinomialInvalidDtype) { OpTester test("Multinomial"); - const int64_t num_samples = 10; - const int batch_size = 2; - const int num_classes = 3; - const float seed = 1618.f; + constexpr int64_t num_samples = 10; + constexpr int batch_size = 2; + constexpr int num_classes = 3; + constexpr float seed = 1618.f; const std::vector input_dims{batch_size, num_classes}; std::vector input(TensorShape(input_dims).Size()); diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index 68c764813f..841e075855 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -1572,7 +1572,6 @@ TEST(MathOpTest, Xor) { test.Run(); } - TEST(MathOpTest, Xor_Issue8880) { OpTester test("Xor"); test.AddInput("A", {1}, {true}); @@ -2247,7 +2246,7 @@ TEST(MathOpTest, ErfMoreData) { test.Run(); } -const int ModOp_ver = 10; +constexpr int ModOp_ver = 10; TEST(ModOpTest, Fmod_float_mixed_sign) { OpTester test("Mod", ModOp_ver); diff --git a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc index aa5e103874..0cb078b456 100644 --- a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc @@ -415,8 +415,8 @@ TEST(QuantizeLinearMatmulOpTest, PerColumn_ND_S8S8) { } /** - * @brief Extend QLinearMatMul for verifying prepacking behavior -*/ + * @brief Extend QLinearMatMul for verifying prepacking behavior + */ struct PrePackTestOp { // TODO!! use template and macro to extract a common utility out of this // for grey box kernel testing by extending kernel classes. 
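The long run of test edits above and below applies one rule: a local constant whose initializer is known at compile time becomes constexpr instead of const (Core Guidelines con.5). constexpr guarantees compile-time evaluation and documents that the value can never depend on run-time inputs, while const remains the right tool when the initializer is only available at run time. Sketch, with ReadAxisFromConfig as a hypothetical run-time source:

    #include <cstdint>

    int64_t ReadAxisFromConfig() { return 1; }  // stub standing in for a run-time value

    void Example() {
      constexpr int64_t kAxis = 0;                    // compile-time constant: constexpr
      const int64_t cfg_axis = ReadAxisFromConfig();  // run-time value: const is the limit
      static_cast<void>(kAxis);
      static_cast<void>(cfg_axis);
    }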
@@ -487,7 +487,7 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMulPrePack) { std::vector schemas{PrePackTestOp::OpSchema()}; Status status; ASSERT_TRUE((status = registry->RegisterOpSet(schemas, PrePackTestOp::OpDomain, 10, 11)).IsOK()) << status; - KernelCreateFn kernel_create_fn = [](const OpKernelInfo& info) { return new typename PrePackTestOp::QLinearMatMulPrePackT(info); }; + KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) { out = std::make_unique(info); return Status::OK(); }; auto kernel_def = PrePackTestOp::KernelDef(); ASSERT_TRUE((status = registry->RegisterCustomKernel(kernel_def, kernel_create_fn)).IsOK()) << status; diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 55f516bed7..12e98e41f5 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -13,10 +13,10 @@ namespace onnxruntime { namespace test { -const float FLOAT_INF = std::numeric_limits::infinity(); -const float FLOAT_NINF = -std::numeric_limits::infinity(); -const double DOUBLE_INF = std::numeric_limits::infinity(); -const double DOUBLE_NINF = -std::numeric_limits::infinity(); +constexpr float FLOAT_INF = std::numeric_limits::infinity(); +constexpr float FLOAT_NINF = -std::numeric_limits::infinity(); +constexpr double DOUBLE_INF = std::numeric_limits::infinity(); +constexpr double DOUBLE_NINF = -std::numeric_limits::infinity(); // Disable TensorRT on some of the tests because the limit in its parser: axis >=0 && axis < nbDims template @@ -778,7 +778,7 @@ TEST(ReductionOpTest, ReduceMax_int32) { #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 #endif } @@ -799,7 +799,7 @@ TEST(ReductionOpTest, ReduceMax_int64) { #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 #endif } @@ -820,7 +820,7 @@ TEST(ReductionOpTest, ReduceMax_int8) { #if defined(OPENVINO_CONFIG_MYRIAD) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 #endif } @@ -841,7 +841,7 @@ TEST(ReductionOpTest, ReduceMax_uint8) { #if defined(OPENVINO_CONFIG_MYRIAD) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 #endif } @@ -1049,7 +1049,7 @@ TEST(ReductionOpTest, ReduceMean) { 9.0f, 10.0f, 11.0f, 12.0f}); test.AddOutput("reduced", {1, 2, 1}, {5.5f, 7.5f}); - + test.Run(); } diff --git a/onnxruntime/test/providers/cpu/tensor/copy_test.cc b/onnxruntime/test/providers/cpu/tensor/copy_test.cc index 91c496a8ed..f8a72a5c8f 100644 --- a/onnxruntime/test/providers/cpu/tensor/copy_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/copy_test.cc @@ -51,17 +51,16 @@ TEST_F(CopyTest, Contiguous3D) { TEST_F(CopyTest, Transpose4D) { // Test performing a transpose using a strided copy - int64_t numel = 2 * 3 * 4 * 5; - double* src = new double[numel]; + constexpr int64_t numel = 2 * 3 * 4 * 5; + std::unique_ptr src = std::make_unique(numel); for (int i = 0; i < numel; i++) { src[i] = static_cast(i); } - - double* dst = new double[numel]; + std::unique_ptr dst = std::make_unique(numel); std::vector dst_strides = {60, 5, 15, 1}; std::vector src_strides = {60, 20, 5, 1}; - StridedCopy(tp.get(), dst, dst_strides, {2, 3, 4, 5}, src, src_strides); + StridedCopy(tp.get(), dst.get(), dst_strides, {2, 3, 4, 5}, src.get(), src_strides); // stride to access the dst tensor as if it were contiguous std::vector contig_dst_strides = {60, 15, 5, 1}; @@ -78,18 +77,15 @@ TEST_F(CopyTest, Transpose4D) { } } } - delete[] src; - delete[] dst; } TEST_F(CopyTest, Concat2D) { // test performing a concat using a strided copy - double* src = new double[6 * 2]; + std::unique_ptr src = std::make_unique(6 * 2); for (int i = 0; i < 6 * 2; i++) { src[i] = static_cast(i); } - - double* dst = new double[10 * 5]; + std::unique_ptr dst = std::make_unique(10 * 5); for (int i = 0; i < 10 * 5; i++) { dst[i] = 0; } @@ -97,7 +93,7 @@ TEST_F(CopyTest, Concat2D) { std::vector dst_strides = {5, 1}; std::vector src_strides = {2, 1}; std::ptrdiff_t offset = 3; - StridedCopy(tp.get(), dst + offset, dst_strides, {6, 2}, src, src_strides); + StridedCopy(tp.get(), dst.get() + offset, dst_strides, {6, 2}, src.get(), src_strides); for (int i0 = 0; i0 < 10; i0++) { for (int i1 = 0; i1 < 5; i1++) { @@ -110,8 +106,6 @@ TEST_F(CopyTest, Concat2D) { } } } - delete[] src; - delete[] dst; } TEST_F(CopyTest, CoalesceTensorsTest) { diff --git a/onnxruntime/test/providers/cpu/tensor/split_op_test.cc b/onnxruntime/test/providers/cpu/tensor/split_op_test.cc index b2dc3a44b3..c3b5b25ee7 100644 --- a/onnxruntime/test/providers/cpu/tensor/split_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/split_op_test.cc @@ -50,7 +50,7 @@ void RunTest(int64_t axis, const std::vector split_sizes, const ShapeAn } TEST(SplitOperatorTest, Axis0EqualSplitFloat) { - const int64_t axis = 0; + constexpr int64_t axis = 0; std::vector outputs; // input shape and data @@ -73,7 +73,7 @@ TEST(SplitOperatorTest, Axis0EqualSplitFloat) { template ::value, T>::type> static void SplitTestInt() { - const int64_t axis = 0; + constexpr int64_t axis = 0; std::vector> outputs; // input shape and data @@ -107,7 +107,7 @@ TEST(SplitOperatorTest, Axis0EqualSplitInt64) { } TEST(SplitOperatorTest, Axis0EqualSplitString) { - const int64_t axis = 0; + constexpr int64_t axis = 0; std::vector outputs; // input shape and data @@ -129,7 +129,7 @@ TEST(SplitOperatorTest, Axis0EqualSplitString) { } TEST(SplitOperatorTest, Axis0UnequalSplitFloat) { - const int64_t axis = 0; + constexpr int64_t axis 
= 0; std::vector outputs; // input shape and data @@ -152,7 +152,7 @@ TEST(SplitOperatorTest, Axis0UnequalSplitFloat) { } TEST(SplitOperatorTest, Axis0UnequalSplitString) { - const int64_t axis = 0; + constexpr int64_t axis = 0; std::vector outputs; // input shape and data @@ -175,7 +175,7 @@ TEST(SplitOperatorTest, Axis0UnequalSplitString) { } TEST(SplitOperatorTest, Axis1EqualSplitFloat) { - const int64_t axis = 1; + constexpr int64_t axis = 1; std::vector outputs; // input shape and data @@ -195,7 +195,7 @@ TEST(SplitOperatorTest, Axis1EqualSplitFloat) { } TEST(SplitOperatorTest, Axis1EqualSplitString) { - const int64_t axis = 1; + constexpr int64_t axis = 1; std::vector outputs; // input shape and data @@ -215,7 +215,7 @@ TEST(SplitOperatorTest, Axis1EqualSplitString) { } TEST(SplitOperatorTest, Axis1UnequalSplitFloat) { - const int64_t axis = 1; + constexpr int64_t axis = 1; std::vector outputs; // input shape and data @@ -237,7 +237,7 @@ TEST(SplitOperatorTest, Axis1UnequalSplitFloat) { } TEST(SplitOperatorTest, Axis1UnequalSplitString) { - const int64_t axis = 1; + constexpr int64_t axis = 1; std::vector outputs; // input shape and data @@ -271,7 +271,7 @@ ShapeAndData CreateInput(std::vector shape) { } TEST(SplitOperatorTest, Axis2EqualSplit) { - const int64_t axis = 2; + constexpr int64_t axis = 2; std::vector outputs; ShapeAndFloatData input = CreateInput({2, 2, 6}); @@ -301,7 +301,7 @@ TEST(SplitOperatorTest, Axis2EqualSplit) { } TEST(SplitOperatorTest, Axis2UnequalSplit) { - const int64_t axis = 2; + constexpr int64_t axis = 2; std::vector outputs; ShapeAndFloatData input = CreateInput({2, 2, 6}); @@ -333,7 +333,7 @@ TEST(SplitOperatorTest, Axis2UnequalSplit) { } TEST(SplitOperatorTest, ZeroSizeInput) { - const int64_t axis = -1; + constexpr int64_t axis = -1; std::vector outputs{{{0, 1}, {}}, {{0, 1}, {}}}; ShapeAndFloatData input = CreateInput({0, 2}); @@ -343,7 +343,7 @@ TEST(SplitOperatorTest, ZeroSizeInput) { // test a split of a dimension that has leading and trailing dimensions TEST(SplitOperatorTest, Axis1SplitMiddleDimensionEqually) { - const int64_t axis = 1; + constexpr int64_t axis = 1; std::vector outputs; ShapeAndFloatData input = CreateInput({2, 4, 4}); @@ -367,7 +367,7 @@ TEST(SplitOperatorTest, Axis1SplitMiddleDimensionEqually) { // test a split of a dimension that has leading and trailing dimensions TEST(SplitOperatorTest, Axis1SplitMiddleDimensionUnequally) { - const int64_t axis = 1; + constexpr int64_t axis = 1; std::vector outputs; ShapeAndFloatData input = CreateInput({2, 4, 4}); @@ -392,7 +392,7 @@ TEST(SplitOperatorTest, Axis1SplitMiddleDimensionUnequally) { } TEST(SplitOperatorTest, NegativeAxis) { - const int64_t axis = -1; // split last axis equally + constexpr int64_t axis = -1; // split last axis equally std::vector outputs; // input shape and data @@ -412,7 +412,7 @@ TEST(SplitOperatorTest, NegativeAxis) { } TEST(SplitOperatorTest, InvalidAxis) { - const int64_t axis = 2; + constexpr int64_t axis = 2; std::vector outputs; // input shape and data @@ -429,7 +429,7 @@ TEST(SplitOperatorTest, InvalidAxis) { // sum of values in splits is too small TEST(SplitOperatorTest, SplitAttributeSumTooSmall) { - const int64_t axis = 0; + constexpr int64_t axis = 0; std::vector outputs; // input shape and data @@ -448,7 +448,7 @@ TEST(SplitOperatorTest, SplitAttributeSumTooSmall) { } TEST(SplitOperatorTest, InvalidValueInSplitAttribute) { - const int64_t axis = -1; + constexpr int64_t axis = -1; std::vector outputs; // input shape and data @@ -467,7 +467,7 @@ 
diff --git a/onnxruntime/test/providers/cpu/tensor/split_op_test.cc b/onnxruntime/test/providers/cpu/tensor/split_op_test.cc
index b2dc3a44b3..c3b5b25ee7 100644
--- a/onnxruntime/test/providers/cpu/tensor/split_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/split_op_test.cc
@@ -50,7 +50,7 @@ void RunTest(int64_t axis, const std::vector<int64_t> split_sizes, const ShapeAn
 }
 
 TEST(SplitOperatorTest, Axis0EqualSplitFloat) {
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -73,7 +73,7 @@ TEST(SplitOperatorTest, Axis0EqualSplitFloat) {
 
 template <typename T, typename std::enable_if<std::is_integral<T>::value, T>::type>
 static void SplitTestInt() {
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   std::vector<ShapeAndData<T>> outputs;
 
   // input shape and data
@@ -107,7 +107,7 @@ TEST(SplitOperatorTest, Axis0EqualSplitInt64) {
 }
 
 TEST(SplitOperatorTest, Axis0EqualSplitString) {
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   std::vector<ShapeAndStringData> outputs;
 
   // input shape and data
@@ -129,7 +129,7 @@ TEST(SplitOperatorTest, Axis0EqualSplitString) {
 }
 
 TEST(SplitOperatorTest, Axis0UnequalSplitFloat) {
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -152,7 +152,7 @@ TEST(SplitOperatorTest, Axis0UnequalSplitFloat) {
 }
 
 TEST(SplitOperatorTest, Axis0UnequalSplitString) {
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   std::vector<ShapeAndStringData> outputs;
 
   // input shape and data
@@ -175,7 +175,7 @@ TEST(SplitOperatorTest, Axis0UnequalSplitString) {
 }
 
 TEST(SplitOperatorTest, Axis1EqualSplitFloat) {
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -195,7 +195,7 @@ TEST(SplitOperatorTest, Axis1EqualSplitFloat) {
 }
 
 TEST(SplitOperatorTest, Axis1EqualSplitString) {
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   std::vector<ShapeAndStringData> outputs;
 
   // input shape and data
@@ -215,7 +215,7 @@ TEST(SplitOperatorTest, Axis1EqualSplitString) {
 }
 
 TEST(SplitOperatorTest, Axis1UnequalSplitFloat) {
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -237,7 +237,7 @@ TEST(SplitOperatorTest, Axis1UnequalSplitFloat) {
 }
 
 TEST(SplitOperatorTest, Axis1UnequalSplitString) {
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   std::vector<ShapeAndStringData> outputs;
 
   // input shape and data
@@ -271,7 +271,7 @@ ShapeAndData<T> CreateInput(std::vector<int64_t> shape) {
 }
 
 TEST(SplitOperatorTest, Axis2EqualSplit) {
-  const int64_t axis = 2;
+  constexpr int64_t axis = 2;
   std::vector<ShapeAndFloatData> outputs;
 
   ShapeAndFloatData input = CreateInput<float>({2, 2, 6});
@@ -301,7 +301,7 @@ TEST(SplitOperatorTest, Axis2EqualSplit) {
 }
 
 TEST(SplitOperatorTest, Axis2UnequalSplit) {
-  const int64_t axis = 2;
+  constexpr int64_t axis = 2;
   std::vector<ShapeAndFloatData> outputs;
 
   ShapeAndFloatData input = CreateInput<float>({2, 2, 6});
@@ -333,7 +333,7 @@ TEST(SplitOperatorTest, Axis2UnequalSplit) {
 }
 
 TEST(SplitOperatorTest, ZeroSizeInput) {
-  const int64_t axis = -1;
+  constexpr int64_t axis = -1;
   std::vector<ShapeAndFloatData> outputs{{{0, 1}, {}}, {{0, 1}, {}}};
 
   ShapeAndFloatData input = CreateInput<float>({0, 2});
@@ -343,7 +343,7 @@ TEST(SplitOperatorTest, ZeroSizeInput) {
 
 // test a split of a dimension that has leading and trailing dimensions
 TEST(SplitOperatorTest, Axis1SplitMiddleDimensionEqually) {
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   std::vector<ShapeAndFloatData> outputs;
 
   ShapeAndFloatData input = CreateInput<float>({2, 4, 4});
@@ -367,7 +367,7 @@ TEST(SplitOperatorTest, Axis1SplitMiddleDimensionEqually) {
 
 // test a split of a dimension that has leading and trailing dimensions
 TEST(SplitOperatorTest, Axis1SplitMiddleDimensionUnequally) {
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   std::vector<ShapeAndFloatData> outputs;
 
   ShapeAndFloatData input = CreateInput<float>({2, 4, 4});
@@ -392,7 +392,7 @@ TEST(SplitOperatorTest, Axis1SplitMiddleDimensionUnequally) {
 }
 
 TEST(SplitOperatorTest, NegativeAxis) {
-  const int64_t axis = -1;  // split last axis equally
+  constexpr int64_t axis = -1;  // split last axis equally
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -412,7 +412,7 @@ TEST(SplitOperatorTest, NegativeAxis) {
 }
 
 TEST(SplitOperatorTest, InvalidAxis) {
-  const int64_t axis = 2;
+  constexpr int64_t axis = 2;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -429,7 +429,7 @@ TEST(SplitOperatorTest, InvalidAxis) {
 
 // sum of values in splits is too small
 TEST(SplitOperatorTest, SplitAttributeSumTooSmall) {
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -448,7 +448,7 @@ TEST(SplitOperatorTest, SplitAttributeSumTooSmall) {
 }
 
 TEST(SplitOperatorTest, InvalidValueInSplitAttribute) {
-  const int64_t axis = -1;
+  constexpr int64_t axis = -1;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -467,7 +467,7 @@ TEST(SplitOperatorTest, InvalidValueInSplitAttribute) {
 
 // split as input
 TEST(SplitOperatorTest, Axis0UnequalSplitInputFloat) {
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -491,7 +491,7 @@ TEST(SplitOperatorTest, Axis0UnequalSplitInputFloat) {
 
 // split as input
 TEST(SplitOperatorTest, Axis0UnequalSplitInputFloat_not_initializer) {
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
@@ -576,7 +576,7 @@ SplitMiddleDimension()
 
 // test split for uint8_t data that has leading and trailing dimensions
 TEST(SplitOperatorTest, Uint8Axis1SplitMiddleDimensionUnequally) {
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   std::vector<ShapeAndData<uint8_t>> outputs;
 
   ShapeAndData<uint8_t> input = CreateInput<uint8_t>({2, 4, 4});
@@ -602,7 +602,7 @@ TEST(SplitOperatorTest, Uint8Axis1SplitMiddleDimensionUnequally) {
 
 // test split for uint8_t data on the last axis equally
 TEST(SplitOperatorTest, Uint8NegativeAxis) {
-  const int64_t axis = -1;
+  constexpr int64_t axis = -1;
   std::vector<ShapeAndData<uint8_t>> outputs;
 
   ShapeAndData<uint8_t> input = {{2, 4},
@@ -621,7 +621,7 @@ TEST(SplitOperatorTest, Uint8NegativeAxis) {
 }
 
 TEST(SplitOperatorTest, MissingOptionalInputAdded) {
-  const int64_t axis = 1;  // split last axis equally
+  constexpr int64_t axis = 1;  // split last axis equally
   std::vector<ShapeAndFloatData> outputs;
 
   // input shape and data
diff --git a/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc b/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc
index 1a7fada780..f2eeb06640 100644
--- a/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc
@@ -146,7 +146,7 @@ TEST(TensorOpTest, ShapeTest3D) {
 }
 
 void MeanVarianceNormalizationFunctionDefaultPerChannel() {
-  const int64_t N = 2, C = 2, H = 2, W = 3;
+  constexpr int64_t N = 2, C = 2, H = 2, W = 3;
 
   std::vector<float> N1C1 = {3.0f, -3.0f, -1.0f, 1.0f, 2.0f, -1.0f};
@@ -209,7 +209,7 @@ void MeanVarianceNormalizationFunctionDefaultPerChannel() {
 }
 
 void MeanVarianceNormalizationFunctionAcrossChannels(std::vector<int64_t> axes) {
-  const int64_t N = 2, C = 2, H = 2, W = 3;
+  constexpr int64_t N = 2, C = 2, H = 2, W = 3;
 
   std::vector<float> X = {3.0f, -3.0f, -1.0f, 1.0f,
                           2.0f, -1.0f,
diff --git a/onnxruntime/test/providers/cpu/tensor/unique_op_test.cc b/onnxruntime/test/providers/cpu/tensor/unique_op_test.cc
index a8e43fdc53..c539e59b5d 100644
--- a/onnxruntime/test/providers/cpu/tensor/unique_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/unique_op_test.cc
@@ -142,7 +142,7 @@ TEST(Unique, Axis0_Unsorted) {
                              0.f, 1.f,
                              1.f, 0.f};
 
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   bool sorted = false;
   const std::vector<int64_t> Y_dims{3, 2};
   const std::vector<float> Y{0.f, 1.f,
@@ -167,7 +167,7 @@ TEST(Unique, Axis0_Sorted) {
                              0.f, 1.f,
                              1.f, 0.f};
 
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   bool sorted = true;
   const std::vector<int64_t> Y_dims{3, 2};
   const std::vector<float> Y{0.f, 1.f,
@@ -192,7 +192,7 @@ TEST(Unique, Axis0_Unsorted_String) {
                                    "0.f", "1.f",
                                    "1.f", "0.f"};
 
-  const int64_t axis = 0;
+  constexpr int64_t axis = 0;
   bool sorted = false;
   const std::vector<int64_t> Y_dims{3, 2};
   const std::vector<std::string> Y{"0.f", "1.f",
@@ -222,7 +222,7 @@ TEST(Unique, Axis1_Unsorted) {
                                2, 1,
                                0, 1};
 
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   bool sorted = false;
   const std::vector<int64_t> Y_dims{2, 3, 2};
   const std::vector<int64_t> Y{1, 1,
@@ -256,7 +256,7 @@ TEST(Unique, Axis1_Sorted) {
                                2, 1,
                                0, 1};
 
-  const int64_t axis = 1;
+  constexpr int64_t axis = 1;
   bool sorted = true;
   const std::vector<int64_t> Y_dims{2, 3, 2};
   const std::vector<int64_t> Y{0, 1,
@@ -286,7 +286,7 @@ TEST(Unique, Axis2_Unsorted) {
                                1, 1, 0, 1,
                                2, 1, 0, 1};
 
-  const int64_t axis = 2;
+  constexpr int64_t axis = 2;
   bool sorted = false;
   const std::vector<int64_t> Y_dims{2, 2, 3};
   const std::vector<int64_t> Y{1, 1, 0,
@@ -314,7 +314,7 @@ TEST(Unique, Axis2_Sorted) {
                                1, 1, 0, 1,
                                2, 1, 0, 1};
 
-  const int64_t axis = 2;
+  constexpr int64_t axis = 2;
   bool sorted = true;
   const std::vector<int64_t> Y_dims{2, 2, 3};
   const std::vector<int64_t> Y{0, 1, 1,
@@ -335,7 +335,7 @@ TEST(Unique, Axis2_Sorted) {
 }
 
 TEST(Unique, InvalidAxis) {
-  const int64_t axis = 12;
+  constexpr int64_t axis = 12;
   const std::vector<int64_t> X_dims{2, 3};
   const std::vector<float> X{1.f, 4.f, 1.f, 2.f, 2.f, 0.f};
   const std::vector<int64_t> Y_dims{};
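The recurring change through split_op_test.cc, tensor_op_test.cc, and unique_op_test.cc is const -> constexpr for values that are known at compile time, which is what analysis rule C26814 ("use constexpr for values that can be computed at compile time") recommends. A small illustrative sketch of the distinction; this is not patch code:

#include <cstdint>

inline int64_t AxisFromConfig() { return 1; }  // hypothetical runtime source of a value

void Example() {
  const int64_t a = AxisFromConfig();  // legal: const, but initialized at run time
  constexpr int64_t axis = 1;          // guaranteed compile-time constant
  static_assert(axis == 1, "constexpr values are usable in constant expressions");
  (void)a;
}

Because every test axis is a literal, the change is purely a strengthening of intent: the compiler now rejects any future edit that would make the value runtime-dependent.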
diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc
index d59ec0f4bd..0aacbf14ce 100644
--- a/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc
+++ b/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc
@@ -49,8 +49,8 @@ TEST(InternalTestingEP, TestSortResultsInSinglePartition) {
 
   ASSERT_STATUS_OK(session->Initialize());
 
-  const auto& func_mgr = session->GetSessionState().GetFuncMgr();
-  NodeComputeInfo* compute_func = nullptr;
+  auto& func_mgr = const_cast<SessionState&>(session->GetSessionState()).GetMutableFuncMgr();
+  const NodeComputeInfo* compute_func = nullptr;
 
   int num_partitions{0}, num_other_nodes{0};
 
@@ -94,8 +94,8 @@ TEST(InternalTestingEP, TestDependenciesCorrectlyHandled) {
   ASSERT_STATUS_OK(session->Initialize());
 
   // this should fail if we don't process dependencies correctly
-  const auto& func_mgr = session->GetSessionState().GetFuncMgr();
-  NodeComputeInfo* compute_func = nullptr;
+  auto& func_mgr = const_cast<SessionState&>(session->GetSessionState()).GetMutableFuncMgr();
+  const NodeComputeInfo* compute_func = nullptr;
 
   int num_partitions{0};
   int num_other_nodes{0};
@@ -235,8 +235,8 @@ static void TestNnapiPartitioning(const std::string& test_name, const std::strin
     unsupported_op_str = oss.str();
   }
 
-  const auto& func_mgr = session->GetSessionState().GetFuncMgr();
-  NodeComputeInfo* compute_func = nullptr;
+  auto& func_mgr = const_cast<SessionState&>(session->GetSessionState()).GetMutableFuncMgr();
+  const NodeComputeInfo* compute_func = nullptr;
 
   stats.num_nodes_not_handled = 0;
   stats.num_compiled_nodes = 0;
diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
index cdd2eb6445..f456809f0b 100644
--- a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
+++ b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
@@ -174,8 +174,8 @@ TEST(InternalTestingEP, TestLoadOrtModelWithReducedOpCoverage) {
   // Conv+Add gets fused by level 1 optimizer into single node. The 'Conv'/'Add'/'Relu' nodes should be compiled and
   // handled by the custom EP. fallback to CPU for MaxPool.
   ASSERT_EQ(graph.NumberOfNodes(), 6);
 
-  const auto& func_mgr = session->GetSessionState().GetFuncMgr();
-  NodeComputeInfo* compute_func = nullptr;
+  auto& func_mgr = const_cast<SessionState&>(session->GetSessionState()).GetMutableFuncMgr();
+  const NodeComputeInfo* compute_func = nullptr;
 
   // the generated op type should have a hash for the model based on the model path
   const std::string expected_op_type_prefix = "InternalTestingEP_9611636968429821767_";
@@ -197,14 +197,14 @@ TEST(InternalTestingEP, TestLoadOrtModelWithReducedOpCoverage) {
 
 // count nodes assigned to the test EP and make sure they all have valid compute funcs
 static int CountAndValidateAssignedNodes(const Graph& current_graph,
                                          const std::unordered_set<std::string>& supported_ops,
-                                         const FuncManager& func_mgr) {
+                                         FuncManager& func_mgr) {
   int count = 0;
 
   for (const auto& node : current_graph.Nodes()) {
     EXPECT_EQ(supported_ops.count(node.OpType()), size_t(0))
         << "Nodes with supported op types should have been replaced. Node with type " << node.OpType() << " was not.";
     if (node.GetExecutionProviderType() == utils::kInternalTestingExecutionProvider) {
-      NodeComputeInfo* compute_func = nullptr;
+      const NodeComputeInfo* compute_func = nullptr;
       EXPECT_STATUS_OK(func_mgr.GetFuncs(node.Name(), compute_func));
       EXPECT_NE(compute_func, nullptr);
       ++count;
@@ -232,7 +232,7 @@ TEST(InternalTestingEP, TestModelWithSubgraph) {
   CreateSession(SessionOptions{}, session, ort_model_path, enable_custom_ep, &supported_ops);
 
   const auto& graph = session->GetGraph();
-  const auto& func_mgr = session->GetSessionState().GetFuncMgr();
+  auto& func_mgr = const_cast<SessionState&>(session->GetSessionState()).GetMutableFuncMgr();
 
   int num_replaced_nodes = CountAndValidateAssignedNodes(graph, supported_ops, func_mgr);
 
@@ -317,7 +317,7 @@ TEST(InternalTestingEP, TestOrtModelWithCompileFailure) {
     ASSERT_STATUS_OK(session.Initialize());
 
     int num_replaced_nodes = CountAndValidateAssignedNodes(
-        session.GetGraph(), supported_ops, session.GetSessionState().GetFuncMgr());
+        session.GetGraph(), supported_ops, const_cast<SessionState&>(session.GetSessionState()).GetMutableFuncMgr());
     ASSERT_EQ(num_replaced_nodes, 3);
   }
 
@@ -335,7 +335,7 @@ TEST(InternalTestingEP, TestOrtModelWithCompileFailure) {
     // 2 Conv nodes should be replaced with fused nodes
     const auto& graph = session.GetGraph();
     int num_replaced_nodes = CountAndValidateAssignedNodes(
-        session.GetGraph(), {"Conv"}, session.GetSessionState().GetFuncMgr());
+        session.GetGraph(), {"Conv"}, const_cast<SessionState&>(session.GetSessionState()).GetMutableFuncMgr());
 
     ASSERT_EQ(num_replaced_nodes, 2);
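The internal_testing test changes above drop the const GetFuncMgr() accessor in favor of GetMutableFuncMgr(), casting away constness at the call sites because the session only exposes a const SessionState. The sketch below shows the shape of that pattern; the class and method signatures are simplified assumptions for illustration, not the real onnxruntime declarations:

#include <string>

struct NodeComputeInfoLike {};  // stand-in for onnxruntime::NodeComputeInfo

class FuncManagerLike {  // stand-in for onnxruntime::FuncManager
 public:
  // Hands back a pointer to internal state: the caller receives a
  // pointer-to-const, and the manager itself must be non-const.
  void GetFuncs(const std::string& /*name*/, const NodeComputeInfoLike*& out) {
    out = &info_;
  }

 private:
  NodeComputeInfoLike info_;
};

class SessionStateLike {  // stand-in for onnxruntime::SessionState
 public:
  FuncManagerLike& GetMutableFuncMgr() { return fused_funcs_mgr_; }

 private:
  FuncManagerLike fused_funcs_mgr_;
};

void Inspect(const SessionStateLike& state) {
  // The test only holds a const reference, so it casts away constness to
  // reach the mutable accessor; safe here because the underlying object is
  // not actually const.
  auto& func_mgr = const_cast<SessionStateLike&>(state).GetMutableFuncMgr();
  const NodeComputeInfoLike* compute_func = nullptr;
  func_mgr.GetFuncs("node_name", compute_func);
}

The const_cast is confined to test code; production callers are expected to hold a mutable SessionState to begin with.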
diff --git a/onnxruntime/test/testdata/custom_execution_provider_library/my_allocator.cc b/onnxruntime/test/testdata/custom_execution_provider_library/my_allocator.cc
index ac053eca95..aca03e7a6b 100644
--- a/onnxruntime/test/testdata/custom_execution_provider_library/my_allocator.cc
+++ b/onnxruntime/test/testdata/custom_execution_provider_library/my_allocator.cc
@@ -11,6 +11,7 @@ MyEPAllocator::MyEPAllocator(OrtDevice::DeviceId device_id)
 }
 
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(disable : 26400)
+#pragma warning(disable : 26409)
 #endif
 void* MyEPAllocator::Alloc(size_t size) {
   void* device_address = new (std::nothrow) uint8_t[size];
diff --git a/onnxruntime/test/util/test_allocator.cc b/onnxruntime/test/util/test_allocator.cc
index 1383894372..002e759bd0 100644
--- a/onnxruntime/test/util/test_allocator.cc
+++ b/onnxruntime/test/util/test_allocator.cc
@@ -17,6 +17,7 @@ MockedOrtAllocator::~MockedOrtAllocator() {
 }
 
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(disable : 26400)
+#pragma warning(disable : 26409)
 #endif
 void* MockedOrtAllocator::Alloc(size_t size) {
   constexpr size_t extra_len = sizeof(size_t);
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 82c984608a..f9c900295f 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1620,7 +1620,8 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
                 if not args.disable_ml_ops:
                     run_subprocess([sys.executable, 'onnxruntime_test_python_mlops.py'], cwd=cwd, dll_path=dll_path)
 
-                if args.enable_training and args.use_cuda:
+                # The following test has multiple failures on Windows
+                if args.enable_training and args.use_cuda and not is_windows():
                     # run basic frontend tests
                     run_training_python_frontend_tests(cwd=cwd)
 
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml
index 4278039acd..1236a78627 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml
@@ -2,7 +2,7 @@ parameters:
   DoEsrp: 'false'
 
 jobs:
-- template: ../../templates/win-ci-2019.yml
+- template: win-ci-2019.yml
   parameters:
     AgentPool : 'onnxruntime-gpu-winbuild'
     ArtifactName: 'drop-nuget-dml'
@@ -25,7 +25,7 @@ jobs:
       mkdir $(Build.ArtifactStagingDirectory)\testdata
       copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata
 
-- template: ../../templates/win-ci-2019.yml
+- template: win-ci-2019.yml
   parameters:
     AgentPool : 'onnxruntime-gpu-winbuild'
     ArtifactName: 'drop-win-dml-x86-zip'
@@ -47,7 +47,7 @@ jobs:
      mkdir $(Build.ArtifactStagingDirectory)\testdata
      copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata
 
-- template: ../../templates/win-ci-2019.yml
+- template: win-ci-2019.yml
   parameters:
     AgentPool : 'onnxruntime-gpu-winbuild'
     ArtifactName: 'drop-win-dml-arm64-zip'
@@ -69,7 +69,7 @@ jobs:
      mkdir $(Build.ArtifactStagingDirectory)\testdata
      copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata
 
-- template: ../../templates/win-ci-2019.yml
+- template: win-ci-2019.yml
   parameters:
     AgentPool : 'onnxruntime-gpu-winbuild'
     ArtifactName: 'drop-win-dml-arm-zip'
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci-2019.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/win-ci-2019.yml
similarity index 97%
rename from tools/ci_build/github/azure-pipelines/templates/win-ci-2019.yml
rename to tools/ci_build/github/azure-pipelines/nuget/templates/win-ci-2019.yml
index 04e21bcd22..976ae5b921 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-ci-2019.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/win-ci-2019.yml
@@ -197,14 +197,14 @@ jobs:
          arguments: ${{ parameters.BuildArch }}
          modifyEnvironment: true
       # Esrp signing
-      - template: win-esrp-dll.yml
+      - template: ../../templates/win-esrp-dll.yml
        parameters:
          FolderPath: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)'
          DisplayName: 'ESRP - Sign Native dlls'
          DoEsrp: ${{ parameters.DoEsrp }}
          Pattern: 'onnx_test_runner.exe, onnxruntime_perf_test.exe,*.dll'
       #keep sync with src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
-      - template: win-esrp-dll.yml
+      - template: ../../templates/win-esrp-dll.yml
        parameters:
          FolderPath: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\$(BuildConfig)'
          DisplayName: 'ESRP - Sign C# dlls'
@@ -238,7 +238,7 @@ jobs:
          filename: 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat'
          arguments: ${{ parameters.BuildArch }}
          modifyEnvironment: true
-      - template: win-esrp-dll.yml
+      - template: ../../templates/win-esrp-dll.yml
        parameters:
          FolderPath: '$(Build.SourcesDirectory)\js\node\bin\napi-v3\win32\x64'
          DisplayName: 'ESRP - Sign Node.js binding binaries'
@@ -267,11 +267,11 @@ jobs:
        workingDirectory: '$(Build.ArtifactStagingDirectory)'
 
     - ${{ if eq(parameters['DoCompliance'], 'true') }}:
-      - template: compliance.yml
+      - template: ../../templates/compliance.yml
        parameters :
          msbuildPlatform: ${{ parameters.sln_platform }}
 
-    - template: component-governance-component-detection-steps.yml
+    - template: ../../templates/component-governance-component-detection-steps.yml
      parameters :
        condition : 'succeeded'
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
index 8189eda2cc..47fc0e388e 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
@@ -173,6 +173,7 @@ jobs:
 
     - ${{ if eq(parameters.EnablePython, true) }}:
       - powershell: |
+          python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
          Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
 
@@ -196,26 +197,32 @@ jobs:
            **/*.pdb
            **/*.dll
 
-    - task: PythonScript@0
-      displayName: 'Generate cmake config'
+    #Manually set msBuildCommandline so that we can also set CAExcludePath
+    #build_dir must be a sub folder of $(Build.SourcesDirectory)
+    - task: SDLNativeRules@3
+      displayName: 'Run the PREfast SDL Native Rules for MSBuild'
       inputs:
-        scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-        arguments: '--config RelWithDebInfo --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON'
-      workingDirectory: '$(Build.BinariesDirectory)'
+        msBuildArchitecture: amd64
+        setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON'
+        msBuildCommandline: '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\RelWithDebInfo\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform=${{ parameters.msbuildPlatform }} /p:configuration=RelWithDebInfo /p:VisualStudioVersion="16.0" /m /p:PreferredToolArchitecture=x64'
+        excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files (x86)'
+        rulesetName: Custom
+        customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset
 
-    - task: VSBuild@1
-      displayName: 'Build'
+    - task: SdtReport@2
+      displayName: 'Create Security Analysis Report'
       inputs:
-        solution: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\onnxruntime.sln'
-        platform: ${{ parameters.msbuildPlatform }}
-        configuration: ${{ parameters.BuildConfig }}
-        msbuildArgs: /m /p:CAExcludePath="$(Build.BinariesDirectory);$(Build.SourcesDirectory)\cmake;C:\program files (x86)" /p:PreferredToolArchitecture=x64
-        msbuildArchitecture: ${{ parameters.buildArch }}
-        maximumCpuCount: true
-        logProjectEvents: false
-        workingFolder: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}'
-        createLogFile: true
+        SDLNativeRules: true
+
+    - task: PublishSecurityAnalysisLogs@3
+      displayName: 'Publish Security Analysis Logs'
+      continueOnError: true
+
+    - task: PostAnalysis@2
+      displayName: 'Guardian Break'
+      inputs:
+        GdnBreakGdnToolSDLNativeRulesSeverity: Warning
+        GdnBreakGdnToolSDLNativeRules: true
 
     - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}:
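In the my_allocator.cc and test_allocator.cc hunks earlier in this patch, warning C26409 ("avoid calling new and delete explicitly") is suppressed rather than fixed: an OrtAllocator must hand raw memory across a C ABI, so returning a smart pointer is not an option there. A hedged sketch of the idiom follows; it is simplified and does not reproduce the real MyEPAllocator/MockedOrtAllocator surrounding code:

#include <cstddef>
#include <cstdint>
#include <new>

#if defined(_MSC_VER) && !defined(__clang__)
// Scoped to this translation unit: an allocator is one of the few places
// where raw new/delete is the intended design, so the rule is disabled
// here instead of rewriting the code.
#pragma warning(disable : 26409)
#endif

// new (std::nothrow) returns nullptr on failure instead of throwing,
// matching the OrtAllocator contract of signalling OOM with a null pointer.
void* AllocExample(size_t size) {
  return new (std::nothrow) uint8_t[size];
}

void FreeExample(void* p) {
  delete[] static_cast<uint8_t*>(p);
}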
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml
index e919400234..e9167d3684 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml
@@ -20,6 +20,10 @@ parameters:
 - name: isX86
   type: boolean
   default: false
+
+- name: isTraining
+  type: boolean
+  default: false
 
 - name: EnablePython
   type: boolean
@@ -109,6 +113,13 @@ jobs:
        workingDirectory: '$(Build.SourcesDirectory)\cmake\external\onnx'
        displayName: 'Install ONNX'
 
+    - ${{ if eq(parameters.isTraining, true) }}:
+      - script: |
+          python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\linux\docker\scripts\training\ortmodule\stage1\requirements_torch1.9.0_cu11.1.txt
+          python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\linux\docker\scripts\training\requirements.txt
+        workingDirectory: '$(Build.BinariesDirectory)'
+        displayName: 'Install python modules'
+
     - task: NuGetToolInstaller@0
       displayName: Use Nuget 5.7.0
       inputs:
@@ -182,6 +193,7 @@ jobs:
 
     - ${{ if eq(parameters.EnablePython, true) }}:
       - powershell: |
+          python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
          Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
 
@@ -205,26 +217,35 @@ jobs:
            **/*.pdb
            **/*.dll
 
-    - task: PythonScript@0
-      displayName: 'Generate cmake config'
-      inputs:
-        scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-        arguments: '--config Debug --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} onnxruntime_ENABLE_STATIC_ANALYSIS=ON'
-      workingDirectory: '$(Build.BinariesDirectory)'
-
-    - task: VSBuild@1
-      displayName: 'Build'
+    #Manually set msBuildCommandline so that we can also set CAExcludePath
+    #build_dir must be a sub folder of $(Build.SourcesDirectory)
+    #TODO: move this step to a CPU-only machine to save GPU resources.
+    - task: SDLNativeRules@3
+      displayName: 'Run the PREfast SDL Native Rules for MSBuild'
       inputs:
-        solution: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\onnxruntime.sln'
-        platform: ${{ parameters.msbuildPlatform }}
-        configuration: ${{ parameters.BuildConfig }}
-        msbuildArgs: /m /p:CAExcludePath="$(Build.BinariesDirectory);$(Build.SourcesDirectory)\cmake;C:\program files (x86)" /p:PreferredToolArchitecture=x64
-        msbuildArchitecture: ${{ parameters.buildArch }}
-        maximumCpuCount: true
-        logProjectEvents: false
-        workingFolder: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}'
-        createLogFile: true
+        msBuildArchitecture: amd64
+        setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} onnxruntime_ENABLE_STATIC_ANALYSIS=ON'
+        msBuildCommandline: '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\RelWithDebInfo\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform=${{ parameters.msbuildPlatform }} /p:configuration=RelWithDebInfo /p:VisualStudioVersion="16.0" /m /p:PreferredToolArchitecture=x64'
+        excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files (x86)'
+        rulesetName: Custom
+        customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset
+        publishXML: true
+
+    - task: SdtReport@2
+      displayName: 'Create Security Analysis Report'
+      inputs:
+        SDLNativeRules: true
+
+    - task: PublishSecurityAnalysisLogs@3
+      displayName: 'Publish Security Analysis Logs'
+      continueOnError: true
+
+    - task: PostAnalysis@2
+      displayName: 'Guardian Break v2'
+      inputs:
+        GdnBreakGdnToolSDLNativeRulesSeverity: Warning
+        GdnBreakGdnToolSDLNativeRules: true
 
     - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}:
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
index a2d9dc3048..3fd5f46721 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
@@ -5,229 +5,54 @@ parameters:
   default: true
 
 stages:
-- stage: inference
+- stage: cuda
   dependsOn: []
   jobs:
-  - job: 'build'
-    pool: 'Win-GPU-2019'
-    strategy:
-      matrix:
-        cuda:
-          additionalBuildFlags: --build_java --build_nodejs --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=52 --gen_doc validate --enable_cuda_profiling
-          EnvSetupScript: setup_env_cuda_11.bat
-          ORT_EP_NAME: CUDA
-        dml:
-          additionalBuildFlags: --use_dml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --enable_wcos --use_winml
-          EnvSetupScript: setup_env.bat
-          ORT_EP_NAME: DML
-    variables:
-      OrtPackageId: 'Microsoft.ML.OnnxRuntime.Gpu'
-      MsbuildArguments: '-maxcpucount'
-      TESTONGPU: 'ON'
-      OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)'
-      DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
-      setVcvars: true
+  - template: templates/win-gpu-ci.yml
+    parameters:
       BuildConfig: 'RelWithDebInfo'
-      UseOmp: ''
+      EnvSetupScript: setup_env_cuda_11.bat
       buildArch: x64
+      additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --gen_doc validate --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=52
       msbuildPlatform: x64
       isX86: false
-      ALLOW_RELEASED_ONNX_OPSET_ONLY: '0'
-      DocUpdateNeeded: false
-    timeoutInMinutes: 180
-    workspace:
-      clean: all
-    steps:
-    - task: UsePythonVersion@0
-      inputs:
-        versionSpec: '3.7'
-        addToPath: true
-        architecture: $(buildArch)
+      job_name_suffix: x64_RelWithDebInfo
+      RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
+      RunStaticCodeAnalysis: false
+      ORT_EP_NAME: CUDA
+      DocUpdateNeeded: true
 
-    - task: NodeTool@0
-      inputs:
-        versionSpec: '12.x'
-        force32bit: $(isX86)
-
-    - task: JavaToolInstaller@0
-      #Our build machine doesn't have java x86
-      condition: and(succeeded(), eq(variables['buildArch'], 'x64'))
-      inputs:
-        versionSpec: '11'
-        jdkArchitectureOption: $(buildArch)
-        jdkSourceOption: 'PreInstalled'
-
-    - task: BatchScript@1
-      displayName: 'setup env'
-      inputs:
-        filename: '$(Build.SourcesDirectory)\tools\ci_build\github\windows\$(EnvSetupScript)'
-        modifyEnvironment: true
-        workingFolder: '$(Build.BinariesDirectory)'
-
-    - script: |
-        set ORT_DOXY_SRC=$(Build.SourcesDirectory)
-        set ORT_DOXY_OUT=$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)
-        mkdir %ORT_DOXY_SRC%
-        mkdir %ORT_DOXY_OUT%
-        "C:\Program Files\doxygen\bin\doxygen.exe" $(Build.SourcesDirectory)\tools\ci_build\github\Doxyfile_csharp.cfg
-
-      workingDirectory: '$(Build.SourcesDirectory)'
-      displayName: 'API Documentation Check and generate'
-
-    - script: |
-        python -m pip install -q setuptools wheel numpy
-      workingDirectory: '$(Build.BinariesDirectory)'
-      displayName: 'Install python modules'
-
-    - powershell: |
-        $Env:USE_MSVC_STATIC_RUNTIME=1
-        $Env:ONNX_ML=1
-        $Env:CMAKE_ARGS="-DONNX_USE_PROTOBUF_SHARED_LIBS=OFF -DProtobuf_USE_STATIC_LIBS=ON -DONNX_USE_LITE_PROTO=ON -DCMAKE_TOOLCHAIN_FILE=C:/vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=$(buildArch)-windows-static"
-        python setup.py bdist_wheel
-        python -m pip uninstall -y onnx -qq
-        Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
-      workingDirectory: '$(Build.SourcesDirectory)\cmake\external\onnx'
-      displayName: 'Install ONNX'
-
-    - task: NuGetToolInstaller@0
-      displayName: Use Nuget 5.7.0
-      inputs:
-        versionSpec: 5.7.0
-
-    - task: NuGetCommand@2
-      displayName: 'NuGet restore'
-      inputs:
-        command: 'restore'
-        feedsToUse: 'config'
-        restoreSolution: '$(Build.SourcesDirectory)\packages.config'
-        nugetConfigPath: '$(Build.SourcesDirectory)\NuGet.config'
-        restoreDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)'
-
-    - task: PythonScript@0
-      displayName: 'Generate cmake config'
-      inputs:
-        scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-        arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) $(UseOmp) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests $(additionalBuildFlags)'
-      workingDirectory: '$(Build.BinariesDirectory)'
-
-    - task: VSBuild@1
-      displayName: 'Build'
-      inputs:
-        solution: '$(Build.BinariesDirectory)\$(BuildConfig)\onnxruntime.sln'
-        platform: $(msbuildPlatform)
-        configuration: $(BuildConfig)
-        msbuildArgs: $(MsbuildArguments)
-        msbuildArchitecture: $(buildArch)
-        maximumCpuCount: true
-        logProjectEvents: false
-        workingFolder: '$(Build.BinariesDirectory)\$(BuildConfig)'
-        createLogFile: true
-    - task: PythonScript@0
-      displayName: 'Build wheel'
-      inputs:
-        scriptPath: '$(Build.SourcesDirectory)\setup.py'
-        arguments: 'bdist_wheel'
-      workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)'
-
-    - task: MSBuild@1
-      displayName: 'Restore NuGet Packages'
-      inputs:
-        solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
-        platform: 'Any CPU'
-        configuration: '$(BuildConfig)'
-        msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)'
-        workingDirectory: '$(Build.SourcesDirectory)\csharp'
-
-    - task: MSBuild@1
-      displayName: 'Build C#'
-      inputs:
-        solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
-        platform: 'Any CPU'
-        configuration: '$(BuildConfig)'
-        msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:DefineConstants=USE_$(ORT_EP_NAME)'
-        workingDirectory: '$(Build.SourcesDirectory)\csharp'
-
-    - task: DotNetCoreCLI@2
-      displayName: 'Test C#'
-      condition: and(and(succeeded(), eq(variables['BuildConfig'], 'RelWithDebInfo')),eq('${{ parameters.RunOnnxRuntimeTests}}', true))
-      inputs:
-        command: test
-        projects: '$(Build.SourcesDirectory)\csharp\test\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj'
-        configuration: '$(BuildConfig)'
-        arguments: '--configuration $(BuildConfig) -p:Platform="Any CPU" -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:DefineConstants=USE_$(ORT_EP_NAME)'
-        workingDirectory: '$(Build.SourcesDirectory)\csharp'
-
-    - powershell: |
-        Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
-        python $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) $(UseOmp) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests $(additionalBuildFlags)
-      workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)'
-      condition: and(succeeded(), eq('${{ parameters.RunOnnxRuntimeTests}}', true))
-      displayName: 'Run tests'
-
-    # if the validation from --gen_doc failed it sets a variable so we can publish the latest version of the docs
-    # as an artifact, allowing a developer to download this and replace the current version instead of having to build
-    # and generate the docs locally themselves. handle each of the two md files separately - simpler than copying
-    # them to another location and publishing from there in a single task.
-    - task: PublishBuildArtifacts@1
-      condition: and(failed(), eq(variables['DocUpdateNeeded'], 'true'))
-      inputs:
-        pathtoPublish: '$(Build.SourcesDirectory)/docs/OperatorKernels.md'
-        artifactName: 'OperatorKernels.md'
-
-    - task: PublishBuildArtifacts@1
-      condition: and(failed(), eq(variables['DocUpdateNeeded'], 'true'))
-      inputs:
-        pathtoPublish: '$(Build.SourcesDirectory)/docs/ContribOperators.md'
-        artifactName: 'ContribOperators.md'
-
-    - task: PublishSecurityAnalysisLogs@3
-      displayName: 'Publish Security Analysis Logs'
-      condition: and(succeeded(), eq(variables['BuildConfig'], 'RelWithDebInfo'))
-      inputs:
-        ArtifactName: CodeAnalysisLogs
-
-    - task: PublishTestResults@2
-      displayName: 'Publish unit test results'
-      inputs:
-        testResultsFiles: '**/*.results.xml'
-        searchFolder: '$(Build.BinariesDirectory)'
-        testRunTitle: 'Unit Test Run'
-      condition: succeededOrFailed()
-
-    - template: templates/component-governance-component-detection-steps.yml
-      parameters :
-        condition : 'succeeded'
-
 - stage: training
   dependsOn: []
   jobs:
-  - template: templates/win-ci-2019.yml
-    parameters:
-      AgentPool : 'Win-GPU-2019'
-      JobName: 'Win_GPU_Training'
-      # TODO fix test failures and remove --skip_onnx_tests
-      BuildCommand: >-
-        --build_dir $(Build.BinariesDirectory)
-        --build_shared_lib
-        --cmake_generator "Visual Studio 16 2019"
-        --enable_onnx_tests
-        --enable_training
-        --skip_onnx_tests
-        --skip_submodule_sync
-        --use_cuda
-        --cuda_version 10.2
-        --cuda_home "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2"
-        --cudnn_home "C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda"
-        --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52"
-      BuildArch: 'x64'
-      EnvSetupScript: 'setup_env_cuda.bat'
-      sln_platform: 'x64'
-      CudaVersion: '10.2'
-      OrtPackageId: 'Microsoft.ML.OnnxRuntime.Gpu'
-      BuildConfigurations: ['RelWithDebInfo']
-      # Enable unreleased onnx opsets in CI builds
-      # This facilitates testing the implementation for the new opsets
-      AllowReleasedOpsetOnly: '0'
-      DoCompliance: 'true'
\ No newline at end of file
+  - template: templates/win-gpu-ci.yml
+    parameters:
+      BuildConfig: 'RelWithDebInfo'
+      EnvSetupScript: setup_env_cuda_11.bat
+      buildArch: x64
+      additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=52
+      msbuildPlatform: x64
+      isX86: false
+      job_name_suffix: x64_RelWithDebInfo
+      RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
+      RunStaticCodeAnalysis: false
+      ORT_EP_NAME: CUDA
+      isTraining: true
+
+- stage: dml
+  dependsOn: []
+  jobs:
+  - template: templates/win-gpu-ci.yml
+    parameters:
+      BuildConfig: 'RelWithDebInfo'
+      EnvSetupScript: setup_env.bat
+      buildArch: x64
+      additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0
+      msbuildPlatform: x64
+      isX86: false
+      job_name_suffix: x64_RelWithDebInfo
+      RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
+      RunStaticCodeAnalysis: false
+      ORT_EP_NAME: DML