This commit is contained in:
Changming Sun 2021-12-19 20:54:29 -08:00 committed by GitHub
Parent bd4fb4c5da
Commit 4e9e01cb3c
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
188 changed files with 1328 additions and 953 deletions

View file

@@ -1114,7 +1114,6 @@ if (onnxruntime_USE_MIGRAPHX)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_MIGRAPHX=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES migraphx)
endif()
if (onnxruntime_USE_ARMNN)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_ARMNN=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_ARMNN=1)
@@ -1142,8 +1141,8 @@ function(onnxruntime_set_compile_flags target_name)
set_target_properties(${target_name}
PROPERTIES VS_GLOBAL_CAExcludePath "${ORT_BINARY_DIR};${ORT_SOURCE_DIR}")
if (onnxruntime_ENABLE_STATIC_ANALYSIS)
target_compile_options(${target_name} PRIVATE "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/analyze:stacksize 131072>")
target_compile_options(${target_name} PRIVATE "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/analyze:external->")
target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /analyze>" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/analyze>")
target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /analyze:external->" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/analyze:external->")
endif()
else()
# Enable warning
@@ -1190,6 +1189,10 @@ function(onnxruntime_configure_target target_name)
target_link_directories(${target_name} PRIVATE ${onnxruntime_LINK_DIRS})
onnxruntime_set_compile_flags(${target_name})
onnxruntime_set_source_file_properties(${target_name})
#Uncomment the following three lines to reproduce static analysis errors locally
#if(WIN32 AND onnxruntime_ENABLE_STATIC_ANALYSIS)
# set_target_properties(${target_name} PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props)
#endif()
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_LTO)
set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE)

View file

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CodeAnalysisRuleSet>$(MSBuildThisFileDirectory)Sdl.ruleset</CodeAnalysisRuleSet>
<!-- External libraries are in or below the directory with the sln file. Source is under \onnxruntime so not affected by this.
Also need to exclude things under \cmake such as \cmake\external\protobuf, and the easiest way to do that in all
environments is to use the directory this file is in.
-->
<CAExcludePath>$(SolutionDir);$(MSBuildThisFileDirectory)</CAExcludePath>
<RunCodeAnalysis>true</RunCodeAnalysis>
</PropertyGroup>
</Project>

cmake/Sdl.ruleset Normal file
View file

@@ -0,0 +1,268 @@
<?xml version="1.0" encoding="utf-8"?>
<RuleSet Name="ONNX Runtime SDL Rules" Description="These rules focus on the most critical and common problems in your native code, including potential security holes and application crashes. You should include this rule set in any custom rule set you create for your native projects. This ruleset is designed to work with Visual Studio Professional edition and higher." ToolsVersion="16.0">
<Rules AnalyzerId="Microsoft.Analyzers.NativeCodeAnalysis" RuleNamespace="Microsoft.Rules.Native">
<Rule Id="C26100" Action="Error" />
<Rule Id="C26101" Action="Error" />
<Rule Id="C26110" Action="Error" />
<Rule Id="C26111" Action="Error" />
<Rule Id="C26112" Action="Error" />
<Rule Id="C26115" Action="Error" />
<Rule Id="C26116" Action="Error" />
<Rule Id="C26117" Action="Error" />
<Rule Id="C26140" Action="Error" />
<Rule Id="C26400" Action="Error" />
<Rule Id="C26404" Action="Error" />
<Rule Id="C26406" Action="Error" />
<Rule Id="C26408" Action="Error" />
<Rule Id="C26409" Action="Error" />
<Rule Id="C26437" Action="Error" />
<Rule Id="C26439" Action="Error" />
<Rule Id="C26441" Action="Error" />
<Rule Id="C26444" Action="Error" />
<Rule Id="C26449" Action="Error" />
<Rule Id="C26450" Action="Error" />
<Rule Id="C26451" Action="Error" />
<Rule Id="C26452" Action="Error" />
<Rule Id="C26453" Action="Error" />
<Rule Id="C26454" Action="Error" />
<Rule Id="C26464" Action="Error" />
<Rule Id="C26478" Action="Error" />
<Rule Id="C26479" Action="Error" />
<Rule Id="C26488" Action="Error" />
<Rule Id="C26497" Action="Error" />
<Rule Id="C26498" Action="Error" />
<Rule Id="C26810" Action="Error" />
<Rule Id="C26811" Action="Error" />
<Rule Id="C26812" Action="Error" />
<Rule Id="C26814" Action="Error" />
<Rule Id="C26815" Action="Error" />
<Rule Id="C26816" Action="Error" />
<Rule Id="C26817" Action="Error" />
<Rule Id="C26819" Action="Error" />
<Rule Id="C26820" Action="Error" />
<Rule Id="C28020" Action="Error" />
<Rule Id="C28021" Action="Error" />
<Rule Id="C28022" Action="Error" />
<Rule Id="C28023" Action="Error" />
<Rule Id="C28024" Action="Error" />
<Rule Id="C28039" Action="Error" />
<Rule Id="C28112" Action="Error" />
<Rule Id="C28113" Action="Error" />
<Rule Id="C28125" Action="Error" />
<Rule Id="C28137" Action="Error" />
<Rule Id="C28138" Action="Error" />
<Rule Id="C28159" Action="Error" />
<Rule Id="C28160" Action="Error" />
<Rule Id="C28163" Action="Error" />
<Rule Id="C28164" Action="Error" />
<Rule Id="C28182" Action="Error" />
<Rule Id="C28183" Action="Error" />
<Rule Id="C28193" Action="Error" />
<Rule Id="C28196" Action="Error" />
<Rule Id="C28202" Action="Error" />
<Rule Id="C28203" Action="Error" />
<Rule Id="C28204" Action="Error" />
<Rule Id="C28205" Action="Error" />
<Rule Id="C28206" Action="Error" />
<Rule Id="C28207" Action="Error" />
<Rule Id="C28208" Action="Error" />
<Rule Id="C28209" Action="Error" />
<Rule Id="C28210" Action="Error" />
<Rule Id="C28211" Action="Error" />
<Rule Id="C28212" Action="Error" />
<Rule Id="C28213" Action="Error" />
<Rule Id="C28214" Action="Error" />
<Rule Id="C28215" Action="Error" />
<Rule Id="C28216" Action="Error" />
<Rule Id="C28217" Action="Error" />
<Rule Id="C28218" Action="Error" />
<Rule Id="C28219" Action="Error" />
<Rule Id="C28220" Action="Error" />
<Rule Id="C28221" Action="Error" />
<Rule Id="C28222" Action="Error" />
<Rule Id="C28223" Action="Error" />
<Rule Id="C28224" Action="Error" />
<Rule Id="C28225" Action="Error" />
<Rule Id="C28226" Action="Error" />
<Rule Id="C28227" Action="Error" />
<Rule Id="C28228" Action="Error" />
<Rule Id="C28229" Action="Error" />
<Rule Id="C28230" Action="Error" />
<Rule Id="C28231" Action="Error" />
<Rule Id="C28232" Action="Error" />
<Rule Id="C28233" Action="Error" />
<Rule Id="C28234" Action="Error" />
<Rule Id="C28235" Action="Error" />
<Rule Id="C28236" Action="Error" />
<Rule Id="C28237" Action="Error" />
<Rule Id="C28238" Action="Error" />
<Rule Id="C28239" Action="Error" />
<Rule Id="C28240" Action="Error" />
<Rule Id="C28241" Action="Error" />
<Rule Id="C28243" Action="Error" />
<Rule Id="C28244" Action="Error" />
<Rule Id="C28245" Action="Error" />
<Rule Id="C28246" Action="Error" />
<Rule Id="C28250" Action="Error" />
<Rule Id="C28251" Action="Error" />
<Rule Id="C28252" Action="Error" />
<Rule Id="C28253" Action="Error" />
<Rule Id="C28254" Action="Error" />
<Rule Id="C28260" Action="Error" />
<Rule Id="C28262" Action="Error" />
<Rule Id="C28263" Action="Error" />
<Rule Id="C28266" Action="Error" />
<Rule Id="C28267" Action="Error" />
<Rule Id="C28272" Action="Error" />
<Rule Id="C28273" Action="Error" />
<Rule Id="C28275" Action="Error" />
<Rule Id="C28279" Action="Error" />
<Rule Id="C28280" Action="Error" />
<Rule Id="C28282" Action="Error" />
<Rule Id="C28285" Action="Error" />
<Rule Id="C28286" Action="Error" />
<Rule Id="C28287" Action="Error" />
<Rule Id="C28288" Action="Error" />
<Rule Id="C28289" Action="Error" />
<Rule Id="C28290" Action="Error" />
<Rule Id="C28291" Action="Error" />
<Rule Id="C28300" Action="Error" />
<Rule Id="C28301" Action="Error" />
<Rule Id="C28302" Action="Error" />
<Rule Id="C28303" Action="Error" />
<Rule Id="C28304" Action="Error" />
<Rule Id="C28305" Action="Error" />
<Rule Id="C28306" Action="Error" />
<Rule Id="C28307" Action="Error" />
<Rule Id="C28308" Action="Error" />
<Rule Id="C28309" Action="Error" />
<Rule Id="C28350" Action="Error" />
<Rule Id="C28351" Action="Error" />
<Rule Id="C33001" Action="Error" />
<Rule Id="C33004" Action="Error" />
<Rule Id="C33005" Action="Error" />
<Rule Id="C33010" Action="Error" />
<Rule Id="C33011" Action="Error" />
<Rule Id="C33020" Action="Error" />
<Rule Id="C6001" Action="Error" />
<Rule Id="C6011" Action="Error" />
<Rule Id="C6029" Action="Error" />
<Rule Id="C6031" Action="Error" />
<Rule Id="C6053" Action="Error" />
<Rule Id="C6054" Action="Error" />
<Rule Id="C6059" Action="Error" />
<Rule Id="C6063" Action="Error" />
<Rule Id="C6064" Action="Error" />
<Rule Id="C6066" Action="Error" />
<Rule Id="C6067" Action="Error" />
<Rule Id="C6101" Action="Error" />
<Rule Id="C6200" Action="Error" />
<Rule Id="C6201" Action="Error" />
<Rule Id="C6214" Action="Error" />
<Rule Id="C6215" Action="Error" />
<Rule Id="C6216" Action="Error" />
<Rule Id="C6217" Action="Error" />
<Rule Id="C6220" Action="Error" />
<Rule Id="C6226" Action="Error" />
<Rule Id="C6230" Action="Error" />
<Rule Id="C6235" Action="Error" />
<Rule Id="C6236" Action="Error" />
<Rule Id="C6237" Action="Error" />
<Rule Id="C6242" Action="Error" />
<Rule Id="C6248" Action="Error" />
<Rule Id="C6250" Action="Error" />
<Rule Id="C6255" Action="Error" />
<Rule Id="C6258" Action="Error" />
<Rule Id="C6259" Action="Error" />
<Rule Id="C6260" Action="Error" />
<Rule Id="C6262" Action="Error" />
<Rule Id="C6263" Action="Error" />
<Rule Id="C6268" Action="Error" />
<Rule Id="C6269" Action="Error" />
<Rule Id="C6270" Action="Error" />
<Rule Id="C6271" Action="Error" />
<Rule Id="C6272" Action="Error" />
<Rule Id="C6273" Action="Error" />
<Rule Id="C6274" Action="Error" />
<Rule Id="C6276" Action="Error" />
<Rule Id="C6277" Action="Error" />
<Rule Id="C6278" Action="Error" />
<Rule Id="C6279" Action="Error" />
<Rule Id="C6280" Action="Error" />
<Rule Id="C6281" Action="Error" />
<Rule Id="C6282" Action="Error" />
<Rule Id="C6283" Action="Error" />
<Rule Id="C6284" Action="Error" />
<Rule Id="C6285" Action="Error" />
<Rule Id="C6286" Action="Error" />
<Rule Id="C6287" Action="Error" />
<Rule Id="C6288" Action="Error" />
<Rule Id="C6289" Action="Error" />
<Rule Id="C6290" Action="Error" />
<Rule Id="C6291" Action="Error" />
<Rule Id="C6292" Action="Error" />
<Rule Id="C6293" Action="Error" />
<Rule Id="C6294" Action="Error" />
<Rule Id="C6295" Action="Error" />
<Rule Id="C6296" Action="Error" />
<Rule Id="C6297" Action="Error" />
<Rule Id="C6299" Action="Error" />
<Rule Id="C6302" Action="Error" />
<Rule Id="C6303" Action="Error" />
<Rule Id="C6305" Action="Error" />
<Rule Id="C6306" Action="Error" />
<Rule Id="C6308" Action="Error" />
<Rule Id="C6310" Action="Error" />
<Rule Id="C6312" Action="Error" />
<Rule Id="C6314" Action="Error" />
<Rule Id="C6317" Action="Error" />
<Rule Id="C6318" Action="Error" />
<Rule Id="C6319" Action="Error" />
<Rule Id="C6324" Action="Error" />
<Rule Id="C6326" Action="Error" />
<Rule Id="C6328" Action="Error" />
<Rule Id="C6331" Action="Error" />
<Rule Id="C6332" Action="Error" />
<Rule Id="C6333" Action="Error" />
<Rule Id="C6335" Action="Error" />
<Rule Id="C6381" Action="Error" />
<Rule Id="C6383" Action="Error" />
<Rule Id="C6384" Action="Error" />
<Rule Id="C6385" Action="Error" />
<Rule Id="C6386" Action="Error" />
<Rule Id="C6387" Action="Error" />
<Rule Id="C6388" Action="Error" />
<Rule Id="C6500" Action="Error" />
<Rule Id="C6501" Action="Error" />
<Rule Id="C6503" Action="Error" />
<Rule Id="C6504" Action="Error" />
<Rule Id="C6505" Action="Error" />
<Rule Id="C6506" Action="Error" />
<Rule Id="C6508" Action="Error" />
<Rule Id="C6509" Action="Error" />
<Rule Id="C6510" Action="Error" />
<Rule Id="C6511" Action="Error" />
<Rule Id="C6513" Action="Error" />
<Rule Id="C6514" Action="Error" />
<Rule Id="C6515" Action="Error" />
<Rule Id="C6516" Action="Error" />
<Rule Id="C6517" Action="Error" />
<Rule Id="C6518" Action="Error" />
<Rule Id="C6522" Action="Error" />
<Rule Id="C6525" Action="Error" />
<Rule Id="C6527" Action="Error" />
<Rule Id="C6530" Action="Error" />
<Rule Id="C6540" Action="Error" />
<Rule Id="C6551" Action="Error" />
<Rule Id="C6552" Action="Error" />
<Rule Id="C6701" Action="Error" />
<Rule Id="C6702" Action="Error" />
<Rule Id="C6703" Action="Error" />
<Rule Id="C6704" Action="Error" />
<Rule Id="C6705" Action="Error" />
<Rule Id="C6706" Action="Error" />
<Rule Id="C6993" Action="Error" />
<Rule Id="C6995" Action="Error" />
<Rule Id="C6997" Action="Error" />
</Rules>
</RuleSet>

View file

@@ -127,11 +127,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/common DEST
set_target_properties(onnxruntime_common PROPERTIES LINKER_LANGUAGE CXX)
set_target_properties(onnxruntime_common PROPERTIES FOLDER "ONNXRuntime")
if(WIN32)
# Add Code Analysis properties to enable C++ Core checks. Have to do it via a props file include.
set_target_properties(onnxruntime_common PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props)
endif()
# check if we need to link against librt on Linux
include(CheckLibraryExists)
include(CheckFunctionExists)

View file

@@ -120,8 +120,5 @@ if (WIN32)
target_compile_options(onnxruntime_graph PRIVATE
/EHsc # exception handling - C++ may throw, extern "C" will not
)
endif()
# Add Code Analysis properties to enable C++ Core checks. Have to do it via a props file include.
set_target_properties(onnxruntime_graph PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props)
endif()
endif()

View file

@@ -475,4 +475,7 @@ endforeach()
set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime")
if (WIN32)
target_compile_options(onnxruntime_mlas PRIVATE "/wd6385" "/wd4127")
if (onnxruntime_ENABLE_STATIC_ANALYSIS)
target_compile_options(onnxruntime_mlas PRIVATE "/analyze:stacksize 131072")
endif()
endif()

View file

@@ -454,6 +454,10 @@ if (onnxruntime_USE_DNNL)
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_dnnl_cc_srcs})
onnxruntime_add_shared_library_module(onnxruntime_providers_dnnl ${onnxruntime_providers_dnnl_cc_srcs})
target_link_directories(onnxruntime_providers_dnnl PRIVATE ${DNNL_LIB_DIR})
if (MSVC AND onnxruntime_ENABLE_STATIC_ANALYSIS)
# dnnl_convgrad.cc(47,0): Warning C6262: Function uses '38816' bytes of stack: exceeds /analyze:stacksize '16384'. Consider moving some data to heap.
target_compile_options(onnxruntime_providers_dnnl PRIVATE "/analyze:stacksize 131072")
endif()
add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR} ${DNNL_OCL_INCLUDE_DIR})

View file

@@ -81,6 +81,9 @@ function(AddTest)
# A lot of such things came from gtest
target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd6326>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd6326>")
# Raw new and delete. A lot of such things came from googletest.
target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26409>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26409>")
endif()
target_compile_options(${_UT_TARGET} PRIVATE ${disabled_warnings})
else()
@@ -681,7 +684,12 @@ AddTest(
onnx_test_data_proto nlohmann_json::nlohmann_json
DEPENDS ${all_dependencies}
)
if(NOT MSVC)
if (MSVC)
# The warning means the type of the two integral values around a binary operator is narrower than that of their result.
# Promoting the two input values first would make the expression more tolerant of integer overflow.
# However, this is test code, so we are less concerned.
target_compile_options(onnxruntime_test_all PRIVATE "/wd26451")
else()
target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")
endif()
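For context, a minimal C++ sketch (hypothetical code, not part of this commit) of the pattern C26451 flags and the operand promotion that avoids it:

#include <cstdint>

// rows * cols is evaluated as 32-bit int and can overflow before the
// implicit widening to int64_t; this is what C26451 warns about.
int64_t area_narrow(int rows, int cols) {
  return rows * cols;
}

// Casting one operand first makes the multiplication happen in 64 bits.
int64_t area_wide(int rows, int cols) {
  return static_cast<int64_t>(rows) * static_cast<int64_t>(cols);
}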
# the default logger tests conflict with the need to have an overall default logger
@@ -846,6 +854,15 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
if(WIN32)
target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd4141>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4141>")
# Avoid using new and delete. But this is a benchmark program; it's OK if it leaks.
target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26409>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26409>")
target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26400>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26400>")
target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26814>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26814>")
target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26497>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26497>")
target_compile_options(onnxruntime_benchmark PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>")
endif()
@@ -858,7 +875,11 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
onnxruntime_add_executable(onnxruntime_mlas_benchmark ${MLAS_BENCH_SOURCE_FILES})
target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util onnxruntime_framework ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
if(NOT WIN32)
if(WIN32)
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE debug Dbghelp)
# Avoid using new and delete. But this is a benchmark program; it's OK if it leaks.
target_compile_options(onnxruntime_mlas_benchmark PRIVATE /wd26409)
else()
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE nsync_cpp ${CMAKE_DL_LIBS})
endif()
set_target_properties(onnxruntime_mlas_benchmark PROPERTIES FOLDER "ONNXRuntimeTest")
@@ -1100,6 +1121,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
)
onnxruntime_add_executable(onnxruntime_mlas_test ${onnxruntime_mlas_test_src})
if(MSVC)
target_compile_options(onnxruntime_mlas_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26409>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26409>")
target_compile_options(onnxruntime_mlas_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>")
target_compile_options(onnxruntime_mlas_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd6326>"
@@ -1147,6 +1170,8 @@ if(UNIX)
endif()
else()
set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-DEF:${TEST_SRC_DIR}/testdata/custom_op_library/custom_op_library.def")
target_compile_options(custom_op_library PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler /wd26409>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26409>")
endif()
set_property(TARGET custom_op_library APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG})

View file

@@ -35,6 +35,7 @@
#include "core/common/exceptions.h"
#include "core/common/make_string.h"
#include "core/common/status.h"
#include "core/common/gsl_suppress.h"
namespace onnxruntime {
@@ -253,13 +254,6 @@ void LogRuntimeError(uint32_t session_id, const common::Status& status, const ch
} \
} while (0)
// C++ Core Guideline check suppression.
#if defined(_MSC_VER) && !defined(__NVCC__) && !defined(__clang__)
#define GSL_SUPPRESS(tag) [[gsl::suppress(tag)]]
#else
#define GSL_SUPPRESS(tag)
#endif
inline long long TimeDiffMicroSeconds(TimePoint start_time) {
auto end_time = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();

View file

@@ -0,0 +1,15 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#ifndef GSL_SUPPRESS
#if defined(__clang__) && !defined(__NVCC__)
#define GSL_SUPPRESS(x) [[gsl::suppress("x")]]
#else
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__NVCC__)
#define GSL_SUPPRESS(x) [[gsl::suppress(x)]]
#else
#define GSL_SUPPRESS(x)
#endif // _MSC_VER
#endif // __clang__
#endif
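A usage sketch (hypothetical function, not from this commit): the macro attaches the suppression attribute to the next declaration on compilers that understand it and expands to nothing elsewhere, so a single deliberate violation of a Core Guidelines rule such as r.11 (avoid raw new/delete) can be waived without disabling the rule globally.

#include "core/common/gsl_suppress.h"

GSL_SUPPRESS(r.11)
int* make_counter() {
  return new int(0);  // intentional raw new; the caller owns the pointer
}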

View file

@@ -19,7 +19,7 @@ limitations under the License.
#ifdef _WIN32
#include <winerror.h>
#endif
#include "core/common/gsl_suppress.h"
namespace onnxruntime {
namespace common {
@@ -80,35 +80,34 @@ constexpr const char* StatusCodeToString(StatusCode status) noexcept {
#ifdef _WIN32
constexpr HRESULT StatusCodeToHRESULT(StatusCode status) noexcept {
switch (status)
{
switch (status) {
case StatusCode::OK:
return S_OK;
return S_OK;
case StatusCode::FAIL:
return E_FAIL;
return E_FAIL;
case StatusCode::INVALID_ARGUMENT:
return E_INVALIDARG;
return E_INVALIDARG;
case StatusCode::NO_SUCHFILE:
return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND);
return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND);
case StatusCode::NO_MODEL:
return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND);
return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND);
case StatusCode::ENGINE_ERROR:
return E_FAIL;
return E_FAIL;
case StatusCode::RUNTIME_EXCEPTION:
return E_FAIL;
return E_FAIL;
case StatusCode::INVALID_PROTOBUF:
return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT);
return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT);
case StatusCode::MODEL_LOADED:
return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR);
return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR);
case StatusCode::NOT_IMPLEMENTED:
return E_NOTIMPL;
return E_NOTIMPL;
case StatusCode::INVALID_GRAPH:
return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT);
return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT);
case StatusCode::EP_FAIL:
return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR);
return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR);
default:
return E_FAIL;
}
return E_FAIL;
}
}
#endif
@@ -122,9 +121,10 @@ class [[nodiscard]] Status {
Status(StatusCategory category, int code);
GSL_SUPPRESS(r.11)
Status(const Status& other)
: state_((other.state_ == nullptr) ? nullptr : new State(*other.state_)) {}
GSL_SUPPRESS(r.11)
Status& operator=(const Status& other) {
if (state_ != other.state_) {
if (other.state_ == nullptr) {

View file

@@ -183,7 +183,7 @@ class KernelDefBuilder {
static std::unique_ptr<KernelDefBuilder> Create() { return std::make_unique<KernelDefBuilder>(); }
explicit KernelDefBuilder()
: kernel_def_(new KernelDef()) {}
: kernel_def_(std::make_unique<KernelDef>()) {}
KernelDefBuilder& SetName(const std::string& op_name);
KernelDefBuilder& SetName(const char* op_name);
@@ -274,7 +274,7 @@ class KernelDefBuilder {
KernelDefBuilder& Alias(int input_index, int output_index);
/**
Apply variadic number of alias mapping from inputs to outputs.
Apply variadic number of alias mapping from inputs to outputs.
This is effectively applying Alias(i + input_offset, i + output_offset) for i >= 0
*/
KernelDefBuilder& VariadicAlias(int input_offset, int output_offset);
@@ -290,7 +290,7 @@
}
/**
Specify that this kernel's output buffers are passed from external,
Specify that this kernel's output buffers are passed from external,
i.e. not created or managed by ORT's memory allocator.
*/
KernelDefBuilder& ExternalOutputs() {

View file

@@ -37,7 +37,7 @@ class KernelRegistry {
// TODO(Task:132) Make usage of unique_ptr/shared_ptr as out param consistent
Status TryCreateKernel(const Node& node, const IExecutionProvider& execution_provider,
const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
const OrtValueNameIdxMap& mlvalue_name_idx_map, const FuncManager& funcs_mgr,
const OrtValueNameIdxMap& mlvalue_name_idx_map, FuncManager& funcs_mgr,
const DataTransferManager& data_transfer_mgr,
std::unique_ptr<OpKernel>& op_kernel) const;

View file

@@ -125,9 +125,9 @@ class OpKernel {
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OpKernel);
std::unique_ptr<OpKernelInfo> op_kernel_info_;
};
using KernelCreateFn = std::function<OpKernel*(const OpKernelInfo& info)>;
using KernelCreatePtrFn = std::add_pointer<OpKernel*(const OpKernelInfo& info)>::type;
class FuncManager;
using KernelCreateFn = std::function<Status(FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out)>;
using KernelCreatePtrFn = std::add_pointer<Status(FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out)>::type;
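For illustration, a hedged sketch (hypothetical MyKernel class, not from this commit) of a create function written against the new signature: it returns a Status and hands ownership out through the unique_ptr out-parameter instead of returning a raw OpKernel*, mirroring the registration macros below.

// Hypothetical lambda matching the new KernelCreateFn shape.
KernelCreateFn create_fn = [](FuncManager& /*func_mgr*/, const OpKernelInfo& info,
                              std::unique_ptr<OpKernel>& out) -> Status {
  out = std::make_unique<MyKernel>(info);  // MyKernel derives from OpKernel
  return Status::OK();
};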
struct KernelCreateInfo {
std::unique_ptr<KernelDef> kernel_def; // Owned and stored in the global kernel registry.
@@ -197,7 +197,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)();
.SinceVersion(ver) \
.Provider(provider) \
.Build(), \
static_cast<KernelCreatePtrFn>([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \
static_cast<KernelCreatePtrFn>([](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \
}
#define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(provider, domain, startver, endver, name) \
@@ -220,7 +220,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)();
.SinceVersion(startver, endver) \
.Provider(provider) \
.Build(), \
static_cast<KernelCreatePtrFn>([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \
static_cast<KernelCreatePtrFn>([](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \
}
#define ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(provider, domain, ver, type, name) \
@@ -246,7 +246,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)();
.SinceVersion(ver) \
.Provider(provider) \
.Build(), \
static_cast<KernelCreatePtrFn>([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \
static_cast<KernelCreatePtrFn>([](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \
}
#define ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(provider, domain, ver, type1, type2, name) \
@@ -263,7 +263,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)();
.SinceVersion(ver) \
.Provider(provider) \
.Build(), \
static_cast<KernelCreatePtrFn>([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \
static_cast<KernelCreatePtrFn>([](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \
}
#define ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(provider, domain, startver, endver, type, name) \
@@ -293,7 +293,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)();
.SinceVersion(startver, endver) \
.Provider(provider) \
.Build(), \
static_cast<KernelCreatePtrFn>([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \
static_cast<KernelCreatePtrFn>([](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \
}
#define ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(provider, domain, startver, endver, type1, type2, name) \
@@ -312,7 +312,7 @@ using BuildKernelCreateInfoFn = KernelCreateInfo (*)();
.SinceVersion(startver, endver) \
.Provider(provider) \
.Build(), \
static_cast<KernelCreatePtrFn>([](const OpKernelInfo& info) -> OpKernel* { return new __VA_ARGS__(info); })); \
static_cast<KernelCreatePtrFn>([](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status { out = std::make_unique<__VA_ARGS__>(info); return Status::OK(); })); \
}
template <typename... Types>

View file

@@ -26,7 +26,6 @@ class OpKernelInfo : public OpNodeProtoHelper<ProtoHelperNodeContext> {
const IExecutionProvider& execution_provider,
const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
const OrtValueNameIdxMap& mlvalue_name_idx_map,
const FuncManager& funcs_mgr,
const DataTransferManager& data_transfer_mgr);
OpKernelInfo(const OpKernelInfo& other);
@@ -45,8 +44,6 @@ class OpKernelInfo : public OpNodeProtoHelper<ProtoHelperNodeContext> {
bool TryGetConstantInput(int input_index, const Tensor** constant_input_value) const;
common::Status GetFusedFuncs(NodeComputeInfo*& compute_info) const;
private:
ORT_DISALLOW_MOVE(OpKernelInfo);
ORT_DISALLOW_ASSIGNMENT(OpKernelInfo);
@@ -58,7 +55,6 @@ class OpKernelInfo : public OpNodeProtoHelper<ProtoHelperNodeContext> {
gsl::not_null<const ::onnxruntime::IExecutionProvider*> execution_provider_;
const std::unordered_map<int, OrtValue>& constant_initialized_tensors_;
const OrtValueNameIdxMap& ort_value_name_idx_map_;
const FuncManager& funcs_mgr_;
const DataTransferManager& data_transfer_mgr_;
ProtoHelperNodeContext proto_helper_context_;
};

View file

@@ -11,6 +11,7 @@
#include <functional>
#include "core/common/basic_types.h"
#include "core/common/status.h"
namespace ONNX_NAMESPACE {
class ValueInfoProto;
@@ -44,6 +45,6 @@ using IOnnxRuntimeOpSchemaCollectionPtr = std::shared_ptr<IOnnxRuntimeOpSchemaCo
class OpKernel;
class OpKernelInfo;
using KernelCreateFn = std::function<OpKernel*(const OpKernelInfo& info)>;
class FuncManager;
using KernelCreateFn = std::function<onnxruntime::common::Status(FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out)>;
} // namespace onnxruntime

View file

@@ -401,8 +401,8 @@ class Node {
}
/** Sets initialized function body for node. This is called right after function body initialization for a node.
* or during function inlining when a nested function is encountered.
*/
* or during function inlining when a nested function is encountered.
*/
void SetFunctionBody(Function& func);
/** Call the provided function for all explicit inputs, implicit inputs, and outputs of this Node.
@@ -500,15 +500,14 @@ class Node {
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Relationships);
};
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Node);
// NOTE: This friendship relationship should ONLY be used for calling methods of the Node class and not accessing
// the data members directly, so that the Node can maintain its internal invariants.
friend class Graph;
Node(NodeIndex index, Graph& graph) : index_(index), graph_(&graph) {}
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Node);
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
void Init(const std::string& name,
const std::string& op_type,
@@ -648,7 +647,7 @@ class Graph {
/** Check if a given name is a sparse initializer's name in the model
* we currently convert sparse_initializer field in the model into dense Tensor instances.
* However, we sometimes want to check if this initializer was stored as sparse in the model.
*/
*/
bool IsSparseInitializer(const std::string& name) const;
#endif
@@ -978,7 +977,7 @@ class Graph {
@remarks As a new Graph instance for the fused nodes is not created, a GraphViewer can be constructed with the
IndexedSubGraph information to provide a view of the subgraph. The original nodes are left in place
while this is in use.
Call FinalizeFuseSubGraph to remove them once the fused replacement node is fully created.
Call FinalizeFuseSubGraph to remove them once the fused replacement node is fully created.
*/
Node& BeginFuseSubGraph(const IndexedSubGraph& sub_graph, const std::string& fused_node_name);
@@ -1200,9 +1199,6 @@ class Graph {
}
#endif
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Graph);
// This friendship relationship should only be used to call Graph::Graph and
// Graph::LoadGraph All other access should be via the public API.
friend class Model;
@@ -1243,6 +1239,8 @@
const std::vector<const ONNX_NAMESPACE::FunctionProto*>& model_functions,
const logging::Logger& logger);
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Graph);
void InitializeStateFromModelFileGraphProto();
// Add node with specified <node_proto>.
@@ -1530,16 +1528,16 @@ std::ostream& operator<<(std::ostream& out, const NodeArg& node_arg);
// Print Node as,
// (operator's name, operator's type, domain, version) : (input0, input1, ...) -> (output0, output1, ...)
// For example,
// ("Add_14", Add, "", 7) : ("110": tensor(float),"109": tensor(float),) -> ("111": tensor(float),)
// ("Add_14", Add, "", 7) : ("110": tensor(float),"109": tensor(float),) -> ("111": tensor(float),)
std::ostream& operator<<(std::ostream& out, const Node& node);
// Print Graph as, for example,
// Inputs:
// "Input": tensor(float)
// Nodes:
// ("add0", Add, "", 7) : ("Input": tensor(float),"Bias": tensor(float),) -> ("add0_out": tensor(float),)
// ("matmul", MatMul, "", 9) : ("add0_out": tensor(float),"matmul_weight": tensor(float),) -> ("matmul_out": tensor(float),)
// ("add1", Add, "", 7) : ("matmul_out": tensor(float),"add_weight": tensor(float),) -> ("add1_out": tensor(float),)
// ("reshape", Reshape, "", 5) : ("add1_out": tensor(float),"concat_out": tensor(int64),) -> ("Result": tensor(float),)
// ("add0", Add, "", 7) : ("Input": tensor(float),"Bias": tensor(float),) -> ("add0_out": tensor(float),)
// ("matmul", MatMul, "", 9) : ("add0_out": tensor(float),"matmul_weight": tensor(float),) -> ("matmul_out": tensor(float),)
// ("add1", Add, "", 7) : ("matmul_out": tensor(float),"add_weight": tensor(float),) -> ("add1_out": tensor(float),)
// ("reshape", Reshape, "", 5) : ("add1_out": tensor(float),"concat_out": tensor(int64),) -> ("Result": tensor(float),)
// Outputs:
// "Result": tensor(float)
// Inputs' and outputs' format is described in document of NodeArg's operator<< above.

View file

@@ -106,12 +106,12 @@ class NodeArg {
Optional inputs are allowed in ONNX and an empty #Name represents a non-existent input argument. */
bool Exists() const noexcept;
private:
ORT_DISALLOW_COPY_AND_ASSIGNMENT(NodeArg);
friend class Graph;
NodeArg(NodeArgInfo&& node_arg_info);
private:
ORT_DISALLOW_COPY_AND_ASSIGNMENT(NodeArg);
#if !defined(ORT_MINIMAL_BUILD)
void SetType(const std::string* p_type);
void SetType(const ONNX_NAMESPACE::TypeProto& type_proto);

View file

@@ -786,7 +786,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
// unique_ptr. The explicit deleter avoids the Eigen-specific
// definition of ThreadPoolParallelSection needing to be available in
// threadpool.h where the user-facing parallel section API is defined.
GSL_SUPPRESS(r.11)
std::unique_ptr<ThreadPoolParallelSection, void(*)(ThreadPoolParallelSection*)> AllocateParallelSection() override {
return std::unique_ptr<ThreadPoolParallelSection, void(*)(ThreadPoolParallelSection*)>
(new ThreadPoolParallelSection,

View file

@@ -58,31 +58,31 @@ class Environment {
/**
* Registers an allocator for sharing between multiple sessions.
* Return an error if an allocator with the same OrtMemoryInfo is already registered.
*/
*/
Status RegisterAllocator(AllocatorPtr allocator);
/**
* Creates and registers an allocator for sharing between multiple sessions.
* Return an error if an allocator with the same OrtMemoryInfo is already registered.
*/
*/
Status CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, const OrtArenaCfg* arena_cfg = nullptr);
/**
* Returns the list of registered allocators in this env.
*/
*/
const std::vector<AllocatorPtr>& GetRegisteredSharedAllocators() const {
return shared_allocators_;
}
/**
* Removes registered allocator that was previously registered for sharing between multiple sessions.
*/
*/
Status UnregisterAllocator(const OrtMemoryInfo& mem_info);
Environment() = default;
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Environment);
Environment() = default;
Status Initialize(std::unique_ptr<logging::LoggingManager> logging_manager,
const OrtThreadingOptions* tp_options = nullptr,
bool create_global_thread_pools = false);

View file

@@ -129,6 +129,7 @@ extern "C" {
// Used in *.cc files. Almost as same as ORT_API_STATUS, except without ORT_MUST_USE_RESULT and ORT_EXPORT
#define ORT_API_STATUS_IMPL(NAME, ...) \
GSL_SUPPRESS(r .11) \
_Success_(return == 0) _Check_return_ _Ret_maybenull_ OrtStatusPtr ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION
#define ORT_CLASS_RELEASE(X) void(ORT_API_CALL * Release##X)(_Frees_ptr_opt_ Ort##X * input)

View file

@@ -950,8 +950,14 @@ struct CustomOpBase : OrtCustomOp {
OrtCustomOp::GetOutputType = [](const OrtCustomOp* this_, size_t index) { return static_cast<const TOp*>(this_)->GetOutputType(index); };
OrtCustomOp::KernelCompute = [](void* op_kernel, OrtKernelContext* context) { static_cast<TKernel*>(op_kernel)->Compute(context); };
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
#pragma warning(disable : 26409)
#endif
OrtCustomOp::KernelDestroy = [](void* op_kernel) { delete static_cast<TKernel*>(op_kernel); };
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
OrtCustomOp::GetInputCharacteristic = [](const OrtCustomOp* this_, size_t index) { return static_cast<const TOp*>(this_)->GetInputCharacteristic(index); };
OrtCustomOp::GetOutputCharacteristic = [](const OrtCustomOp* this_, size_t index) { return static_cast<const TOp*>(this_)->GetOutputCharacteristic(index); };
}

View file

@@ -67,7 +67,7 @@ class Gelu : public OpKernel {
concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
int64_t elem_count = input->Shape().Size();
static const int64_t length_per_task = 4096; // this number comes from FastGelu.
constexpr int64_t length_per_task = 4096; // this number comes from FastGelu.
int64_t task_count = (elem_count + length_per_task - 1) / length_per_task;
concurrency::ThreadPool::TryBatchParallelFor(
tp, static_cast<int32_t>(task_count),

View file

@@ -8,8 +8,9 @@
#include <memory.h>
using onnxruntime::rnn::detail::Allocate;
//TODO: fix the warnings
// TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {

View file

@@ -12,6 +12,7 @@
#include "core/framework/allocator.h"
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {

View file

@@ -16,6 +16,7 @@
#endif
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
using namespace onnxruntime::rnn::detail;

View file

@@ -12,6 +12,7 @@
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {

View file

@@ -24,6 +24,7 @@ limitations under the License.
#include "core/providers/cpu/object_detection/roialign.h"
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
using namespace onnxruntime::concurrency;

View file

@@ -68,7 +68,7 @@ Status ReorderInput::Compute(OpKernelContext* context) const {
if (channels_last_) {
int64_t work_index = static_cast<int64_t>(work.start);
int64_t work_remaining = static_cast<int64_t>(work.end - work.start);
int64_t work_remaining = static_cast<int64_t>(work.end) - work.start;
while (work_remaining > 0) {
const int64_t batch_index = work_index / spatial_size;
@@ -87,7 +87,7 @@
}
} else {
int64_t work_index = static_cast<int64_t>(work.start) * nchwc_block_size;
int64_t work_remaining = static_cast<int64_t>(work.end - work.start) * nchwc_block_size;
int64_t work_remaining = (static_cast<int64_t>(work.end) - work.start) * nchwc_block_size;
while (work_remaining > 0) {
const int64_t batch_index = work_index / nchwc_channels;
@@ -331,7 +331,7 @@ Status NchwcUpsample::Compute(OpKernelContext* context) const {
auto upsample_worker = [&](ptrdiff_t batch) {
auto work = concurrency::ThreadPool::PartitionWork(batch, worker_count, total_work);
int64_t work_index = static_cast<int64_t>(work.start);
int64_t work_remaining = static_cast<int64_t>(work.end - work.start);
int64_t work_remaining = static_cast<int64_t>(work.end) - work.start;
while (work_remaining > 0) {
// Limit the current loop iteration to the same source image.

View file

@@ -119,7 +119,7 @@ struct QLinearPoolNhwc1DTask final {
int64_t batch = begin / y_image_size;
int64_t offset = begin % y_image_size;
for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
for (std::ptrdiff_t remains = end - begin; remains > 0; offset = 0, batch++) {
if (offset + remains <= y_image_size) {
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
remains = 0;
@@ -247,7 +247,7 @@ struct QLinearPoolNhwc2DTask final {
int64_t batch = begin / y_image_size;
int64_t offset = begin % y_image_size;
for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
for (int64_t remains = static_cast<int64_t>(end) - begin; remains > 0; offset = 0, batch++) {
if (offset + remains <= y_image_size) {
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
remains = 0;
@@ -268,7 +268,7 @@ struct QLinearPoolNhwc2DTask final {
start_pw -= (start_ph * pooled_width);
int64_t pool_index = channels * begin;
int64_t remains = end - begin;
std::ptrdiff_t remains = end - begin;
std::vector<float> Yh(channels);
for (int64_t ph = start_ph; remains > 0 && ph < pooled_height; ++ph) {
@@ -281,7 +281,7 @@ struct QLinearPoolNhwc2DTask final {
wstart = std::max(wstart, static_cast<int64_t>(0));
// do the pooling here
float pool_init_value = PoolType::Initialize();
constexpr float pool_init_value = PoolType::Initialize();
std::fill(Yh.data(), Yh.data() + channels, pool_init_value);
for (int64_t h = hstart; h < hend; ++h) {
int64_t input_index = channels * (h * width + wstart);
@@ -415,7 +415,7 @@ struct QLinearPoolNhwc3DTask final {
int64_t batch = begin / y_image_size;
int64_t offset = begin % y_image_size;
for (int64_t remains = end - begin; remains > 0; offset = 0, batch++) {
for (int64_t remains = static_cast<int64_t>(end) - begin; remains > 0; offset = 0, batch++) {
if (offset + remains <= y_image_size) {
operator()(std::ptrdiff_t(batch), std::ptrdiff_t(offset), std::ptrdiff_t(offset + remains));
remains = 0;
@@ -437,7 +437,7 @@ struct QLinearPoolNhwc3DTask final {
int64_t start_pw = start_pd / pooled_depth;
start_pd = start_pd - start_pw * pooled_depth;
int64_t pool_index = channels * begin;
int64_t remains = end - begin;
int64_t remains = static_cast<int64_t>(end) - begin;
std::vector<float> Yh(channels);

View file

@@ -93,7 +93,7 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) {
re2::RE2::Options options;
options.set_longest_match(true);
for (const auto& sep : separators) {
std::unique_ptr<re2::RE2> regex(new re2::RE2(sep, options));
std::unique_ptr<re2::RE2> regex = std::make_unique<re2::RE2>(sep, options);
if (!regex->ok()) {
ORT_THROW("Can not digest separators: ", sep, " ", regex->error());
}
@@ -104,7 +104,7 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) {
assert(!tokenexp.empty());
re2::RE2::Options options;
options.set_longest_match(true);
std::unique_ptr<re2::RE2> regex(new re2::RE2(tokenexp, options));
std::unique_ptr<re2::RE2> regex = std::make_unique<re2::RE2>(tokenexp, options);
if (!regex->ok()) {
ORT_THROW("Can not digest tokenexp: ", regex->error());
}

View file

@@ -30,6 +30,8 @@
#ifdef _MSC_VER
#pragma warning(pop)
// Could reduce the chance of arithmetic overflow. TODO: fix it
#pragma warning(disable : 26451)
#endif
using namespace ONNX_NAMESPACE;
@@ -431,10 +433,10 @@ Status BeamSearchImpl<T>::ProcessLogits(
Tensor::InitOrtValue(element_type, next_token_scores_shape, next_token_scores.data(), allocator->Info(), next_token_scores_value);
const Tensor& input = next_token_scores_value.Get<Tensor>();
const int axis = 1;
constexpr int axis = 1;
const unsigned top_k = static_cast<unsigned>(2 * parameters_->num_beams);
const bool largest = true;
const bool sorted = true; // results returned in sorted order.
constexpr bool largest = true;
constexpr bool sorted = true; // results returned in sorted order.
std::unique_ptr<Tensor> topk_scores;
std::unique_ptr<Tensor> topk_indices;

View file

@@ -10,7 +10,10 @@
#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/cpu/rnn/rnn_helpers.h"
#include "beam_search_scorer.h"
#ifdef _MSC_VER
// Could reduce the chance of arithmetic overflow. TODO: fix it
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {
namespace contrib {
namespace transformers {
@@ -126,7 +129,7 @@ void BeamSearchScorer<T>::Initialize(AllocatorPtr& allocator, int sequence_lengt
ORT_ENFORCE(next_beam_scores_.empty()); // Make sure this is called only once.
size_t batch_beam_size = static_cast<size_t>(batch_size_ * num_beams_);
const bool no_fill = false; // do not fill values after allocation
constexpr bool no_fill = false; // do not fill values after allocation
next_beam_scores_ = Allocate<T>(allocator, batch_beam_size, next_beam_scores_ptr_, no_fill);
next_beam_tokens_ = Allocate<int64_t>(allocator, batch_beam_size, next_beam_tokens_ptr_, no_fill);
next_beam_indices_ = Allocate<int64_t>(allocator, batch_beam_size, next_beam_indices_ptr_, no_fill);

View file

@@ -21,7 +21,10 @@
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#ifdef _MSC_VER
// Could reduce the chance of arithmetic overflow. TODO: fix it
#pragma warning(disable : 26451)
#endif
using namespace ONNX_NAMESPACE;
using namespace onnxruntime::common;

View file

@@ -1,7 +1,10 @@
#include <assert.h>
#include "logits_processor.h"
#include "dump_tensor.h"
#ifdef _MSC_VER
// Could reduce the chance of arithmetic overflow. TODO: fix it
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {
namespace contrib {
namespace transformers {

View file

@@ -1,5 +1,8 @@
#include "sequences.h"
#ifdef _MSC_VER
// Could reduce the chance of arithmetic overflow. TODO: fix it
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {
namespace contrib {
namespace transformers {

View file

@@ -36,7 +36,10 @@ limitations under the License.
#include <sched.h>
#endif
#endif
#if defined(_MSC_VER) && !defined(__clang__)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {
namespace concurrency {
@@ -66,7 +69,7 @@ void ThreadPoolProfiler::Start() {
ThreadPoolProfiler::MainThreadStat& ThreadPoolProfiler::GetMainThreadStat() {
static thread_local std::unique_ptr<MainThreadStat> stat;
if (!stat) {
stat.reset(new MainThreadStat());
stat = std::make_unique<MainThreadStat>();
}
return *stat;
}
@@ -336,8 +339,8 @@ class alignas(CACHE_LINE_BYTES) LoopCounter {
// Hence, at low thread counts, each of N threads will get its own
// shard representing 1/N of the work.
constexpr static unsigned GetNumShards(uint64_t num_iterations,
uint64_t d_of_p,
uint64_t block_size) {
uint64_t d_of_p,
uint64_t block_size) {
unsigned num_shards = 0;
auto num_blocks = num_iterations / block_size;
if (num_blocks == 0) {
@@ -376,10 +379,10 @@ ThreadPool::ThreadPool(Env* env,
int threads_to_create = degree_of_parallelism - 1;
extended_eigen_threadpool_ =
std::make_unique<ThreadPoolTempl<Env> >(name,
threads_to_create,
low_latency_hint,
*env,
thread_options_);
threads_to_create,
low_latency_hint,
*env,
thread_options_);
underlying_threadpool_ = extended_eigen_threadpool_.get();
}
}

View file

@@ -110,7 +110,10 @@ void CPUAllocator::Free(void* p) {
} // namespace onnxruntime
std::ostream& operator<<(std::ostream& out, const OrtMemoryInfo& info) { return (out << info.ToString()); }
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
#pragma warning(disable : 26409)
#endif
ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtAllocatorType type, int id1,
enum OrtMemType mem_type1, _Outptr_ OrtMemoryInfo** out) {
if (strcmp(name1, onnxruntime::CPU) == 0) {
@@ -126,7 +129,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
} else if (strcmp(name1, onnxruntime::OpenVINO_GPU) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::OpenVINO_GPU, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)),
id1, mem_type1);
id1, mem_type1);
} else if (strcmp(name1, onnxruntime::DML) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)),
@@ -138,7 +141,9 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
}
ORT_API(void, OrtApis::ReleaseMemoryInfo, _Frees_ptr_opt_ OrtMemoryInfo* p) { delete p; }
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
ORT_API_STATUS_IMPL(OrtApis::MemoryInfoGetName, _In_ const OrtMemoryInfo* ptr, _Out_ const char** out) {
*out = ptr->name;
return nullptr;
@@ -164,4 +169,3 @@ ORT_API_STATUS_IMPL(OrtApis::CompareMemoryInfo, _In_ const OrtMemoryInfo* info1,
*out = (*info1 == *info2) ? 0 : -1;
return nullptr;
}

View file

@@ -164,8 +164,19 @@ Status BFCArena::Extend(size_t rounded_bytes) {
static constexpr float kBackpedalFactor = 0.9f;
// Try allocating less memory.
while (mem_addr == nullptr) {
// kBackpedalFactor is float and bytes is size_t. The result of bytes * kBackpedalFactor is float. When we cast it to
// size_t, which is a smaller type, it could lose data; this is what C4244 complains about. The "static_cast<size_t>"
// here is to suppress that warning. C26451 suggests changing kBackpedalFactor to double for better accuracy, but if
// we did that, the AMD GPU CI build pipeline would hit an "out-of-memory" error. So I chose to keep this piece of
// code untouched and disable the warning first.
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
#pragma warning(disable : 26451)
#endif
bytes = RoundedBytes(static_cast<size_t>(bytes * kBackpedalFactor));
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
// give up if we can't satisfy the requested size, or we're attempting an allocation of less than 8K.
//
// the latter protects against an infinite loop that occurs when bytes is less than 2560. at that point the 10%
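To make the retry strategy above concrete, a standalone sketch (try_alloc and rounded_bytes are hypothetical stand-ins for the arena's internals, not the BFCArena code itself): each failed attempt shrinks the request to 90% of its previous size until the allocation succeeds or the request falls below the 8K floor.

size_t bytes = rounded_bytes(requested);
void* mem_addr = try_alloc(bytes);
while (mem_addr == nullptr && bytes >= 8 * 1024) {
  bytes = rounded_bytes(static_cast<size_t>(bytes * 0.9f));  // back off 10%
  mem_addr = try_alloc(bytes);
}
// mem_addr may still be nullptr here if the floor was reached; give up then.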

View file

@@ -211,7 +211,7 @@ class BFCArena : public IAllocator {
ORT_ENFORCE(0 == memory_size % kMinAllocationSize);
const size_t n_handles =
(memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
handles_ = new ChunkHandle[n_handles];
handles_ = std::make_unique<ChunkHandle[]>(n_handles);
for (size_t i = 0; i < n_handles; i++) {
handles_[i] = kInvalidChunkHandle;
}
@@ -219,7 +219,7 @@ class BFCArena : public IAllocator {
AllocationRegion() = default;
~AllocationRegion() { delete[] handles_; }
~AllocationRegion() = default;
AllocationRegion(AllocationRegion&& other) noexcept { Swap(other); }
@@ -266,7 +266,7 @@ class BFCArena : public IAllocator {
// Array of size "memory_size / kMinAllocationSize". It is
// indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
// for the memory allocation represented by "p"
ChunkHandle* handles_ = nullptr;
std::unique_ptr<ChunkHandle[]> handles_;
ORT_DISALLOW_ASSIGNMENT(AllocationRegion);
};

View file

@@ -2,7 +2,9 @@
// Licensed under the MIT License.
#include "core/framework/callback.h"
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
namespace onnxruntime {
void OrtRunCallback(OrtCallback* f) noexcept {
if (f == nullptr) return;

View file

@@ -314,7 +314,10 @@ struct TensorTypeBase::Impl : public data_types_internal::TypeProtoImpl {
const ONNX_NAMESPACE::TypeProto* TensorTypeBase::GetTypeProto() const {
return impl_->GetProto();
}
//TODO: Fix the warning
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
TensorTypeBase::TensorTypeBase()
: DataTypeImpl{DataTypeImpl::GeneralType::kTensor, sizeof(Tensor)},
impl_(new Impl()) {}

View file

@@ -1,6 +1,7 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/common/gsl_suppress.h"
#include "core/session/onnxruntime_c_api.h"
#include "core/session/ort_apis.h"
#include "core/common/status.h"
@@ -66,5 +67,7 @@ ORT_API(OrtErrorCode, OrtApis::GetErrorCode, _In_ const OrtStatus* status) {
ORT_API(const char*, OrtApis::GetErrorMessage, _In_ const OrtStatus* status) {
return status->msg;
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
ORT_API(void, OrtApis::ReleaseStatus, _Frees_ptr_opt_ OrtStatus* value) { delete[] reinterpret_cast<uint8_t*>(value); }

View file

@@ -13,21 +13,29 @@ void release_helper_func(void* allocator, void* p);
//A kernel that wraps the ComputeFunction call generated by the execution provider when fusing the sub-graph
class FunctionKernel : public OpKernel {
public:
explicit FunctionKernel(const OpKernelInfo& info, const NodeComputeInfo* compute) : OpKernel(info), compute_info_(compute) {}
//The original design was to load the DLL, find the entry point and wrap it.
//Here, for a quick prototype, we keep the entry pointer in the node.
explicit FunctionKernel(const OpKernelInfo& info) : OpKernel(info) {
num_inputs_ = info.node().InputDefs().size();
num_outputs_ = info.node().OutputDefs().size();
auto status = info.GetFusedFuncs(compute_info_);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
if (compute_info_->create_state_func) {
static Status Create(FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) {
const NodeComputeInfo* compute;
ORT_RETURN_IF_ERROR(func_mgr.GetFuncs(info.node().Name(), compute));
std::unique_ptr<FunctionKernel> funckernel = std::make_unique<FunctionKernel>(info, compute);
funckernel->num_inputs_ = info.node().InputDefs().size();
funckernel->num_outputs_ = info.node().OutputDefs().size();
if (compute->create_state_func) {
//TODO: we only provide the host allocate method in the compute context.
//Do we need to hold the ref-counting here?
host_allocator_ = info.GetAllocator(0, OrtMemType::OrtMemTypeDefault);
ComputeContext context = {allocate_helper_func, release_helper_func, host_allocator_.get(),
funckernel->host_allocator_ = info.GetAllocator(0, OrtMemType::OrtMemTypeDefault);
ComputeContext context = {allocate_helper_func, release_helper_func, funckernel->host_allocator_.get(),
info.node().Name().c_str()};
ORT_ENFORCE(compute_info_->create_state_func(&context, &func_state_) == 0);
int ret = funckernel->compute_info_->create_state_func(&context, &funckernel->func_state_);
if (ret != 0)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Create state function failed. Return value:", ret);
}
out = std::move(funckernel);
return Status::OK();
}
~FunctionKernel() override {
@@ -38,12 +46,14 @@ class FunctionKernel : public OpKernel {
virtual Status Compute(OpKernelContext* context) const override {
auto* context_internal = static_cast<OpKernelContextInternal*>(context);
return compute_info_->compute_func(func_state_, OrtGetApiBase()->GetApi(ORT_API_VERSION),
const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
if (api == nullptr) return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "API VERSION ", ORT_API_VERSION, " is invalid.");
return compute_info_->compute_func(func_state_, api,
reinterpret_cast<OrtKernelContext*>(context_internal));
}
private:
NodeComputeInfo* compute_info_{nullptr};
const NodeComputeInfo* const compute_info_;
FunctionState func_state_{nullptr};
size_t num_inputs_;
size_t num_outputs_;

View file

@@ -22,7 +22,7 @@ Status FuncManager::AddFuncInfo(const std::string& name, NodeComputeInfo&& compu
return Status::OK();
}
Status FuncManager::GetFuncs(const std::string& name, NodeComputeInfo*& compute_info) const {
Status FuncManager::GetFuncs(const std::string& name, const NodeComputeInfo*& compute_info) {
auto it = fused_funcs_->find(name);
if (it == fused_funcs_->end())
return Status(common::ONNXRUNTIME, common::FAIL, "func info for node: " + name + " not found.");
@@ -30,7 +30,7 @@ Status FuncManager::GetFuncs(const std::string& name, NodeComputeInfo*& compute_
if (!it->second.compute_info.compute_func) {
//load from path
void* handle = nullptr;
ORT_RETURN_IF_ERROR(lib_loader_->LoadExternalLib(it->second.dso_path, &handle));
ORT_RETURN_IF_ERROR(lib_loader_.LoadExternalLib(it->second.dso_path, &handle));
void* create_func_symbol_handle = nullptr;
ORT_RETURN_IF_ERROR(Env::Default().GetSymbolFromLibrary(handle,
kCreateStateFuncSymbol + name,

View file

@@ -8,15 +8,15 @@ namespace onnxruntime {
class FuncManager {
public:
FuncManager()
: fused_funcs_(std::make_shared<std::unordered_map<std::string, FuncInfo> >()),
lib_loader_(std::make_unique<ExLibLoader>()) {
: fused_funcs_(std::make_shared<std::unordered_map<std::string, FuncInfo> >()) {
}
Status AddFuncInfo(const std::string& name, const std::string& dll_path);
Status AddFuncInfo(const std::string& name, NodeComputeInfo&& compute_info);
Status GetFuncs(const std::string& name, NodeComputeInfo*& compute_info) const;
//Do not call AddFuncInfo after this function is called.
Status GetFuncs(const std::string& name, const NodeComputeInfo*& compute_info);
size_t NumFuncs() const { return fused_funcs_->size(); }
@ -38,7 +38,7 @@ class FuncManager {
// because it's filled in by the time main graph is traversed,
// while subgraph session state is created later
std::shared_ptr<std::unordered_map<std::string, FuncInfo> > fused_funcs_;
std::unique_ptr<ExLibLoader> lib_loader_;
ExLibLoader lib_loader_;
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(FuncManager);
};
} // namespace onnxruntime
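GetFuncs loses its const qualifier because a cache miss now triggers LoadExternalLib on the inline lib_loader_, i.e. lazy loading mutates the manager's state; this is also why the header warns not to call AddFuncInfo afterwards, since the returned pointer refers into the map. A compact analogue of the idea (LazyRegistry and Load are illustrative names, not ORT code):

#include <string>
#include <unordered_map>

class LazyRegistry {
 public:
  // Non-const on purpose: the first lookup may populate the cache.
  const std::string* Get(const std::string& key) {
    auto it = cache_.find(key);
    if (it == cache_.end())
      it = cache_.emplace(key, Load(key)).first;  // mutation => cannot be const
    return &it->second;  // pointer into the map; valid only as long as the entry is
  }

 private:
  static std::string Load(const std::string& key) { return "loaded:" + key; }
  std::unordered_map<std::string, std::string> cache_;
};

int main() {
  LazyRegistry r;
  return r.Get("f")->empty() ? 1 : 0;
}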


@ -258,10 +258,9 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, bool export_dll, FuncMa
KernelDefBuilder builder;
BuildFusedKernelDef(builder, *node);
ORT_RETURN_IF_ERROR(fused_kernel_registry.Register(builder,
static_cast<KernelCreatePtrFn>(
[](const OpKernelInfo& info) -> OpKernel* {
return new FunctionKernel(info);
})));
[](FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {
return FunctionKernel::Create(func_mgr, info, out);
}));
}
} else {
@ -298,10 +297,9 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, bool export_dll, FuncMa
KernelDefBuilder builder;
BuildFusedKernelDef(builder, metadef, type);
ORT_RETURN_IF_ERROR(fused_kernel_registry.Register(builder,
static_cast<KernelCreatePtrFn>(
[](const OpKernelInfo& info) -> OpKernel* {
return new FunctionKernel(info);
})));
[](FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {
return FunctionKernel::Create(func_mgr, info, out);
}));
// now that we're done compiling we can remove the original nodes from the Graph and wire in the new one
graph.FinalizeFuseSubGraph(indexed_sub_graph, *node);
@ -477,10 +475,10 @@ static Status PartitionOrtFormatModelImpl(Graph& graph, FuncManager& func_mgr,
}
ORT_RETURN_IF_ERROR(fused_kernel_registry.Register(
KernelCreateInfo(std::move(kernel_def), static_cast<KernelCreatePtrFn>(
[](const OpKernelInfo& info) -> OpKernel* {
return new FunctionKernel(info);
}))));
KernelCreateInfo(std::move(kernel_def),
[](FuncManager& func_mgr, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {
return FunctionKernel::Create(func_mgr, info, out);
})));
// now that we're done compiling we can remove the original nodes from the Graph and wire in the new one
graph.FinalizeFuseSubGraph(indexed_sub_graph, node);


@ -206,7 +206,7 @@ Status KernelRegistry::TryCreateKernel(const Node& node,
const IExecutionProvider& execution_provider,
const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
const OrtValueNameIdxMap& ort_value_name_idx_map,
const FuncManager& funcs_mgr,
FuncManager& funcs_mgr,
const DataTransferManager& data_transfer_mgr,
/*out*/ std::unique_ptr<OpKernel>& op_kernel) const {
const KernelCreateInfo* kernel_create_info = nullptr;
@ -216,10 +216,8 @@ Status KernelRegistry::TryCreateKernel(const Node& node,
execution_provider,
constant_initialized_tensors,
ort_value_name_idx_map,
funcs_mgr,
data_transfer_mgr);
op_kernel.reset(kernel_create_info->kernel_create_func(kernel_info));
return Status::OK();
return kernel_create_info->kernel_create_func(funcs_mgr, kernel_info, op_kernel);
}
static std::string ToString(const std::vector<std::string>& error_strs) {


@ -13,18 +13,17 @@
using namespace ONNX_NAMESPACE;
using namespace ::onnxruntime::common;
namespace onnxruntime {
std::unique_ptr<OpKernel> KernelRegistryManager::CreateKernel(const onnxruntime::Node& node,
const IExecutionProvider& execution_provider,
const SessionState& session_state,
const KernelCreateInfo& kernel_create_info) const {
Status KernelRegistryManager::CreateKernel(const onnxruntime::Node& node,
const IExecutionProvider& execution_provider,
SessionState& session_state,
const KernelCreateInfo& kernel_create_info,
std::unique_ptr<OpKernel>& out) const {
OpKernelInfo kernel_info(node, *kernel_create_info.kernel_def, execution_provider,
session_state.GetConstantInitializedTensors(),
session_state.GetOrtValueNameIdxMap(),
session_state.GetFuncMgr(),
session_state.GetDataTransferMgr());
// OpKernel is an abstract base class, so we can't use make_unique
return std::unique_ptr<OpKernel>(kernel_create_info.kernel_create_func(kernel_info));
return kernel_create_info.kernel_create_func(session_state.GetMutableFuncMgr(), kernel_info, out);
}
Status KernelRegistryManager::RegisterKernels(const ExecutionProviders& execution_providers) {


@ -77,10 +77,10 @@ class KernelRegistryManager {
bool SearchKernelRegistriesByHash(HashValue kernel_def_hash,
const KernelCreateInfo** kernel_create_info) const;
std::unique_ptr<OpKernel> CreateKernel(const onnxruntime::Node& node,
const IExecutionProvider& execution_provider,
const SessionState& session_state,
const KernelCreateInfo& kernel_create_info) const ORT_MUST_USE_RESULT;
Status CreateKernel(const onnxruntime::Node& node,
const IExecutionProvider& execution_provider,
SessionState& session_state,
const KernelCreateInfo& kernel_create_info, std::unique_ptr<OpKernel>& out) const;
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(KernelRegistryManager);


@ -32,7 +32,9 @@ ToONNXTensorElementDataType(ONNX_NAMESPACE::TensorProto_DataType data_type) {
default: { return ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; }
}
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
OrtStatus* OrtMapTypeInfo::FromTypeProto(const ONNX_NAMESPACE::TypeProto* type_proto, OrtMapTypeInfo** out) {
auto value_case = type_proto->value_case();
if (value_case != ONNX_NAMESPACE::TypeProto::kMapType)
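C26409 is the C++ Core Guidelines checker rule against explicit new/delete. The C API factory functions in these files intentionally hand owning raw pointers across the ABI boundary, so the commit suppresses the rule locally instead of rewriting them. A sketch of the scoped push/disable/pop form used elsewhere in this commit (Handle is a made-up stand-in for an ORT C-API object):

struct Handle { int v = 0; };  // stand-in only

#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
#pragma warning(disable : 26409)  // C++ Core Guidelines: avoid explicit new/delete
#endif

// C-style factory: ownership crosses the C ABI, so a raw pointer is the contract.
extern "C" Handle* CreateHandle() { return new Handle(); }
extern "C" void ReleaseHandle(Handle* h) { delete h; }

#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif

int main() { ReleaseHandle(CreateHandle()); }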


@ -6,21 +6,20 @@
#include "core/session/ort_apis.h"
#include "core/framework/error_code_helper.h"
OrtSequenceTypeInfo::OrtSequenceTypeInfo(OrtTypeInfo* sequence_key_type) noexcept :
sequence_key_type_(sequence_key_type, &OrtApis::ReleaseTypeInfo) {
OrtSequenceTypeInfo::OrtSequenceTypeInfo(OrtTypeInfo* sequence_key_type) noexcept : sequence_key_type_(sequence_key_type, &OrtApis::ReleaseTypeInfo) {
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
OrtStatus* OrtSequenceTypeInfo::FromTypeProto(const ONNX_NAMESPACE::TypeProto* type_proto, OrtSequenceTypeInfo** out) {
auto value_case = type_proto->value_case();
if (value_case != ONNX_NAMESPACE::TypeProto::kSequenceType)
{
return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "type_proto is not of type sequence!");;
if (value_case != ONNX_NAMESPACE::TypeProto::kSequenceType) {
return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "type_proto is not of type sequence!");
}
auto type_proto_sequence = type_proto->sequence_type();
OrtTypeInfo* sequence_key_type_info = nullptr;
if (auto status = OrtTypeInfo::FromTypeProto(&type_proto_sequence.elem_type(), &sequence_key_type_info))
{
if (auto status = OrtTypeInfo::FromTypeProto(&type_proto_sequence.elem_type(), &sequence_key_type_info)) {
return status;
}
@ -30,8 +29,7 @@ OrtStatus* OrtSequenceTypeInfo::FromTypeProto(const ONNX_NAMESPACE::TypeProto* t
OrtStatus* OrtSequenceTypeInfo::Clone(OrtSequenceTypeInfo** out) {
OrtTypeInfo* sequence_key_type_copy = nullptr;
if (auto status = sequence_key_type_->Clone(&sequence_key_type_copy))
{
if (auto status = sequence_key_type_->Clone(&sequence_key_type_copy)) {
return status;
}
*out = new OrtSequenceTypeInfo(sequence_key_type_copy);


@ -27,7 +27,9 @@ using onnxruntime::Tensor;
using onnxruntime::TensorShape;
namespace on = ONNX_NAMESPACE;
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
OrtTypeInfo::OrtTypeInfo(ONNXType type1) noexcept : type(type1) {
}


@ -4,6 +4,7 @@
#pragma once
#include <atomic>
#include <string>
#include "core/common/gsl_suppress.h"
#include "core/session/onnxruntime_c_api.h"
namespace onnxruntime {


@ -13,7 +13,6 @@ OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node,
const IExecutionProvider& execution_provider,
const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
const OrtValueNameIdxMap& ort_value_name_idx_map,
const FuncManager& funcs_mgr,
const DataTransferManager& data_transfer_mgr)
: OpNodeProtoHelper(&proto_helper_context_),
node_(node),
@ -21,13 +20,12 @@ OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node,
execution_provider_(&execution_provider),
constant_initialized_tensors_(constant_initialized_tensors),
ort_value_name_idx_map_(ort_value_name_idx_map),
funcs_mgr_(funcs_mgr),
data_transfer_mgr_(data_transfer_mgr),
proto_helper_context_(node) {}
OpKernelInfo::OpKernelInfo(const OpKernelInfo& other)
: OpKernelInfo(other.node_, other.kernel_def_, *other.execution_provider_, other.constant_initialized_tensors_,
other.ort_value_name_idx_map_, other.funcs_mgr_, other.data_transfer_mgr_) {}
other.ort_value_name_idx_map_, other.data_transfer_mgr_) {}
const OrtMemoryInfo& OpKernelInfo::GetMemoryInfo(int device_id, OrtMemType mem_type) const {
AllocatorPtr alloc = GetAllocator(device_id, mem_type);
@ -79,7 +77,4 @@ bool OpKernelInfo::TryGetConstantInput(int input_index, const Tensor** constant_
return true;
}
common::Status OpKernelInfo::GetFusedFuncs(NodeComputeInfo*& compute_info) const {
return funcs_mgr_.GetFuncs(node_.Name(), compute_info);
}
} // namespace onnxruntime


@ -5,7 +5,9 @@
#include "core/session/onnxruntime_c_api.h"
#include "core/session/ort_apis.h"
#include "core/framework/error_code_helper.h"
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
ORT_API_STATUS_IMPL(OrtApis::CreateRunOptions, _Outptr_ OrtRunOptions** out) {
API_IMPL_BEGIN
*out = new OrtRunOptions();


@ -5,6 +5,7 @@
#include <string>
#include <vector>
#include "core/common/gsl_suppress.h"
#include "core/session/onnxruntime_c_api.h"
#include "core/optimizer/graph_transformer_level.h"
#include "core/util/thread_utils.h"


@ -164,7 +164,7 @@ Status SessionState::CreateKernels(const KernelRegistryManager& kernel_registry_
max_nodeid = std::max(max_nodeid, node.Index());
}
session_kernels_.clear();
session_kernels_.resize(max_nodeid + 1, nullptr);
session_kernels_.resize(max_nodeid + 1);
for (const auto& node : nodes) {
// construct and save the kernels
const KernelCreateInfo& kci = GetNodeKernelCreateInfo(node.Index());
@ -173,10 +173,8 @@ Status SessionState::CreateKernels(const KernelRegistryManager& kernel_registry_
onnxruntime::ProviderType exec_provider_name = node.GetExecutionProviderType();
const IExecutionProvider& exec_provider = *execution_providers_.Get(exec_provider_name);
auto op_kernel = kernel_registry_manager.CreateKernel(node, exec_provider, *this, kci);
// assumes vector is already resize()'ed to the number of nodes in the graph
session_kernels_[node.Index()] = op_kernel.release();
ORT_RETURN_IF_ERROR(kernel_registry_manager.CreateKernel(node, exec_provider, *this, kci, session_kernels_[node.Index()]));
}
}
node_index_info_ = std::make_unique<NodeIndexInfo>(*graph_viewer_, ort_value_name_idx_map_);


@ -109,9 +109,6 @@ class SessionState {
}
~SessionState() {
for (auto* p : session_kernels_) {
delete p;
}
for (auto& kvp : deleter_for_initialized_tensors_) {
kvp.second.f(kvp.second.param);
}
@ -124,11 +121,11 @@ class SessionState {
// Get the kernel for the specified node.
// It should be called only right before graph execution.
const OpKernel* GetKernel(size_t node_id) const {
return (node_id < session_kernels_.size()) ? session_kernels_[node_id] : nullptr;
return (node_id < session_kernels_.size()) ? session_kernels_[node_id].get() : nullptr;
}
OpKernel* GetMutableKernel(size_t node_id) {
return (node_id < session_kernels_.size()) ? session_kernels_[node_id] : nullptr;
return (node_id < session_kernels_.size()) ? session_kernels_[node_id].get() : nullptr;
}
const ExecutionProviders& GetExecutionProviders() const noexcept { return execution_providers_; }
@ -144,27 +141,27 @@ class SessionState {
const OrtValueNameIdxMap& GetOrtValueNameIdxMap() const noexcept { return ort_value_name_idx_map_; }
/**
* Adds an initialized tensor (weight) so that it can be used by the
* execution frame to set up the appropriate OrtValue vectors.
* This function will take a shallow copy of d if d is not NULL.
* If 'constant' is true, the tensor value cannot be overridden by an input at runtime.
* If 'sparse' is true, the tensor value represents a densified weight that was initially stored in the model
* as a sparse tensor.
*/
Status AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d, bool constant, bool sparse);
/**
* Gets the map of ort_value_index to initialized tensors (weights) so that it can be used by the
* execution frame to set up the appropriate OrtValue vectors.
* The lifetime of the returned OrtValues is limited by this SessionState object.
*/
const std::unordered_map<int, OrtValue>& GetInitializedTensors() const;
/**
* Gets the map of ort_value_index to initialized tensors (e.g. weights) that are constant
* and cannot be overridden at runtime.
* The lifetime of the returned OrtValues is limited by this SessionState object.
*/
const std::unordered_map<int, OrtValue>& GetConstantInitializedTensors() const;
#if !defined(DISABLE_SPARSE_TENSORS)
@ -355,9 +352,9 @@ class SessionState {
void CleanInitializedTensorsFromGraph();
/**
* Prepack the constant initialized tensors for better performance.
* The original constant initialized tensors will be removed to save memory.
*/
Status PrepackConstantInitializedTensors(std::unordered_map<std::string, size_t>& constant_initializers_use_count,
const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map);
@ -402,7 +399,7 @@ class SessionState {
std::unordered_map<std::string, HashValue> compiled_kernel_hashes_;
// cache of the constructed kernels to avoid spending construction time per executor
std::vector<OpKernel*> session_kernels_;
std::vector<std::unique_ptr<OpKernel>> session_kernels_;
Graph& graph_;
std::unique_ptr<GraphViewer> graph_viewer_; // GraphViewer for const access to Graph
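Switching session_kernels_ from std::vector<OpKernel*> to std::vector<std::unique_ptr<OpKernel>> is what lets the explicit delete loop disappear from ~SessionState: the vector's destructor now releases every kernel. In miniature (OpKernel reduced to a bare polymorphic base):

#include <memory>
#include <vector>

struct OpKernel { virtual ~OpKernel() = default; };

int main() {
  std::vector<std::unique_ptr<OpKernel>> kernels;
  kernels.resize(4);                        // elements default to nullptr; no explicit fill value needed
  kernels[0] = std::make_unique<OpKernel>();
  const OpKernel* k = kernels[0].get();     // GetKernel-style non-owning access
  return k ? 0 : 1;
}                                           // unique_ptrs clean up here; no manual delete loop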


@ -22,7 +22,7 @@ std::ostream& operator<<(std::ostream& os, SparseFormat flags) {
namespace {
// Round up size to a multiple of int64
constexpr size_t kIndexAlignment = alignof(int64_t);
inline int64_t Roundup(int64_t size) {
constexpr inline int64_t Roundup(int64_t size) {
return ((SafeInt<int64_t>(size) + kIndexAlignment - 1) / kIndexAlignment) * kIndexAlignment;
}
@ -31,7 +31,7 @@ inline int64_t Roundup(int64_t size) {
/// after data and make sure indices start at int64_t aligned place
/// </summary>
/// <returns></returns>
inline int64_t CalculateRequiredBufferSize(int64_t data_size, int64_t indices_size) {
constexpr inline int64_t CalculateRequiredBufferSize(int64_t data_size, int64_t indices_size) {
return SafeInt<int64_t>(Roundup(data_size)) + indices_size;
}
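For reference, Roundup aligns a byte count up to the next multiple of alignof(int64_t), i.e. 8: Roundup(13) = ((13 + 7) / 8) * 8 = 16, so a buffer holding 13 bytes of data plus indices reserves 16 + indices_size bytes and the indices start on an int64_t boundary. A standalone check of the arithmetic (without the SafeInt overflow guard the real code uses):

#include <cassert>
#include <cstdint>

constexpr int64_t kIndexAlignment = alignof(int64_t);  // 8

constexpr int64_t Roundup(int64_t size) {
  return ((size + kIndexAlignment - 1) / kIndexAlignment) * kIndexAlignment;
}

int main() {
  static_assert(Roundup(13) == 16, "13 rounds up to 16");
  static_assert(Roundup(16) == 16, "already aligned");
  assert(Roundup(1) == kIndexAlignment);
  return 0;
}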


@ -24,7 +24,9 @@ using onnxruntime::MLFloat16;
using onnxruntime::SparseTensor;
#endif
using onnxruntime::Tensor;
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
ORT_API_STATUS_IMPL(OrtApis::CreateTensorTypeAndShapeInfo, _Outptr_ OrtTensorTypeAndShapeInfo** out) {
API_IMPL_BEGIN
*out = new OrtTensorTypeAndShapeInfo();
@ -317,10 +319,10 @@ ORT_API_STATUS_IMPL(OrtApis::GetValueType, _In_ const OrtValue* v, _Out_ ONNXTyp
}
/**
* Get the type information of an OrtValue
* \param value
* \return The returned value should be freed by OrtReleaseTypeInfo after use
*/
ORT_API_STATUS_IMPL(OrtApis::GetTypeInfo, _In_ const OrtValue* v, _Outptr_result_maybenull_ struct OrtTypeInfo** out) {
API_IMPL_BEGIN
// TODO: This is consistent with the previous implementation but inconsistent with GetValueType which returns


@ -533,7 +533,9 @@ ORT_API(void, OrtUninitializeBuffer, _In_opt_ void* input, size_t input_len, enu
ptr[i].~string();
}
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
class AutoDelete {
public:
OrtCallback d{nullptr, nullptr};
@ -594,7 +596,7 @@ static Status GetFileContent(
* @param tensor_proto tensor data in protobuf format
* @param tensorp pre-allocated tensor object, where we store the data
* @return
*/
Status TensorProtoToTensor(const Env& env, const ORTCHAR_T* model_path,
const ONNX_NAMESPACE::TensorProto& tensor_proto,
Tensor& tensor) {
@ -830,7 +832,7 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n
break;
}
#else
ORT_UNUSED_PARAMETER(model_path);
#endif
default:
ORT_THROW("Unsupported attribute value type of ", constant_attribute.type(),
@ -1098,7 +1100,6 @@ inline void CopyElement<uint8_t>(void* dst, const void* src, int64_t dst_index,
reinterpret_cast<uint8_t*>(dst)[dst_index] = reinterpret_cast<const uint8_t*>(src)[src_index];
}
template <typename T>
static void SetIndices(gsl::span<int64_t> gathered_indices,
std::string& raw_indices,
@ -1109,7 +1110,8 @@ static void SetIndices(gsl::span<int64_t> gathered_indices,
for (auto src_index : gathered_indices) {
ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) {
ind_dest[dest_index] = static_cast<T>(src_index);
} else {
}
else {
auto* dst = ind_dest + dest_index;
T v = static_cast<T>(src_index);
memcpy(dst, &v, sizeof(T));


@ -563,14 +563,14 @@ static common::Status ExecuteGraphImpl(const SessionState& session_state,
const logging::Logger& logger, const bool only_execute_path_to_fetches = false) {
std::unique_ptr<IExecutor> p_exec;
if (execution_mode == ExecutionMode::ORT_SEQUENTIAL) {
p_exec = std::unique_ptr<IExecutor>(new SequentialExecutor(terminate_flag, only_execute_path_to_fetches));
p_exec = std::make_unique<SequentialExecutor>(terminate_flag, only_execute_path_to_fetches);
} else if (execution_mode == ExecutionMode::ORT_PARALLEL) {
auto* p_inter_op_thread_pool = session_state.GetInterOpThreadPool();
if (!p_inter_op_thread_pool) {
LOGS(logger, WARNING) << "Only one thread was configured for parallel execution. Hence will use sequential execution.";
p_exec = std::unique_ptr<IExecutor>(new SequentialExecutor(terminate_flag, only_execute_path_to_fetches));
p_exec = std::make_unique<SequentialExecutor>(terminate_flag, only_execute_path_to_fetches);
} else {
p_exec = std::unique_ptr<IExecutor>(new ParallelExecutor(session_state, terminate_flag));
p_exec = std::make_unique<ParallelExecutor>(session_state, terminate_flag);
}
}
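The std::make_unique rewrites in this hunk (and the many like it below) are behavior-preserving: make_unique constructs the object and the owning pointer in one step, avoids a visible raw new, and so satisfies the same no-raw-new checks as the pragma suppressions above. For instance:

#include <memory>

struct Executor { explicit Executor(bool /*sequential*/) {} };

int main() {
  // before: std::unique_ptr<Executor> p(new Executor(true));
  auto p = std::make_unique<Executor>(true);  // same object, no explicit new
  return p ? 0 : 1;
}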


@ -705,7 +705,7 @@ flatbuffers::Offset<fbs::NodeEdge> Node::SaveEdgesToOrtFormat(flatbuffers::FlatB
Status Node::LoadFromOrtFormat(const onnxruntime::fbs::Node& fbs_node, Graph& graph,
const logging::Logger& logger, std::unique_ptr<Node>& node) {
node.reset(new Node(fbs_node.index(), graph));
node = std::make_unique<Node>(fbs_node.index(), graph);
return node->LoadFromOrtFormat(fbs_node, logger);
}
@ -860,9 +860,9 @@ void Node::CreateSubgraph(const std::string& attr_name) {
if (attr != attributes_.cend() && utils::HasGraph(attr->second)) {
GraphProto& mutable_graph = *attr->second.mutable_g();
std::unique_ptr<Graph> subgraph{new Graph(*graph_, *this, mutable_graph)};
std::unique_ptr<Graph> subgraph = std::make_unique<Graph>(*graph_, *this, mutable_graph);
attr_to_subgraph_map_.insert({std::string(attr_name), gsl::not_null<Graph*>{subgraph.get()}});
subgraphs_.push_back(std::move(subgraph));
subgraphs_.emplace_back(std::move(subgraph));
}
}
@ -3999,7 +3999,7 @@ Status Graph::InlineFunction(Node& node) {
// main graph.
const Graph& subgraph = node.GetFunctionBody()->Body();
auto output_edges = node.GetRelationships().output_edges;
for (auto output_edge : output_edges) {
for (const auto& output_edge : output_edges) {
RemoveEdge(node.Index(), output_edge.GetNode().Index(), output_edge.GetSrcArgIndex(), output_edge.GetDstArgIndex());
}
@ -4130,7 +4130,7 @@ void Graph::SetNodeArgType(NodeArg& arg, const ONNX_NAMESPACE::TypeProto& type_p
GraphResolveNeeded(true);
}
#endif // !defined(ORT_MINIMAL_BUILD)
Graph::~Graph() {
// nothing to do, but we put it here so we don't need to fully define types in Graph that are held in unique_ptr
@ -4216,12 +4216,11 @@ Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph,
IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
#endif
const logging::Logger& logger, std::unique_ptr<Graph>& graph) {
// can't use make_unique as we're calling a private ctor
graph.reset(new Graph(owning_model, domain_to_version,
graph = std::make_unique<Graph>(owning_model, domain_to_version,
#if !defined(ORT_MINIMAL_BUILD)
schema_registry,
#endif
nullptr, nullptr, logger));
nullptr, nullptr, logger);
ORT_RETURN_IF_ERROR(graph->LoadFromOrtFormat(fbs_graph));
@ -4240,14 +4239,13 @@ Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph,
Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph,
Graph& parent_graph, const Node& parent_node,
const logging::Logger& logger, std::unique_ptr<Graph>& graph) {
// can't use make_unique as we're calling a private ctor
graph.reset(new Graph(parent_graph.owning_model_,
parent_graph.domain_to_version_,
graph = std::make_unique<Graph>(parent_graph.owning_model_,
parent_graph.domain_to_version_,
#if !defined(ORT_MINIMAL_BUILD)
parent_graph.schema_registry_,
#endif
&parent_graph, &parent_node,
logger));
&parent_graph, &parent_node,
logger);
return graph->LoadFromOrtFormat(fbs_graph);
}
@ -4348,8 +4346,7 @@ common::Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph
ORT_RETURN_IF(nullptr == fbs_value_info, "NodeArg is missing. Invalid ORT format model.");
NodeArgInfo node_arg_info;
ORT_RETURN_IF_ERROR(fbs::utils::LoadValueInfoOrtFormat(*fbs_value_info, node_arg_info));
// NodeArg ctor is private, cannot use make_unique
node_args_[fbs_value_info->name()->str()] = std::unique_ptr<NodeArg>(new NodeArg(std::move(node_arg_info)));
node_args_[fbs_value_info->name()->str()] = std::make_unique<NodeArg>(std::move(node_arg_info));
}
}
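std::make_unique cannot reach a private constructor, which is why the removed comments above ("can't use make_unique as we're calling a private ctor") existed in the first place; the commit evidently makes the relevant constructors accessible. A minimal illustration of the constraint (Widget is hypothetical):

#include <memory>

class Widget {
 public:
  // must be accessible for make_unique to compile; make_unique cannot be befriended portably
  explicit Widget(int v) : v_(v) {}
  static std::unique_ptr<Widget> Create(int v) { return std::make_unique<Widget>(v); }

 private:
  int v_;
};

int main() {
  auto w = Widget::Create(42);
  return w ? 0 : 1;
}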


@ -347,7 +347,7 @@ Status Model::Load(const ModelProto& model_proto,
auto status = Status::OK();
ORT_TRY {
model.reset(new Model(model_proto, model_path, local_registries, logger));
model = std::make_unique<Model>(model_proto, model_path, local_registries, logger);
}
ORT_CATCH(const std::exception& ex) {
ORT_HANDLE_EXCEPTION([&]() {
@ -386,7 +386,7 @@ Status Model::Load(ModelProto&& model_proto,
GSL_SUPPRESS(r .11)
auto status = Status::OK();
ORT_TRY {
model.reset(new Model(std::move(model_proto), model_path, local_registries, logger, allow_released_opsets_only));
model = std::make_unique<Model>(std::move(model_proto), model_path, local_registries, logger, allow_released_opsets_only);
}
ORT_CATCH(const std::exception& ex) {
ORT_HANDLE_EXCEPTION([&]() {
@ -730,7 +730,7 @@ common::Status Model::LoadFromOrtFormat(const fbs::Model& fbs_model,
#endif
const logging::Logger& logger,
std::unique_ptr<Model>& model) {
model.reset(new Model());
model = std::make_unique<Model>();
// Load the model metadata
if (const auto* fbs_metadata_props = fbs_model.metadata_props()) {

Просмотреть файл

@ -272,9 +272,9 @@ class Model {
const logging::Logger& logger,
std::unique_ptr<Model>& model);
private:
Model();
private:
// Model data.
#if !defined(ORT_MINIMAL_BUILD)
ONNX_NAMESPACE::ModelProto model_proto_;


@ -1018,6 +1018,7 @@ Return Value:
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
void


@ -805,6 +805,7 @@ Return Value:
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
void


@ -34,7 +34,7 @@ constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_QUANT_KERNEL_DEFAULT::Strides;
constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_QUANT_KERNEL_DEFAULT::PackedStrides;
template<>
MLAS_FORCEINLINE constexpr
int32_t
MlasGemmQuantFixupZeroPointA<MLAS_GEMM_QUANT_KERNEL_DEFAULT>(
int32_t ZeroPointA,
@ -49,7 +49,7 @@ MlasGemmQuantFixupZeroPointA<MLAS_GEMM_QUANT_KERNEL_DEFAULT>(
}
template<>
MLAS_FORCEINLINE
MLAS_FORCEINLINE constexpr
int32_t
MlasGemmQuantFixupZeroPointB<MLAS_GEMM_QUANT_KERNEL_DEFAULT>(
int32_t ZeroPointB,


@ -1554,6 +1554,7 @@ Return Value:
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
void


@ -41,7 +41,7 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
size_t cpu_tensor_length;
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<0>(tensor_proto, &cpu_tensor_length));
OrtValue ort_value;
std::unique_ptr<char[]> data(new char[cpu_tensor_length]);
std::unique_ptr<char[]> data = std::make_unique<char[]>(cpu_tensor_length);
std::unique_ptr<Tensor> p_tensor;
ORT_RETURN_IF_ERROR(utils::TensorProtoToMLValue(Env::Default(),
model_path.IsEmpty() ? nullptr : model_path.ToPathString().c_str(),
@ -103,8 +103,9 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
std::unique_ptr<const OpKernel> OptimizerExecutionFrame::Info::CreateKernel(const Node* node) const {
std::unique_ptr<OpKernel> op_kernel;
std::shared_ptr<KernelRegistry> kernel_registry = execution_provider_.GetKernelRegistry();
FuncManager func;
auto status = kernel_registry->TryCreateKernel(*node, execution_provider_, initializers_,
ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_,
ort_value_name_idx_map_, func, data_transfer_mgr_,
op_kernel);
// Kernel found in the CPU kernel registry


@ -28,10 +28,10 @@ void DropQDQNodesRules(SelectorsAndActions& qdq_selectors_and_actions) {
MoveToSlot(dq, ArgType::kInput, 0, ArgType::kInput, 0),
MoveToSlot(q, ArgType::kOutput, 0, ArgType::kOutput, 0)};
std::unique_ptr<Action> action(new MergeIntoTarget(std::move(moves)));
std::unique_ptr<Action> action = std::make_unique<MergeIntoTarget>(std::move(moves));
#if !defined(ORT_MINIMAL_BUILD)
std::unique_ptr<NodeSelector> selector(new QDQ::DropDQDNodesSelector());
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropDQDNodesSelector>();
qdq_selectors_and_actions.RegisterSelectorAndAction(action_name,
SelectorAndAction::OpVersionsMap{{"Gather", {}},
{"Reshape", {}},
@ -49,10 +49,10 @@ void UnaryOpQDQRules(SelectorsAndActions& qdq_selectors_and_actions, bool is_int
// 3 nodes. DQ, target, Q
// Replace with internal QLinear version of operator. Delete all original nodes.
const std::string action_name{"1DQ"};
std::unique_ptr<Action> action(new QDQ::UnaryReplaceWithQLinear(kMSDomain));
std::unique_ptr<Action> action = std::make_unique<QDQ::UnaryReplaceWithQLinear>(kMSDomain);
#if !defined(ORT_MINIMAL_BUILD)
std::unique_ptr<NodeSelector> selector(new QDQ::UnarySelector(is_int8_allowed));
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::UnarySelector>(is_int8_allowed);
qdq_selectors_and_actions.RegisterSelectorAndAction(action_name,
SelectorAndAction::OpVersionsMap{{"AveragePool", {}}},
std::move(selector),
@ -67,10 +67,10 @@ void BinaryOpQDQRules(SelectorsAndActions& qdq_selectors_and_actions) {
// 4 nodes. 2 x DQ for inputs, target, Q
// Replace with internal QLinear version of operator. Delete all original nodes.
const std::string action_name{"2DQ"};
std::unique_ptr<Action> action(new QDQ::BinaryReplaceWithQLinear(kMSDomain));
std::unique_ptr<Action> action = std::make_unique<QDQ::BinaryReplaceWithQLinear>(kMSDomain);
#if !defined(ORT_MINIMAL_BUILD)
std::unique_ptr<NodeSelector> selector(new QDQ::BinarySelector());
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::BinarySelector>();
qdq_selectors_and_actions.RegisterSelectorAndAction(action_name,
SelectorAndAction::OpVersionsMap{{"Add", {}},
{"Mul", {}}},
@ -86,10 +86,10 @@ void VariadicOpQDQRules(SelectorsAndActions& qdq_selectors_and_actions) {
// 0=variadic DQ nodes 2=target, 3=Q
// Replace with QLinear version of operator. Delete all original nodes.
const std::string action_name{"*DQ"};
std::unique_ptr<Action> action(new QDQ::VariadicReplaceWithQLinear(kMSDomain));
std::unique_ptr<Action> action = std::make_unique<QDQ::VariadicReplaceWithQLinear>(kMSDomain);
#if !defined(ORT_MINIMAL_BUILD)
std::unique_ptr<NodeSelector> selector(new QDQ::VariadicSelector());
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::VariadicSelector>();
qdq_selectors_and_actions.RegisterSelectorAndAction(action_name,
SelectorAndAction::OpVersionsMap{{"Concat", {}}},
@ -107,10 +107,10 @@ void ConvQDQRules(SelectorsAndActions& qdq_selectors_and_actions, bool is_int8_a
// Replace Conv with QLinearConv
// Delete all original nodes
const std::string action_name{"Conv"};
std::unique_ptr<Action> action(new QDQ::ConvReplaceWithQLinear());
std::unique_ptr<Action> action = std::make_unique<QDQ::ConvReplaceWithQLinear>();
#if !defined(ORT_MINIMAL_BUILD)
std::unique_ptr<NodeSelector> selector(new QDQ::ConvSelector(is_int8_allowed));
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::ConvSelector>(is_int8_allowed);
qdq_selectors_and_actions.RegisterSelectorAndAction(action_name,
SelectorAndAction::OpVersionsMap{{"Conv", {}}},
@ -129,10 +129,10 @@ void MatMulQDQRules(SelectorsAndActions& qdq_selectors_and_actions, bool is_int8
// Delete all original nodes.
const std::string action_name{"MatMul"};
std::unique_ptr<Action> action(new QDQ::MatMulReplaceWithQLinear());
std::unique_ptr<Action> action = std::make_unique<QDQ::MatMulReplaceWithQLinear>();
#if !defined(ORT_MINIMAL_BUILD)
std::unique_ptr<NodeSelector> selector(new QDQ::MatMulSelector(is_int8_allowed));
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::MatMulSelector>(is_int8_allowed);
qdq_selectors_and_actions.RegisterSelectorAndAction(action_name,
SelectorAndAction::OpVersionsMap{{"MatMul", {}}},
std::move(selector),


@ -54,6 +54,11 @@ class WindowsThread : public EnvThread {
unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param);
Eigen::ThreadPoolInterface* param;
const ThreadOptions& thread_options;
Param(const ORTCHAR_T* name_prefix1,
int index1,
unsigned (*start_address1)(int id, Eigen::ThreadPoolInterface* param),
Eigen::ThreadPoolInterface* param1,
const ThreadOptions& thread_options1) : name_prefix(name_prefix1), index(index1), start_address(start_address1), param(param1), thread_options(thread_options1) {}
};
public:
@ -63,15 +68,15 @@ class WindowsThread : public EnvThread {
custom_create_thread_fn = thread_options.custom_create_thread_fn;
custom_thread_creation_options = thread_options.custom_thread_creation_options;
custom_join_thread_fn = thread_options.custom_join_thread_fn;
std::unique_ptr<Param> local_param = std::make_unique<Param>(name_prefix, index, start_address, param, thread_options);
if (custom_create_thread_fn) {
custom_thread_handle = custom_create_thread_fn(custom_thread_creation_options, (OrtThreadWorkerFn)CustomThreadMain, new Param{name_prefix, index, start_address, param, thread_options});
custom_thread_handle = custom_create_thread_fn(custom_thread_creation_options, (OrtThreadWorkerFn)CustomThreadMain, local_param.release());
if (!custom_thread_handle) {
ORT_THROW("custom_create_thread_fn returned invalid handle.");
}
} else {
hThread.reset(reinterpret_cast<HANDLE>(_beginthreadex(nullptr, thread_options.stack_size, ThreadMain,
new Param{name_prefix, index, start_address, param, thread_options}, 0,
local_param.release(), 0,
&threadID)));
}
}
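Param is now built with std::make_unique and only release()d when ownership is handed to the thread-creation call, so the allocation is not leaked if an exception fires before the hand-off. A reduced, single-threaded sketch of the idiom — spawn is a stand-in for _beginthreadex or the custom creation hook, and in the real code release() happens directly at the call site:

#include <memory>
#include <stdexcept>

struct Param { int index; };

// Simulated thread start: on success the callee takes ownership of 'p'.
bool spawn(void* p) {
  delete static_cast<Param*>(p);  // the real thread entry deletes Param when it finishes
  return true;
}

void StartThread(int index) {
  auto param = std::make_unique<Param>(Param{index});     // owned until hand-off
  if (!spawn(param.get())) {
    throw std::runtime_error("thread creation failed");   // Param still freed by unique_ptr
  }
  param.release();  // ownership has passed to the thread; avoid a double free
}

int main() { StartThread(0); return 0; }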
@ -142,12 +147,18 @@ class WindowsThread : public EnvThread {
class WindowsEnv : public Env {
public:
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
#pragma warning(disable : 26409)
#endif
EnvThread* CreateThread(_In_opt_z_ const ORTCHAR_T* name_prefix, int index,
unsigned (*start_address)(int id, Eigen::ThreadPoolInterface* param),
Eigen::ThreadPoolInterface* param, const ThreadOptions& thread_options) {
return new WindowsThread(name_prefix, index, start_address, param, thread_options);
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
void SleepForMicroseconds(int64_t micros) const override {
Sleep(static_cast<DWORD>(micros) / 1000);
}
@ -524,14 +535,32 @@ class WindowsEnv : public Env {
}
virtual Status LoadDynamicLibrary(const std::string& library_filename, bool /*global_symbols*/, void** handle) const override {
const std::wstring& wlibrary_filename = ToWideString(library_filename);
#if WINAPI_FAMILY == WINAPI_FAMILY_PC_APP
*handle = ::LoadPackagedLibrary(ToWideString(library_filename).c_str(), 0);
*handle = ::LoadPackagedLibrary(wlibrary_filename.c_str(), 0);
#else
*handle = ::LoadLibraryExA(library_filename.c_str(), nullptr, LOAD_WITH_ALTERED_SEARCH_PATH);
// TODO: in most cases, the path name is a relative path and the behavior of the following line of code is undefined.
*handle = ::LoadLibraryExW(wlibrary_filename.c_str(), nullptr, LOAD_WITH_ALTERED_SEARCH_PATH);
#endif
if (!*handle) {
const auto error_code = GetLastError();
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "LoadLibrary failed with error ", error_code, " \"", std::system_category().message(error_code), "\" when trying to load \"", library_filename, "\"");
LPVOID lpMsgBuf;
FormatMessageW(
FORMAT_MESSAGE_ALLOCATE_BUFFER |
FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL,
error_code,
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
(LPWSTR)&lpMsgBuf,
0, NULL);
std::wostringstream oss;
oss << L"LoadLibrary failed with error " << error_code << L" \"" << (LPWSTR)lpMsgBuf << L"\" when trying to load \"" << wlibrary_filename << L"\"";
std::wstring errmsg = oss.str();
// TODO: errmsg should be converted to UTF-8 as it will be passed out to the C interface.
common::Status status(common::ONNXRUNTIME, common::FAIL, ToMBString(errmsg));
LocalFree(lpMsgBuf);
return status;
}
return Status::OK();
}
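The expanded error path is the standard FORMAT_MESSAGE_ALLOCATE_BUFFER contract: the system allocates the message buffer, so the caller must free it with LocalFree after copying the text out. A trimmed, Windows-only sketch of that contract (not the full ORT status plumbing):

#include <windows.h>
#include <string>

std::wstring Win32ErrorMessage(DWORD error_code) {
  LPWSTR buf = nullptr;
  DWORD len = FormatMessageW(
      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
      nullptr, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
      reinterpret_cast<LPWSTR>(&buf),  // ALLOCATE_BUFFER: the system writes a pointer here
      0, nullptr);
  std::wstring msg = len != 0 ? std::wstring(buf, len) : L"<unknown error>";
  if (buf != nullptr) LocalFree(buf);  // system-owned allocation; LocalFree, not delete
  return msg;
}

int main() { return Win32ErrorMessage(ERROR_FILE_NOT_FOUND).empty() ? 1 : 0; }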
@ -548,7 +577,23 @@ class WindowsEnv : public Env {
*symbol = ::GetProcAddress(reinterpret_cast<HMODULE>(handle), symbol_name.c_str());
if (!*symbol) {
const auto error_code = GetLastError();
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to find symbol in library, error code: ", error_code, " - ", std::system_category().message(error_code));
LPVOID lpMsgBuf;
FormatMessageW(
FORMAT_MESSAGE_ALLOCATE_BUFFER |
FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL,
error_code,
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
(LPWSTR)&lpMsgBuf,
0, NULL);
std::wostringstream oss;
oss << L"Failed to find symbol " << ToWideString(symbol_name) << L" in library, error code: " << error_code << L" \"" << (LPWSTR)lpMsgBuf << L"\"";
std::wstring errmsg = oss.str();
// TODO: errmsg should be converted to UTF-8 as it will be passed out to the C interface.
common::Status status(common::ONNXRUNTIME, common::FAIL, ToMBString(errmsg));
LocalFree(lpMsgBuf);
return status;
}
return Status::OK();
}
@ -576,11 +621,11 @@ class WindowsEnv : public Env {
// Create buffer to hold the result
std::string buffer(kBufferSize, '\0');
// The last argument is the size of the buffer pointed to by the lpBuffer parameter, including the null-terminating character, in characters.
// If the function succeeds, the return value is the number of characters stored in the buffer pointed to by lpBuffer, not including the terminating null character.
// Therefore, if the function succeeds, kBufferSize should be larger than char_count.
auto char_count = GetEnvironmentVariableA(var_name.c_str(), buffer.data(), kBufferSize);
if (kBufferSize > char_count) {
buffer.resize(char_count);
return buffer;
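GetEnvironmentVariableA has a two-mode return: on success it returns the number of characters stored (excluding the terminating NUL), and when the buffer is too small it returns the required size including the NUL, so "kBufferSize > char_count" is precisely the success test. A condensed, Windows-only sketch of that contract (kBufferSize here matches the documented 32767-character environment limit, an assumption rather than the ORT constant):

#include <windows.h>
#include <optional>
#include <string>

std::optional<std::string> GetEnv(const std::string& name) {
  constexpr DWORD kBufferSize = 32767;
  std::string buffer(kBufferSize, '\0');
  DWORD n = GetEnvironmentVariableA(name.c_str(), buffer.data(), kBufferSize);
  if (n == 0) return std::nullopt;            // variable not set (or an error occurred)
  if (n >= kBufferSize) return std::nullopt;  // buffer too small: n is the required size
  buffer.resize(n);                           // n excludes the terminating NUL on success
  return buffer;
}

int main() { return GetEnv("PATH").has_value() ? 0 : 1; }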


@ -104,7 +104,7 @@ std::vector<std::string> CaptureStackTrace::Trace() const {
stacktrace.reserve(num_frames);
// hide CaptureStackTrace::Trace and GetStackTrace so the output starts with the 'real' location
const int frames_to_skip = 2;
constexpr int frames_to_skip = 2;
// we generally want to skip the first two frames, but if something weird is going on (e.g. code coverage is
// running) and we only have 1 or 2 frames, output them so there's at least something that may be meaningful


@ -118,9 +118,9 @@ inline Status ComputePad(const int64_t in_dim,
return Status::OK();
}
inline int64_t ComputeOutputShape(const int64_t in_dim,
const int64_t stride, const int64_t kernel, const int64_t dilation,
const int64_t pad_head, const int64_t pad_tail) {
constexpr inline int64_t ComputeOutputShape(const int64_t in_dim,
const int64_t stride, const int64_t kernel, const int64_t dilation,
const int64_t pad_head, const int64_t pad_tail) {
const int64_t dkernel = dilation * (kernel - 1) + 1;
return static_cast<int64_t>(static_cast<double>(in_dim + pad_head + pad_tail - dkernel) / stride + 1);
}
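As a numeric check of the formula: with in_dim = 7, kernel = 3, dilation = 2, stride = 2 and pads of 1 each, dkernel = 2 * (3 - 1) + 1 = 5 and the output size is (7 + 1 + 1 - 5) / 2 + 1 = 3. A standalone, compile-time version of the same arithmetic (mirroring the function above, minus the surrounding header):

#include <cstdint>

constexpr int64_t ComputeOutputShape(int64_t in_dim, int64_t stride, int64_t kernel,
                                     int64_t dilation, int64_t pad_head, int64_t pad_tail) {
  const int64_t dkernel = dilation * (kernel - 1) + 1;
  return static_cast<int64_t>(static_cast<double>(in_dim + pad_head + pad_tail - dkernel) / stride + 1);
}

static_assert(ComputeOutputShape(7, 2, 3, 2, 1, 1) == 3, "dilated conv output dim");
static_assert(ComputeOutputShape(5, 1, 3, 1, 1, 1) == 5, "'same' padding keeps the size");

int main() { return 0; }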


@ -84,6 +84,7 @@ struct Softplus : public ElementWiseRangedTransform<T> {
Status Init(const onnxruntime::NodeAttributes&) {
return Status::OK();
}
GSL_SUPPRESS(r .11)
ElementWiseRangedTransform<T>* Copy() const {
using T1 = typename std::remove_pointer<decltype(this)>::type;
using T2 = typename std::remove_const<T1>::type;
@ -106,6 +107,7 @@ struct Relu : public ElementWiseRangedTransform<T> {
Status Init(const onnxruntime::NodeAttributes&) {
return Status::OK();
}
GSL_SUPPRESS(r .11)
ElementWiseRangedTransform<T>* Copy() const { // replace it with a macro. why this?
using T1 = typename std::remove_pointer<decltype(this)>::type;
using T2 = typename std::remove_const<T1>::type; //redundant?
@ -128,6 +130,7 @@ struct Sigmoid : public ElementWiseRangedTransform<T> {
Status Init(const onnxruntime::NodeAttributes&) {
return Status::OK();
}
GSL_SUPPRESS(r .11)
ElementWiseRangedTransform<T>* Copy() const {
using T1 = typename std::remove_pointer<decltype(this)>::type;
using T2 = typename std::remove_const<T1>::type;
@ -153,6 +156,7 @@ struct Softsign : public ElementWiseRangedTransform<T> {
Status Init(const onnxruntime::NodeAttributes&) {
return Status::OK();
}
GSL_SUPPRESS(r .11)
ElementWiseRangedTransform<T>* Copy() const {
using T1 = typename std::remove_pointer<decltype(this)>::type;
using T2 = typename std::remove_const<T1>::type;
@ -175,6 +179,7 @@ struct Tanh : public ElementWiseRangedTransform<T> {
Status Init(const onnxruntime::NodeAttributes&) {
return Status::OK();
}
GSL_SUPPRESS(r .11)
ElementWiseRangedTransform<T>* Copy() const {
using T1 = typename std::remove_pointer<decltype(this)>::type;
using T2 = typename std::remove_const<T1>::type;
@ -192,6 +197,7 @@ struct Tanh : public ElementWiseRangedTransform<T> {
ym = xm.tanh();
}
};
template <>
void Tanh<float>::operator()(std::ptrdiff_t first, std::ptrdiff_t last) const;
@ -226,7 +232,6 @@ struct Selu : public ElementWiseRangedTransform<T> {
ym = (T)gamma * (xm.cwiseMax(0.0f) + ((T)alpha * (xm.array().exp() - 1.0f)).cwiseMin(0.0f));
}
};
} // namespace functors
DEFINE_ELE_KERNEL(Celu);


@ -142,7 +142,7 @@ Loop::Info::Info(const onnxruntime::Node& node, const GraphViewer& subgraph_in)
const auto& node_input_types = node.InputDefs();
loop_carried_vars_types.reserve(num_subgraph_inputs);
for (int i = 0; i < num_loop_carried_vars; ++i) {
loop_carried_vars_types.push_back(node_input_types[i + 2]->TypeAsProto());
loop_carried_vars_types.push_back(node_input_types[static_cast<size_t>(i) + 2]->TypeAsProto());
}
auto& subgraph_inputs = subgraph.GetInputs();
@ -280,7 +280,7 @@ common::Status Loop::SetupSubgraphExecutionInfo(const SessionState& session_stat
// the Loop inputs are matched to subgraph feeds based on order.
// we first need the names of the Loop inputs to determine what device they are available on
std::vector<std::string> feed_names;
feed_names.reserve(info_->num_subgraph_inputs + info_->num_implicit_inputs);
feed_names.reserve(static_cast<size_t>(info_->num_subgraph_inputs) + info_->num_implicit_inputs);
// iter_num and cond subgraph inputs - created by the LoopImpl::Initialize so the name doesn't matter
// as we skip them when we call FindDevicesForValues, and default them to always being on CPU.
@ -291,7 +291,7 @@ common::Status Loop::SetupSubgraphExecutionInfo(const SessionState& session_stat
const auto& loop_inputs = node.InputDefs();
for (int i = 0; i < info_->num_loop_carried_vars; ++i) {
// + 2 to skip 'M' and 'cond' Loop inputs
feed_names.push_back(loop_inputs[i + 2]->Name());
feed_names.push_back(loop_inputs[static_cast<size_t>(i) + 2]->Name());
}
for (auto& entry : node.ImplicitInputDefs()) {
@ -306,7 +306,7 @@ common::Status Loop::SetupSubgraphExecutionInfo(const SessionState& session_stat
// now update the feed names to use the subgraph input names for the loop carried vars so that we can determine
// what device the subgraph needs them on
for (int i = 0; i < info_->num_loop_carried_vars; ++i) {
for (ptrdiff_t i = 0; i < info_->num_loop_carried_vars; ++i) {
// +2 for both to skip the iter_num and cond values
feed_names[i + 2] = info_->subgraph_input_names[i + 2];
}
@ -329,7 +329,7 @@ common::Status Loop::SetupSubgraphExecutionInfo(const SessionState& session_stat
// Loop state variables need to be where we can feed them in to the next iteration, so set the fetch location
// to match the feed location.
for (int i = 0; i < info_->num_loop_carried_vars; ++i) {
for (ptrdiff_t i = 0; i < info_->num_loop_carried_vars; ++i) {
// +2 for both to skip the iter_num and cond input values
const auto& alloc_info = utils::FindMemoryInfoForValue(session_state, loop_inputs[i + 2]->Name());
fetch_locations.push_back(&alloc_info);
@ -421,13 +421,13 @@ Status LoopImpl::Initialize() {
iter_num_mlvalue_ = MakeScalarMLValue<int64_t>(cpu_allocator, 0, iter_num_rank != 0);
condition_mlvalue_ = MakeScalarMLValue<bool>(cpu_allocator, condition_, condition_rank != 0);
loop_output_tensors_.resize(info_.num_outputs - info_.num_loop_carried_vars);
loop_output_tensors_.resize(static_cast<size_t>(info_.num_outputs) - info_.num_loop_carried_vars);
return status;
}
void LoopImpl::CreateInitialFeeds(std::vector<OrtValue>& feeds) {
feeds.reserve(info_.num_subgraph_inputs + info_.num_implicit_inputs);
feeds.reserve(static_cast<size_t>(info_.num_subgraph_inputs) + info_.num_implicit_inputs);
// This ordering is the same as used in SetupSubgraphExecutionInfo
feeds.push_back(iter_num_mlvalue_);
@ -450,12 +450,12 @@ void LoopImpl::SaveOutputsAndUpdateFeeds(const std::vector<OrtValue>& last_outpu
// next_input: iter_num, cond, loop_vars. iter_num is re-used
// simple copy for cond and loop carried vars. start at 1 to skip iter_num in input
for (int i = 1; i < info_.num_subgraph_inputs; ++i) {
for (ptrdiff_t i = 1; i < info_.num_subgraph_inputs; ++i) {
next_inputs[i] = last_outputs[i - 1];
}
// save loop outputs as we have to concatenate at the end
for (int j = info_.num_loop_carried_vars; j < info_.num_outputs; ++j) {
for (ptrdiff_t j = info_.num_loop_carried_vars; j < info_.num_outputs; ++j) {
ORT_ENFORCE(last_outputs[j + 1].IsTensor(), "All scan outputs MUST be tensors");
loop_output_tensors_[j - info_.num_loop_carried_vars].push_back(last_outputs[j + 1]); // skip 'cond' in output
}
@ -570,13 +570,13 @@ Status LoopImpl::Execute(const FeedsFetchesManager& ffm) {
if (iter_num_value != 0) {
for (int i = 0; i < info_.num_loop_carried_vars; ++i) {
// need to allocate Loop output and copy OrtValue from fetches
ORT_RETURN_IF_ERROR(copy_mlvalue_to_output(fetches[i + 1], i, iter_num_value, *info_.loop_carried_vars_types[i])); // skip cond
ORT_RETURN_IF_ERROR(copy_mlvalue_to_output(fetches[static_cast<ptrdiff_t>(i) + 1], i, iter_num_value, *info_.loop_carried_vars_types[static_cast<ptrdiff_t>(i)])); // skip cond
}
for (int i = info_.num_loop_carried_vars; i < info_.num_outputs; ++i) {
// add last output
auto& per_iteration_outputs = loop_output_tensors_[i - info_.num_loop_carried_vars];
per_iteration_outputs.push_back(fetches[i + 1]); // skip cond
auto& per_iteration_outputs = loop_output_tensors_[static_cast<ptrdiff_t>(i) - info_.num_loop_carried_vars];
per_iteration_outputs.push_back(fetches[static_cast<ptrdiff_t>(i) + 1]); // skip cond
ORT_RETURN_IF_ERROR(ConcatenateLoopOutput(per_iteration_outputs, i));
}
@ -584,7 +584,7 @@ Status LoopImpl::Execute(const FeedsFetchesManager& ffm) {
// no iterations.
// copy input loop carried vars to output.
for (int i = 0; i < info_.num_loop_carried_vars; ++i) {
ORT_RETURN_IF_ERROR(copy_mlvalue_to_output(feeds[i + 2], i, iter_num_value, *info_.loop_carried_vars_types[i])); // skip iter# and cond
ORT_RETURN_IF_ERROR(copy_mlvalue_to_output(feeds[static_cast<ptrdiff_t>(i) + 2], i, iter_num_value, *info_.loop_carried_vars_types[i])); // skip iter# and cond
}
// create empty outputs for loop outputs using the subgraph output shapes for the rank
@ -592,11 +592,11 @@ Status LoopImpl::Execute(const FeedsFetchesManager& ffm) {
for (int i = info_.num_loop_carried_vars; i < info_.num_outputs; ++i) {
// get shape from subgraph output if possible to attempt to have the correct rank
auto* graph_output = graph_outputs.at(i + 1); // + 1 as first subgraph output is condition value
auto* graph_output = graph_outputs.at(static_cast<ptrdiff_t>(i) + 1); // + 1 as first subgraph output is condition value
auto* graph_output_shape = graph_output->Shape();
std::vector<int64_t> output_dims;
output_dims.reserve((graph_output_shape ? graph_output_shape->dim_size() : 0) + 1);
output_dims.reserve(static_cast<ptrdiff_t>(graph_output_shape ? graph_output_shape->dim_size() : 0) + 1);
output_dims.push_back(0); // num iterations is first dim
if (graph_output_shape) {
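The repeated edit through this file is the fix for warning C26451: an int index plus a constant is computed in int and only then widened, so the analyzer flags possible overflow; casting the index to size_t/ptrdiff_t first performs the addition in the wider type. In miniature:

#include <cstddef>
#include <vector>

int main() {
  std::vector<int> feeds(16, 0);
  int i = 3;
  // 'feeds[i + 2]' adds in int before widening; casting first does the addition in ptrdiff_t
  int v = feeds[static_cast<std::ptrdiff_t>(i) + 2];
  return v;
}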


@ -161,7 +161,7 @@ Status Scan<8>::SetupSubgraphExecutionInfo(const SessionState& session_state,
const auto& node = Node();
info_ = std::make_unique<Scan<8>::Info>(node, subgraph_session_state.GetGraphViewer(),
static_cast<int>(num_scan_inputs_));
auto status = scan::detail::CreateFeedsFetchesManager(node, *info_, session_state, subgraph_session_state,
/* is_v8 */ true, feeds_fetches_manager_);
@ -396,13 +396,13 @@ Status Scan8Impl::Execute(const FeedsFetchesManager& ffm) {
// Setup input OrtValue streams
std::vector<OrtValueTensorSlicer<const OrtValue>::Iterator> scan_input_stream_iterators;
scan_input_stream_iterators.reserve(info_.num_variadic_inputs - info_.num_loop_state_variables);
scan_input_stream_iterators.reserve(static_cast<size_t>(info_.num_variadic_inputs) - info_.num_loop_state_variables);
for (int i = info_.num_loop_state_variables, end = info_.num_variadic_inputs; i < end; ++i) {
const auto& ort_value = GetSubgraphInputMLValue(context_, i);
// forward
if (directions_[i - info_.num_loop_state_variables] == static_cast<int64_t>(ScanDirection::kForward)) {
if (directions_[static_cast<ptrdiff_t>(i) - info_.num_loop_state_variables] == static_cast<int64_t>(ScanDirection::kForward)) {
// the iterator is self-contained, so we don't need to keep the OrtValueTensorSlicer instance around
scan_input_stream_iterators.push_back(device_helpers_.create_const_slicer_func(ort_value, 1, b).begin());
} else { // reverse


@ -205,7 +205,7 @@ Status Scan<9>::SetupSubgraphExecutionInfo(const SessionState& session_state,
const auto& node = Node();
info_ = std::make_unique<Scan<9>::Info>(node, subgraph_session_state.GetGraphViewer(),
static_cast<int>(num_scan_inputs_));
auto status = scan::detail::CreateFeedsFetchesManager(node, *info_, session_state, subgraph_session_state,
/* is_v8 */ false, feeds_fetches_manager_);
@ -281,7 +281,7 @@ Status ScanImpl::ValidateSubgraphInput(int start_input, int end_input,
" Expected ", min_dims_required,
" dimensions or more but input had shape of ", input_shape);
auto seq_len_dim = input_axes_[i - info_.num_loop_state_variables];
auto seq_len_dim = input_axes_[static_cast<ptrdiff_t>(i) - info_.num_loop_state_variables];
auto this_seq_len = input_shape[seq_len_dim];
if (sequence_len_ < 0) {
@ -432,7 +432,7 @@ Status ScanImpl::Execute(const FeedsFetchesManager& ffm) {
// Setup input OrtValue streams
std::vector<OrtValueTensorSlicer<const OrtValue>::Iterator> scan_input_stream_iterators;
scan_input_stream_iterators.reserve(info_.num_inputs - info_.num_loop_state_variables);
scan_input_stream_iterators.reserve(static_cast<size_t>(info_.num_inputs) - info_.num_loop_state_variables);
for (int i = 0, end = info_.num_scan_inputs; i < end; ++i) {
const auto& ort_value = inputs_[i];


@ -142,7 +142,7 @@ Status CreateFeedsFetchesManager(const Node& node,
// we need the names of the Scan inputs to determine what device they are available on,
// so first create a list using those values
std::vector<std::string> feed_names;
feed_names.reserve(info.num_variadic_inputs + info.num_implicit_inputs);
feed_names.reserve(static_cast<size_t>(info.num_variadic_inputs) + info.num_implicit_inputs);
const auto& scan_inputs = node.InputDefs();
int start = is_v8 ? 1 : 0; // skip sequence_lens for v8
@ -217,7 +217,7 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses
feeds[input] = loop_state_variables[input].Input();
} else {
// add sliced input
auto& iterator = scan_input_stream_iterators[input - num_loop_state_variables];
auto& iterator = scan_input_stream_iterators[static_cast<ptrdiff_t>(input) - num_loop_state_variables];
feeds[input] = *iterator;
++iterator;


@ -52,7 +52,7 @@ class LoopStateVariable {
const OrtValue original_value_;
OrtValue final_value_;
/* we use original_value and final_value once,
and alternate between a_ and b_ as input/output for each iteration to avoid copies
Iteration Input Output
@ -88,8 +88,8 @@ class OutputIterator {
ScanDirection direction = ScanDirection::kForward,
bool temporary = false,
MLDataType data_type = nullptr) {
iterator.reset(new OutputIterator(context, output_index, is_loop_state_var, is_v8, final_shape,
create_slicer_func, zero_data_func, direction, temporary, data_type));
iterator = std::make_unique<OutputIterator>(context, output_index, is_loop_state_var, is_v8, final_shape,
create_slicer_func, zero_data_func, direction, temporary, data_type);
return iterator->Initialize();
}
@ -115,8 +115,7 @@ class OutputIterator {
ORT_ENFORCE(final_output_mlvalue_, "Attempt to retrieve final output before it was set.");
return *final_output_mlvalue_;
}
private:
//std::unique_ptr needs to access this function.
OutputIterator(OpKernelContextInternal& context,
int output_index,
bool is_loop_state_var,
@ -128,6 +127,7 @@ class OutputIterator {
bool temporary,
MLDataType data_type);
private:
Status Initialize();
Status AllocateFinalBuffer();
@ -201,7 +201,7 @@ void CalculateTransposedShapeForInput(const TensorShape& original_shape, int64_t
/**
Calculate the transpose permutations and shape by shifting the chosen axis FROM the first dimension.
e.g. if shape is {4, 2, 3} and axis 2 is chosen, dimension 0 will move to dimension 2,
the permutations will be {1, 2, 0} and output shape will be {2, 3, 4}
*/
void CalculateTransposedShapeForOutput(const TensorShape& original_shape, int64_t axis,


@ -37,7 +37,9 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CPU, _In_ OrtSessio
options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_CPU(use_arena));
return nullptr;
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26409)
#endif
ORT_API_STATUS_IMPL(OrtApis::CreateCpuMemoryInfo, enum OrtAllocatorType type, enum OrtMemType mem_type,
_Outptr_ OrtMemoryInfo** out) {
*out = new OrtMemoryInfo(onnxruntime::CPU, type, OrtDevice(), 0, mem_type);


@ -56,6 +56,7 @@ ElementWiseRangedTransform<T>::~ElementWiseRangedTransform() {
Status Init(const onnxruntime::NodeAttributes& attributes) { \
return (GetFloatParam(#X, attributes, X)); \
} \
GSL_SUPPRESS(r .11) \
ElementWiseRangedTransform<T>* Copy() const final { \
using T1 = typename std::remove_pointer<decltype(this)>::type; \
using T2 = typename std::remove_const<T1>::type; \
@ -70,6 +71,7 @@ ElementWiseRangedTransform<T>::~ElementWiseRangedTransform() {
ORT_RETURN_IF_ERROR(GetFloatParam(#Y, attributes, Y)); \
return Status::OK(); \
} \
GSL_SUPPRESS(r .11) \
ElementWiseRangedTransform<T>* Copy() const final { \
using T1 = typename std::remove_pointer<decltype(this)>::type; \
using T2 = typename std::remove_const<T1>::type; \


@ -6,7 +6,10 @@
#include <cmath>
#include "core/providers/op_kernel_type_control.h"
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {
namespace op_kernel_type_control {


@ -16,8 +16,8 @@ template <typename T>
class Clip_6Base {
public:
explicit Clip_6Base(const OpKernelInfo& info) {
auto min_val = std::numeric_limits<T>::lowest();
auto max_val = std::numeric_limits<T>::max();
constexpr auto min_val = std::numeric_limits<T>::lowest();
constexpr auto max_val = std::numeric_limits<T>::max();
info.GetAttrOrDefault("min", &min_, min_val);
info.GetAttrOrDefault("max", &max_, max_val);
ORT_ENFORCE(min_ <= max_);
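std::numeric_limits<T>::lowest() and max() are constexpr, so marking the defaults constexpr is free and records that they are compile-time constants. A tiny demonstration (ClipDefaults is an illustrative name):

#include <limits>

template <typename T>
struct ClipDefaults {
  static constexpr T kMin = std::numeric_limits<T>::lowest();
  static constexpr T kMax = std::numeric_limits<T>::max();
};

static_assert(ClipDefaults<float>::kMin < 0.0f, "lowest float is negative");
static_assert(ClipDefaults<int>::kMax > 0, "max int is positive");

int main() { return 0; }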


@ -3,6 +3,11 @@
#include "core/providers/cpu/math/det.h"
#include "core/util/math_cpuonly.h"
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
using namespace onnxruntime::common;


@ -114,7 +114,7 @@ Status EinsumComputePreprocessor::ProcessSubscripts() {
// Example for the following line of code
// Subscript "...ij" for an input of rank 6
// num_of_ellipsis_dims = 6 - 5 + 3 = 4
int64_t current_num_of_ellipsis_dims = rank - subscript.length() + 3;
int64_t current_num_of_ellipsis_dims = static_cast<int64_t>(rank) - subscript.length() + 3;
if (current_num_of_ellipsis_dims < 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Einsum subscripts string contains too many subscript labels when compared to the rank of the input");


@ -1794,8 +1794,8 @@ void UntypedBroadcastTwo(OpKernelContext& context, const ProcessBroadcastSpanFun
concurrency::ThreadPool::TryParallelFor(
tp, output_size / span_size,
TensorOpCost{static_cast<float>(input_broadcaster.Input0ElementSize() * span_size),
static_cast<float>(output_tensor.DataType()->Size() * span_size),
TensorOpCost{static_cast<double>(input_broadcaster.Input0ElementSize()) * span_size,
static_cast<double>(output_tensor.DataType()->Size()) * span_size,
unit_cost * span_size},
[span_size, &const_input_broadcaster, &output_tensor, &funcs, user_data](std::ptrdiff_t first_span,
std::ptrdiff_t last_span) {


@ -10,7 +10,12 @@
namespace onnxruntime {
namespace functors {
// TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
// Do not use raw new/delete.
#pragma warning(disable : 26409)
#endif
template <typename T>
struct Log final : public ElementWiseRangedTransform<T> {
Status Init(const onnxruntime::NodeAttributes) {
@@ -982,4 +987,7 @@ struct TensorAllocator {
private:
AllocatorPtr allocator_;
};
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
} // namespace onnxruntime
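
The guard pattern repeated across these headers scopes the suppression with push/pop so it cannot leak into other translation units that include the header, and skips it under clang-cl, which defines _MSC_VER but does not implement MSVC's code-analysis warning IDs. The skeleton, for reference (26409 is just the example ID used above):

#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
#pragma warning(disable : 26409)  // "Do not use raw new/delete"
#endif

// ... declarations the analyzer would otherwise flag ...

#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)  // restore the warning state for downstream includers
#endif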

View file

@@ -241,7 +241,7 @@ static inline void multiclass_probability(int64_t classcount,
}
}
static const float ml_sqrt2 = 1.41421356f;
static constexpr float ml_sqrt2 = 1.41421356f;
static inline float ComputeLogistic(float val) {
float v = 1 / (1 + std::exp(-std::abs(val)));

View file

@@ -3,6 +3,11 @@
#include "core/providers/cpu/ml/svmclassifier.h"
#include "core/platform/threadpool.h"
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {
namespace ml {

View file

@@ -58,7 +58,10 @@ class BatchNorm : public OpKernel {
#endif
}
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
#pragma warning(disable : 26451)
#endif
Status Compute(OpKernelContext* p_op_kernel_context) const override {
const auto* X = p_op_kernel_context->Input<Tensor>(0);
const auto* scale = p_op_kernel_context->Input<Tensor>(1);
@@ -206,4 +209,7 @@ class BatchNorm : public OpKernel {
const bool is_spatial_;
int64_t is_train_;
};
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
} // namespace onnxruntime

View file

@@ -8,7 +8,12 @@
#include "core/framework/tensor.h"
#endif
#include <sstream>
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {
class BatchNormHelper {
public:
@@ -124,3 +129,6 @@ class BatchNormHelper {
}
};
} // namespace onnxruntime
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif

View file

@@ -243,7 +243,7 @@ struct ConvAttributes {
}
post_slicing_needed = true;
slice_axes.push_back(dim + 2);
slice_axes.push_back(static_cast<int64_t>(dim) + 2);
slice_starts.push_back(excess_output_head);
slice_ends.push_back(excess_output_head + output_dim_size); // we may modify this below
output_shape_with_revised_pads.push_back(excess_output_head + output_dim_size); // we may modify this below
@@ -273,7 +273,7 @@
// Head has not been over-padded. Only tail pads need to be modified.
post_slicing_needed = true;
slice_axes.push_back(dim + 2);
slice_axes.push_back(static_cast<int64_t>(dim) + 2);
slice_starts.push_back(0);
slice_ends.push_back(output_dim_size - revised_dim_size);
}

View file

@@ -76,7 +76,7 @@ Status ConvTranspose<float>::PrePack(const Tensor& tensor, int input_idx, Alloca
transposed_filter_ = BufferUniquePtr(packed_filter_data, BufferDeleter(alloc));
for (int64_t group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
MlasTranspose(tensor.Data<float>() + (N * K * group_id),
MlasTranspose(tensor.Data<float>() + (group_id * N * K),
((float*)packed_filter_data) + (group_id * packed_elements_per_group),
K, N);
}

View file

@@ -50,7 +50,7 @@ struct ConvTransposeAttributes : public ConvAttributes {
const TensorShape& F_Shape = (filter_shape != nullptr) ? *filter_shape : F->Shape();
const Tensor* Pads = dynamic_padding ? context->Input<Tensor>(2) : nullptr;
const Tensor* B = has_bias ? (dynamic_padding ? context->Input<Tensor>(3) : context->Input<Tensor>(2)) : nullptr;
const TensorShape& input_shape = X->Shape().Slice(2);
TensorShape input_shape = X->Shape().Slice(2);
const int64_t num_input_channels = X->Shape()[1];
const int64_t N = X->Shape()[0];
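
The const& to value change above matters because Shape() returns a reference but Slice() produces a new TensorShape by value; binding that temporary to a const reference relies on lifetime extension, which the analyzer flags. A toy illustration (Shape here is a hypothetical stand-in, not the real TensorShape API):

#include <cstdint>
#include <vector>

struct Shape {
  std::vector<int64_t> dims;
  Shape Slice(size_t from) const {  // returns a new object by value
    return Shape{std::vector<int64_t>(dims.begin() + from, dims.end())};
  }
};

void demo(const Shape& x) {   // assumes x.dims.size() >= 2
  // const Shape& s = x.Slice(2);  // legal via lifetime extension, but the
  //                               // analyzer flags references to temporaries
  Shape s = x.Slice(2);            // own the copy instead, as the diff does
  (void)s;
}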

View file

@@ -21,7 +21,11 @@
#include "core/common/safeint.h"
#include "core/util/math.h"
#include "core/util/math_cpuonly.h"
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
namespace onnxruntime {
namespace functors {
@@ -74,7 +78,7 @@ Status LRN<float>::Compute(OpKernelContext* context) const {
auto* scale_data = static_cast<float*>(scale_buffer.get());
math::Set<float, CPUMathUtil>(Xsize, bias_, scale_data, &CPUMathUtil::Instance());
const size_t padded_square_size = (C + size_ - 1) * H * W;
const size_t padded_square_size = (static_cast<size_t>(C) + size_ - 1) * H * W;
auto psdata = alloc->Alloc(SafeInt<size_t>(sizeof(float)) * padded_square_size);
BufferUniquePtr padded_square_buffer(psdata, BufferDeleter(alloc));
auto* padded_square_data = static_cast<float*>(padded_square_buffer.get());

View file

@@ -168,7 +168,11 @@ struct PoolAttributes {
*out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head + *pad_tail, dilation);
}
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push)
// Chance of arithmetic overflow could be reduced
#pragma warning(disable : 26451)
#endif
int64_t ComputeOutputSize(int64_t in_size,
int64_t stride,
int64_t kernel,
@@ -180,6 +184,9 @@
return static_cast<int64_t>(
std::ceil(static_cast<float>(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1));
}
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
};
} // namespace onnxruntime

View file

@@ -36,7 +36,7 @@ class PoolProcessContext {
class AveragePool {
public:
static float Initialize() {
constexpr static float Initialize() {
return 0.0;
}
@@ -59,7 +59,7 @@ class MaxPool;
template <>
class MaxPool<1 /*START_VERSION*/> {
public:
static float Initialize() {
constexpr static float Initialize() {
return std::numeric_limits<float>::lowest();
}
@@ -84,7 +84,7 @@ class MaxPool<8 /*START_VERSION*/> {
class LpPool {
public:
static float Initialize() {
constexpr static float Initialize() {
return 0.0f;
}
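
These constexpr additions extend the same idea to member functions: the initial pooling value becomes a compile-time constant. A standalone sketch (MaxPoolInit is illustrative, not the ORT class):

#include <limits>

struct MaxPoolInit {
  // constexpr lets the starting value fold at compile time.
  static constexpr float Initialize() { return std::numeric_limits<float>::lowest(); }
};

static_assert(MaxPoolInit::Initialize() < 0.0f, "evaluated at compile time");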

View file

@@ -31,7 +31,10 @@ ONNX_CPU_OPERATOR_KERNEL(
BuildKernelDefConstraintsFromTypeList<ShrinkDataTypes>(),
BuildKernelDefConstraintsFromTypeList<EnabledShrinkDataTypes>()),
Shrink);
//TODO: fix the warnings
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(disable : 26451)
#endif
namespace shrink_internal {
template <class T>
inline T ShrinkCore(const T& val, float bias, float lambd) {

Some files were not shown because too many files changed in this diff.