[CoreML] ML Program more operators support [3/N] (#22710)
### Description

- Erf
- Round
- Max
- ReduceMax
- ReduceMean
- ReduceSum
- Unsqueeze
- Squeeze
- Softmax

### Motivation and Context

---------

Co-authored-by: Scott McKay <skottmckay@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Parent: b930b4ab5b
Commit: a24723df16
@@ -13,14 +13,6 @@ using namespace CoreML::Specification;
 namespace onnxruntime {
 namespace coreml {
 
-// Once all ops are supportted FP16, we can remove it. Before that, we keep a set of ops to
-// filter suppported ones.
-static std::set<std::string> Float16Ops = {
-    "Add", "ArgMax", "AveragePool", "BatchNormalization", "Cast", "Clip", "Concat", "Conv", "ConvTranspose",
-    "DepthToSpace", "Div", "Gelu", "Gemm", "GlobalAveragePool", "GlobalMaxPool", "GridSample", "GroupNormalization",
-    "InstanceNormalization", "LayerNormalization", "LeakyRelu", "MatMul", "MaxPool", "Mul", "PRelu", "Pow",
-    "Reciprocal", "Relu", "Reshape", "Resize", "Sigmoid", "Slice", "Split", "Sqrt", "Sub", "Tanh", "Transpose"};
-
 namespace {
 // TODO, move this to shared_library
 bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node& node,

@@ -64,20 +56,27 @@ bool BaseOpBuilder::IsOpSupported(const Node& node, const OpBuilderInputParams&
   }
 
   if (!HasSupportedOpSet(node, logger)) {
+    LOGS(logger, VERBOSE) << "Operator [" << node.OpType() << "] does not support this opset";
     return false;
   }
 
   if (!HasSupportedInputs(node, input_params, logger)) {
+    LOGS(logger, VERBOSE) << "Operator [" << node.OpType() << "] has unsupported inputs";
     return false;
   }
 
   // We do not support external initializers for now
   const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors();
   if (HasExternalInitializer(initializers, node, logger)) {
+    LOGS(logger, VERBOSE) << "Operator [" << node.OpType() << "] has external initializers";
     return false;
   }
 
-  return IsOpSupportedImpl(node, input_params, logger);
+  if (!IsOpSupportedImpl(node, input_params, logger)) {
+    LOGS(logger, VERBOSE) << "Operator [" << node.OpType() << "] is not supported by the impl";
+    return false;
+  }
+  return true;
 }
 
 bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputParams& input_params,

@@ -114,13 +113,10 @@ bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx,
     return true;
   }
 
-  // only support MLProgram for FP16
-#if defined(COREML_ENABLE_MLPROGRAM)
-  if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 &&
-      Float16Ops.count(node.OpType())) {
+  // only MLProgram support FP16
+  if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
     return true;
   }
-#endif
 
   LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
   return false;
@@ -6,6 +6,7 @@
 #include "core/providers/coreml/builders/helper.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/shape_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
 #include "core/providers/shared/utils/utils.h"

@@ -55,6 +56,64 @@ bool CheckIfBothInputShapesMatch(const Node& node, const logging::Logger& logger
 }
 }  // namespace
 
+#if defined(COREML_ENABLE_MLPROGRAM)
+static std::vector<int64_t> InferOutputShape(const std::vector<int64_t>& a, const std::vector<int64_t>& b) {
+  std::vector<int64_t> output_shape;
+  int64_t i_a = 0, j_b = 0;
+  if (a.size() >= b.size()) {
+    output_shape = a;
+    j_b -= a.size() - b.size();
+  } else {
+    output_shape = b;
+    i_a -= b.size() - a.size();
+  }
+
+  for (size_t i = 0; i < output_shape.size(); i++, i_a++, j_b++) {
+    const int64_t a_dim = (i_a >= 0) ? a[i_a] : 1;
+    const int64_t b_dim = (j_b >= 0) ? b[j_b] : 1;
+    if (a_dim == -1 || b_dim == -1) {
+      output_shape[i] = -1;
+    } else {
+      output_shape[i] = std::max(a_dim, b_dim);
+    }
+  }
+  return output_shape;
+}
+
+// Add variadic inputs to the model builder
+// in onnx spec, some node allows variadic inputs, such as max(x, y, z, ...)
+// while in coreml, maximum op only allows two inputs maximum(x, y)
+// the conversion is doing the following:
+// max(x, y, z, ...) -> max(max(x, y), z, ...)
+static void AddVariadicInputs(std::unique_ptr<CoreML::Specification::MILSpec::Operation>* op,
+                              ModelBuilder& model_builder,
+                              const Node& node,
+                              const logging::Logger& logger) {
+  using namespace CoreML::Specification::MILSpec;
+  const auto& input_defs(node.InputDefs());
+  std::string_view layer_input_name_x = model_builder.GetUniqueName(node, "variadic");
+  auto input_dtype = input_defs[0]->TypeAsProto()->tensor_type().elem_type();
+  const int32_t elem_type = static_cast<int32_t>(input_dtype);
+  std::vector<int64_t> x0_shape, x1_shape;
+  GetShape(*input_defs[0], x0_shape, logger);
+  GetShape(*input_defs[1], x1_shape, logger);
+  x0_shape = InferOutputShape(x0_shape, x1_shape);
+  std::unique_ptr<Operation> op_prev = std::move(*op);
+  for (size_t i = 2; i < input_defs.size(); i++) {
+    AddIntermediateOperationOutput(*op_prev, layer_input_name_x, elem_type, x0_shape);
+    std::unique_ptr<Operation> op_cur = model_builder.CreateOperation(node, op_prev->type());
+    AddOperationInput(*op_cur, "x", layer_input_name_x);
+    AddOperationInput(*op_cur, "y", input_defs[i]->Name());
+    model_builder.AddOperation(std::move(op_prev));
+    op_prev = std::move(op_cur);
+    layer_input_name_x = model_builder.GetUniqueName(node, "variadic");
+    GetShape(*input_defs[i], x1_shape, logger);
+    x0_shape = InferOutputShape(x0_shape, x1_shape);
+  }
+  *op = std::move(op_prev);
+}
+#endif
+
 Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
                                               const logging::Logger& logger) const {
   const auto& op_type(node.OpType());

@@ -70,6 +129,8 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
     coreml_op_type = "add";
   } else if (op_type == "Mul") {
     coreml_op_type = "mul";
+  } else if (op_type == "Max") {
+    coreml_op_type = "maximum";
   } else if (op_type == "Sub") {
     coreml_op_type = "sub";
   } else if (op_type == "Div") {

@@ -86,8 +147,11 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
     std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
     AddOperationInput(*op, "x", input_defs[0]->Name());
     AddOperationInput(*op, "y", input_defs[1]->Name());
+    if (input_defs.size() > 2) {
+      // "max" node may have variadic inputs
+      AddVariadicInputs(&op, model_builder, node, logger);
+    }
     AddOperationOutput(*op, *node.OutputDefs()[0]);
 
     model_builder.AddOperation(std::move(op));
   } else
 #endif  // defined (COREML_ENABLE_MLPROGRAM)

@@ -157,6 +221,10 @@ bool BinaryOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderIn
     return false;
   }
 
+  if (node.OpType() == "Max" && !input_params.create_mlprogram) {
+    return false;
+  }
+
   return true;
 }
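Note on the two helpers added above: `InferOutputShape` rank-aligns the two input shapes and takes the per-dimension max (with `-1` staying dynamic), and `AddVariadicInputs` left-folds a variadic ONNX `Max` into a chain of two-input CoreML `maximum` ops. A standalone sketch of the same two ideas, illustrative only and not ORT code (`BroadcastShape` and the shapes are made up here):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Right-align the shorter shape (missing leading dims behave as 1) and take
// the max per dimension; -1 means "dynamic" and is preserved.
std::vector<int64_t> BroadcastShape(std::vector<int64_t> a, std::vector<int64_t> b) {
  if (a.size() < b.size()) std::swap(a, b);
  std::vector<int64_t> out = a;
  const size_t offset = a.size() - b.size();
  for (size_t i = 0; i < b.size(); ++i) {
    const int64_t a_dim = a[offset + i];
    const int64_t b_dim = b[i];
    out[offset + i] = (a_dim == -1 || b_dim == -1) ? -1 : std::max(a_dim, b_dim);
  }
  return out;
}

int main() {
  // max(x, y, z) becomes maximum(maximum(x, y), z): each fold step is one
  // two-input CoreML op whose intermediate output shape feeds the next step.
  std::vector<std::vector<int64_t>> input_shapes = {{1, 3}, {2, 1, 3}, {3}};
  std::vector<int64_t> acc = input_shapes[0];
  for (size_t i = 1; i < input_shapes.size(); ++i) {
    acc = BroadcastShape(acc, input_shapes[i]);  // shape of the intermediate output
  }
  for (int64_t d : acc) std::cout << d << ' ';  // prints: 2 1 3
  return 0;
}
```

Folding left keeps every emitted CoreML op strictly binary, so the chain length grows linearly with the number of variadic inputs.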
@@ -98,26 +98,24 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
     const bool min_max_attribs = node.SinceVersion() < 11;
     std::string_view min_name;
     if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
-      min_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "min", min)
-                                 : node.InputDefs()[1]->Name();
+      min_name = (min_max_attribs || !has_min) ? model_builder.AddScalarConstant(clip_op.type(), "min", min)
+                                               : node.InputDefs()[1]->Name();
     } else {
-      min_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "min", MLFloat16(min))
-                                 : node.InputDefs()[1]->Name();
+      min_name = (min_max_attribs || !has_min) ? model_builder.AddScalarConstant(clip_op.type(), "min", MLFloat16(min))
+                                               : node.InputDefs()[1]->Name();
     }
 
     AddOperationInput(clip_op, "alpha", min_name);
 
-    if (has_max) {
-      std::string_view max_name;
-      if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
-        max_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "max", max)
-                                   : node.InputDefs()[2]->Name();
-      } else {
-        max_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "max", MLFloat16(max))
-                                   : node.InputDefs()[2]->Name();
-      }
-      AddOperationInput(clip_op, "beta", max_name);
-    }
+    std::string_view max_name;
+    if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+      max_name = (min_max_attribs || !has_max) ? model_builder.AddScalarConstant(clip_op.type(), "max", max)
+                                               : node.InputDefs()[2]->Name();
+    } else {
+      max_name = (min_max_attribs || !has_max) ? model_builder.AddScalarConstant(clip_op.type(), "max", MLFloat16(max))
+                                               : node.InputDefs()[2]->Name();
+    }
+    AddOperationInput(clip_op, "beta", max_name);
   }
 }

@@ -200,7 +198,9 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
 bool ClipOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                                       const logging::Logger& logger) const {
   float min, max;
-  return GetClipMinMax(input_params.graph_viewer, node, min, max, logger);
+  bool ret = GetClipMinMax(input_params.graph_viewer, node, min, max, logger);
+  // what does it mean if min == max?
+  return ret && (min != max);
 }
 
 void CreateClipOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
@@ -5,10 +5,15 @@
 #include "core/providers/common.h"
 #include "core/providers/coreml/builders/helper.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
 #include "core/providers/shared/utils/utils.h"
 
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#endif
+
 namespace onnxruntime {
 namespace coreml {

@@ -20,6 +25,7 @@ class ReductionOpBuilder : public BaseOpBuilder {
 
   bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                          const logging::Logger& logger) const override;
+  bool SupportsMLProgram() const override { return true; }
 };
 
 namespace {

@@ -48,13 +54,12 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co
                                                  const logging::Logger& /* logger */) const {
   const auto& op_type(node.OpType());
   const auto& input_defs(node.InputDefs());
-  const auto& initializers(model_builder.GetInitializerTensors());
 
   std::vector<int64_t> axes;
 
   NodeAttrHelper helper(node);
   if (input_defs.size() > 1 && input_defs[1]->Exists()) {
-    auto& axes_tensor = *initializers.at(input_defs[1]->Name());
+    auto& axes_tensor = *model_builder.GetConstantInitializer(input_defs[1]->Name());
     Initializer axes_initializer(axes_tensor);
     int64_t* data = axes_initializer.data<int64_t>();
     int64_t size = axes_initializer.size();

@@ -66,28 +71,77 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co
 
   const bool keepdims = helper.Get("keepdims", 1) != 0;
   const bool noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0) != 0;
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
 
-  std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
+    std::string_view coreml_op_type;
+    if (noop_with_empty_axes && axes.size() == 0) {
+      coreml_op_type = "identity";
+    } else if (op_type == "ReduceSum") {
+      coreml_op_type = "reduce_sum";
+    } else if (op_type == "ReduceMean") {
+      coreml_op_type = "reduce_mean";
+    } else if (op_type == "ReduceMax") {
+      coreml_op_type = "reduce_max";
+    } else if (op_type == "ReduceMin") {
+      coreml_op_type = "reduce_min";
+    } else if (op_type == "ReduceProd") {
+      coreml_op_type = "reduce_prod";
+    } else {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "ReductionOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type);
+    }
+    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
+    AddOperationInput(*op, "x", input_defs[0]->Name());
+    if (coreml_op_type != "identity") {
+      if (axes.size() > 0) {
+        AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", axes));
+      }
+      AddOperationInput(*op, "keep_dims", model_builder.AddScalarConstant(op->type(), "keep_dims", keepdims));
+    }
+    AddOperationOutput(*op, *node.OutputDefs()[0]);
 
-  if (op_type == "ReduceSum") {
-    AddReductionParams(layer->mutable_reducesum(), axes, keepdims, noop_with_empty_axes);
-  } else if (op_type == "ReduceMean") {
-    AddReductionParams(layer->mutable_reducemean(), axes, keepdims, noop_with_empty_axes);
-  } else {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "ReductionOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type);
+    model_builder.AddOperation(std::move(op));
+  } else
+#endif  // (COREML_ENABLE_MLPROGRAM)
+  {
+    std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
+
+    if (op_type == "ReduceSum") {
+      AddReductionParams(layer->mutable_reducesum(), axes, keepdims, noop_with_empty_axes);
+    } else if (op_type == "ReduceMean") {
+      AddReductionParams(layer->mutable_reducemean(), axes, keepdims, noop_with_empty_axes);
+    } else {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "ReductionOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type);
+    }
+
+    *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
+    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+
+    model_builder.AddLayer(std::move(layer));
   }
 
-  *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
-  *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
-
-  model_builder.AddLayer(std::move(layer));
   return Status::OK();
 }
 
 bool ReductionOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                                            const logging::Logger& logger) const {
   const auto& input_defs = node.InputDefs();
+  if (!input_params.create_mlprogram &&
+      (node.OpType() == "ReduceMax" || node.OpType() == "ReduceMin" || node.OpType() == "ReduceProd")) {
+    return false;
+  }
+
+#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64)
+  // to pass https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=1563483&view=logs&j=f7cc61a9-cc70-56e7-b06c-4668ca17e426
+  // ReductionOpTest.ReduceSum_half_bert
+  int32_t input_type;
+  GetType(*input_defs[0], input_type, logger);
+  if (node.OpType() == "ReduceSum" && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
+    return false;
+  }
+#endif
+
   NodeAttrHelper helper(node);

@@ -99,18 +153,16 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInpu
   if (input_defs.size() > 1 && input_defs[1]->Exists()) {
     // 'axes' is optional input in new opsets
     const auto& axes_name = input_defs[1]->Name();
-    const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors();
-    if (!Contains(initializers, axes_name)) {
+    const auto* axes = input_params.graph_viewer.GetConstantInitializer(axes_name);
+    if (!axes) {
       LOGS(logger, VERBOSE) << "Axes of reduction must be a constant initializer";
       return false;
     }
 
-    empty_axes = initializers.at(axes_name)->int64_data_size() == 0;
+    empty_axes = axes->int64_data_size() == 0;
   }
 
-  if (empty_axes && noop_with_empty_axes) {
-    // TODO: When we add ML Program support we should enable this as it makes the node an Identity op
-    LOGS(logger, VERBOSE) << "CoreML doesn't support noop on empty axes for reduction layers" << std::endl;
+  if (empty_axes && noop_with_empty_axes && !input_params.create_mlprogram) {
+    LOGS(logger, VERBOSE) << "NeuralNetwork doesn't support noop on empty axes for reduction layers";
     return false;
   }
 
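The `noop_with_empty_axes` handling above is the interesting part: with the attribute set and no axes given, the ONNX reduction is a no-op, which is why the ML Program path can emit an `identity` op where the NeuralNetwork path has to reject the node. An illustrative sketch of the output-shape semantics the builder relies on (not ORT code; `ReducedShape` is a made-up name):

```cpp
#include <cstdint>
#include <vector>

std::vector<int64_t> ReducedShape(const std::vector<int64_t>& shape,
                                  std::vector<int64_t> axes,
                                  bool keepdims, bool noop_with_empty_axes) {
  if (axes.empty()) {
    if (noop_with_empty_axes) return shape;  // identity: nothing is reduced
    // empty axes with noop=0 means "reduce over all dimensions"
    for (size_t i = 0; i < shape.size(); ++i) axes.push_back(static_cast<int64_t>(i));
  }
  std::vector<int64_t> out;
  for (size_t i = 0; i < shape.size(); ++i) {
    bool reduced = false;
    for (int64_t a : axes) {
      // negative axes count from the back
      if (a == static_cast<int64_t>(i) ||
          a + static_cast<int64_t>(shape.size()) == static_cast<int64_t>(i)) {
        reduced = true;
      }
    }
    if (!reduced) out.push_back(shape[i]);
    else if (keepdims) out.push_back(1);  // reduced dims kept as size 1
  }
  return out;
}
// e.g. ReducedShape({2, 3, 4}, {1, 2}, /*keepdims*/ true, false) -> {2, 1, 1}
```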
@@ -2,7 +2,9 @@
 // Licensed under the MIT License.
 
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
+#include "core/providers/coreml/shape_utils.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
 #include "core/providers/shared/utils/utils.h"  // for NodeAttrHelper
 

@@ -14,28 +16,132 @@ class ShapeOpBuilder : public BaseOpBuilder {
 
   bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                          const logging::Logger& logger) const override;
+  bool HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params,
+                              const logging::Logger& logger) const override;
+  bool SupportsMLProgram() const override { return true; }
 };
 
 Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
                                              const logging::Logger& /*logger*/) const {
-  auto layer = model_builder.CreateNNLayer(node);
-  layer->mutable_getshape();
-  *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
-  *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
-  model_builder.AddLayer(std::move(layer));
+  const auto& input_defs = node.InputDefs();
+
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
+    NodeAttrHelper node_attr_helper{node};
+    int64_t size = -1;
+    int64_t num_dims = 0;
+    int64_t start = node_attr_helper.Get("start", 0);
+    // If the input shape is not available, size is -1 and start is 0
+    if (input_defs[0]->Shape()) {
+      num_dims = input_defs[0]->Shape()->dim_size();
+      start = HandleNegativeAxis(start, num_dims);
+      if (node_attr_helper.HasAttr("end")) {
+        int64_t end = HandleNegativeAxis(node_attr_helper.Get("end", -1), num_dims);
+        size = end - start;
+      }
+    }
+
+    int32_t output_datatype = ONNX_NAMESPACE::TensorProto_DataType_INT32;
+    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "shape");
+    AddOperationInput(*op, "x", input_defs[0]->Name());
+    if (size != -1 || start != 0) {
+      std::string_view layer_input_name_x = model_builder.GetUniqueName(node, "slice_by_size");
+      std::vector<int64_t> x0_shape{num_dims};
+      AddIntermediateOperationOutput(*op, layer_input_name_x, output_datatype, x0_shape);
+      model_builder.AddOperation(std::move(op));
+
+      auto slice_op = model_builder.CreateOperation(node, "slice_by_size");
+      AddOperationInput(*slice_op, "x", layer_input_name_x);
+      std::vector<int64_t> starts = {start};
+      std::vector<int64_t> sizes = {size};
+      AddOperationInput(*slice_op, "begin", model_builder.AddConstant(slice_op->type(), "begin", starts));
+      AddOperationInput(*slice_op, "size", model_builder.AddConstant(slice_op->type(), "size", sizes));
+      AddOperationOutput(*slice_op, *node.OutputDefs()[0], output_datatype);
+      model_builder.AddOperation(std::move(slice_op));
+    } else {
+      AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype);
+      model_builder.AddOperation(std::move(op));
+    }
+  } else  // NOLINT
+#endif
+  {
+    auto layer = model_builder.CreateNNLayer(node);
+    layer->mutable_getshape();
+    *layer->mutable_input()->Add() = input_defs[0]->Name();
+    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+    model_builder.AddLayer(std::move(layer));
+  }
   return Status::OK();
 }
 
-bool ShapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /*input_params*/,
+bool ShapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                                        const logging::Logger& logger) const {
+  const auto* tensor_shape = node.InputDefs()[0]->Shape();
+
   NodeAttrHelper node_attr_helper{node};
-  if (node_attr_helper.Get("start", 0) != 0) {
-    LOGS(logger, VERBOSE) << "Shape does not support 'start' attribute with value other than 0";
-    return false;
-  }
+  if (!input_params.create_mlprogram) {
+    if (node_attr_helper.HasAttr("end")) {
+      LOGS(logger, VERBOSE) << "Shape does not support 'end' attribute";
+      return false;
+    }
 
-  if (node_attr_helper.HasAttr("end")) {
-    LOGS(logger, VERBOSE) << "Shape does not support 'end' attribute";
-    return false;
+    if (node_attr_helper.Get("start", 0) != 0) {
+      LOGS(logger, VERBOSE) << "Shape does not support 'start' attribute with value other than 0";
+      return false;
+    }
+  } else {
+    int64_t end = node_attr_helper.HasAttr("end")
+                      ? node_attr_helper.Get("end", -1)
+                      : std::numeric_limits<int64_t>::max();
+    int64_t start = node_attr_helper.Get("start", 0);
+    // no need to slice if start is 0 and end is max
+    if (end == std::numeric_limits<int64_t>::max() && start == 0) {
+    } else if (tensor_shape == nullptr) {
+      LOGS(logger, VERBOSE) << "Shape does not support slicing when tensor_shape is not available";
+      return false;
+    }
+    int64_t dim_size = tensor_shape->dim_size();
+    int64_t size = node_attr_helper.HasAttr("end")
+                       ? HandleNegativeAxis(node_attr_helper.Get("end", -1), dim_size)
+                       : dim_size;
+    start = HandleNegativeAxis(start, dim_size);
+    size = size - start;
+    if (size == 0) {
+      LOGS(logger, VERBOSE) << "Shape does not support slicing when size is 0";
+      return false;
+    }
+  }
 
   return true;
 }
 
+bool ShapeOpBuilder::HasSupportedInputsImpl(const Node& node,
+                                            [[maybe_unused]] const OpBuilderInputParams& input_params,
+                                            const logging::Logger& logger) const {
+  // We only check the type of input 0
+  const auto& input = *node.InputDefs()[0];
+
+  int32_t input_type;
+  if (!GetType(input, input_type, logger)) {
+    return false;
+  }
+
+  if (input_params.create_mlprogram) {
+    if ((input_type == ONNX_NAMESPACE::TensorProto_DataType_INT32 ||
+         input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT ||
+         input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16)) {
+      return true;
+    } else {
+      LOGS(logger, VERBOSE) << "[" << node.OpType()
+                            << "] Input type: [" << input_type
+                            << "] is not supported.";
+      return false;
+    }
+  } else if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+    LOGS(logger, VERBOSE) << "[" << node.OpType()
+                          << "] Input type: [" << input_type
+                          << "] is not supported.";
+    return false;
+  }
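For context on the `start`/`end` handling above: ONNX `Shape` (opset 15+) returns `shape[start:end]`, so the ML Program path emits a plain `shape` op and appends a `slice_by_size` only when a real slice is requested. A minimal sketch of the arithmetic, illustrative only and not ORT code (`ShapeOpOutput` is a made-up name):

```cpp
#include <cstdint>
#include <vector>

std::vector<int64_t> ShapeOpOutput(const std::vector<int64_t>& input_shape,
                                   int64_t start, bool has_end, int64_t end) {
  const int64_t rank = static_cast<int64_t>(input_shape.size());
  // negative indices count from the back, as HandleNegativeAxis does
  if (start < 0) start += rank;
  const int64_t stop = has_end ? (end < 0 ? end + rank : end) : rank;
  std::vector<int64_t> out;
  for (int64_t i = start; i < stop; ++i) out.push_back(input_shape[i]);
  return out;
}
// e.g. for input shape {2, 3, 4, 5}: start=1, end=-1 -> {3, 4}, which maps to
// begin={1}, size={2} on the slice_by_size op; start=0 with no end needs no slice.
```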
@@ -4,6 +4,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/providers/common.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
 #include "core/providers/coreml/shape_utils.h"

@@ -18,6 +19,7 @@ class SoftmaxOpBuilder : public BaseOpBuilder {
 
   bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                          const logging::Logger& logger) const override;
+  bool SupportsMLProgram() const override { return true; }
 };
 
 Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,

@@ -33,55 +35,100 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
   NodeAttrHelper helper(node);
   int32_t axis_default_value = (node.SinceVersion() < 13) ? 1 : -1;
   const auto axis = helper.Get("axis", axis_default_value);
-  const auto axis_nonnegative = HandleNegativeAxis(axis, data_shape.size());
+  auto axis_nonnegative = HandleNegativeAxis(axis, data_shape.size());
 
-  if (node.SinceVersion() >= 13 || (data_shape.size() == 2)) {
-    auto* coreml_softmaxnd = layer->mutable_softmaxnd();
-    coreml_softmaxnd->set_axis(axis);
-    *layer->mutable_input()->Add() = input_name;
-    *layer->mutable_output()->Add() = output_name;
-    model_builder.AddLayer(std::move(layer));
-  } else {
-    // note: if opsets < 13, onnx Softmax coerces the input shape to be 2D based on axis.
-    // we need to manually reshape to 2D and apply SoftmaxND to axis -1 to achieve equivalent results for CoreML.
-    TensorShape input_shape(data_shape);
-    const auto size_to_dimension = input_shape.SizeToDimension(axis_nonnegative);
-    const auto size_from_dimension = input_shape.SizeFromDimension(axis_nonnegative);
+#if defined(COREML_ENABLE_MLPROGRAM)
+  // CoreML's softmax match onnx's softmax behavior since opset 13.
+  // For opset < 13, we need to reshape to 2D and set axis to -1 to simulate onnx softmax behavior.
+  // [B,D,...](onnx softmax opset 12, axis=1)->[B,D*...](CoreML softmax, axis=-1)->[B,D,...](reshape back)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
+    auto input_dtype = node.InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
+    const int32_t elem_type = static_cast<int32_t>(input_dtype);
 
-    TensorShapeVector target_shape;
-    target_shape.push_back(size_to_dimension);
-    target_shape.push_back(size_from_dimension);
-
-    const auto reshape1_output_name = model_builder.GetUniqueName(node, "reshape1_output");
-    {  // Add reshape layer
-      auto reshape_layer = model_builder.CreateNNLayer(node, "_Softmax_reshape1");
-      *reshape_layer->mutable_reshapestatic()->mutable_targetshape() = {target_shape.cbegin(), target_shape.cend()};
-      *reshape_layer->mutable_input()->Add() = input_name;
-      *reshape_layer->mutable_output()->Add() = reshape1_output_name;
-      model_builder.AddLayer(std::move(reshape_layer));
+    std::string_view layer_input_name_x = node.InputDefs()[0]->Name();
+    const bool need_reshape = node.SinceVersion() < 13 && axis_nonnegative != static_cast<int64_t>(data_shape.size()) - 1;
+    std::vector<int64_t> target_shape;
+    if (need_reshape) {
+      // reshape to 2D to simulate onnx softmax behavior
+      auto reshape1 = model_builder.CreateOperation(node, "reshape", "pre");
+      TensorShape input_shape(data_shape);
+      target_shape.push_back(input_shape.SizeToDimension(axis_nonnegative));
+      target_shape.push_back(input_shape.SizeFromDimension(axis_nonnegative));
+      axis_nonnegative = 1;
+      AddOperationInput(*reshape1, "x", layer_input_name_x);
+      AddOperationInput(*reshape1, "shape", model_builder.AddConstant(reshape1->type(), "shape1", target_shape));
+      layer_input_name_x = model_builder.GetUniqueName(node, "ln_reshape1_");
+      AddIntermediateOperationOutput(*reshape1, layer_input_name_x, elem_type, target_shape);
+      model_builder.AddOperation(std::move(reshape1));
     }
-    const auto softmax_output_name = model_builder.GetUniqueName(node, "softmax_output");
-    {
-      auto* coreml_softmaxnd = layer->mutable_softmaxnd();
-      coreml_softmaxnd->set_axis(-1);
-      *layer->mutable_input()->Add() = reshape1_output_name;
-      *layer->mutable_output()->Add() = softmax_output_name;
+    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "softmax");
+    AddOperationInput(*op, "x", layer_input_name_x);
+    AddOperationInput(*op, "axis", model_builder.AddScalarConstant(op->type(), "axis", axis_nonnegative));
+    if (!need_reshape) {
+      AddOperationOutput(*op, *node.OutputDefs()[0]);
+      model_builder.AddOperation(std::move(op));
+    } else {
+      std::string_view ln_output_name = model_builder.GetUniqueName(node, "ln_reshape1_");
+      AddIntermediateOperationOutput(*op, ln_output_name, elem_type, target_shape);
+      model_builder.AddOperation(std::move(op));
+      auto reshape2 = model_builder.CreateOperation(node, "reshape", "post");
+      AddOperationInput(*reshape2, "x", ln_output_name);
+      AddOperationInput(*reshape2, "shape", model_builder.AddConstant(reshape2->type(), "shape2", data_shape));
+      AddOperationOutput(*reshape2, *node.OutputDefs()[0]);
+      model_builder.AddOperation(std::move(reshape2));
+    }
+  } else  // NOLINT
+#endif
+  {
+    if (node.SinceVersion() >= 13 || (data_shape.size() == 2)) {
+      auto* coreml_softmaxnd = layer->mutable_softmaxnd();
+      coreml_softmaxnd->set_axis(axis);
+      *layer->mutable_input()->Add() = input_name;
+      *layer->mutable_output()->Add() = output_name;
       model_builder.AddLayer(std::move(layer));
-    }
-    {
-      // Add reshape back layer
-      auto reshape_layer = model_builder.CreateNNLayer(node, "_Softmax_reshape2");
-      *reshape_layer->mutable_reshapestatic()->mutable_targetshape() = {data_shape.cbegin(), data_shape.cend()};
-      *reshape_layer->mutable_input()->Add() = softmax_output_name;
-      *reshape_layer->mutable_output()->Add() = output_name;
-      model_builder.AddLayer(std::move(reshape_layer));
+    } else {
+      // note: if opsets < 13, onnx Softmax coerces the input shape to be 2D based on axis.
+      // we need to manually reshape to 2D and apply SoftmaxND to axis -1 to achieve equivalent results for CoreML.
+      TensorShape input_shape(data_shape);
+      const auto size_to_dimension = input_shape.SizeToDimension(axis_nonnegative);
+      const auto size_from_dimension = input_shape.SizeFromDimension(axis_nonnegative);
+
+      TensorShapeVector target_shape;
+      target_shape.push_back(size_to_dimension);
+      target_shape.push_back(size_from_dimension);
+
+      const auto reshape1_output_name = model_builder.GetUniqueName(node, "reshape1_output");
+      {  // Add reshape layer
+        auto reshape_layer = model_builder.CreateNNLayer(node, "_Softmax_reshape1");
+        *reshape_layer->mutable_reshapestatic()->mutable_targetshape() = {target_shape.cbegin(), target_shape.cend()};
+        *reshape_layer->mutable_input()->Add() = input_name;
+        *reshape_layer->mutable_output()->Add() = reshape1_output_name;
+        model_builder.AddLayer(std::move(reshape_layer));
+      }
+      const auto softmax_output_name = model_builder.GetUniqueName(node, "softmax_output");
+      {
+        auto* coreml_softmaxnd = layer->mutable_softmaxnd();
+        coreml_softmaxnd->set_axis(-1);
+        *layer->mutable_input()->Add() = reshape1_output_name;
+        *layer->mutable_output()->Add() = softmax_output_name;
+        model_builder.AddLayer(std::move(layer));
+      }
+      {
+        // Add reshape back layer
+        auto reshape_layer = model_builder.CreateNNLayer(node, "_Softmax_reshape2");
+        *reshape_layer->mutable_reshapestatic()->mutable_targetshape() = {data_shape.cbegin(), data_shape.cend()};
+        *reshape_layer->mutable_input()->Add() = softmax_output_name;
+        *reshape_layer->mutable_output()->Add() = output_name;
+        model_builder.AddLayer(std::move(reshape_layer));
+      }
     }
   }
 
   return Status::OK();
 }
 
-bool SoftmaxOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /* input_params */,
+bool SoftmaxOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /*input_params*/,
                                          const logging::Logger& logger) const {
   const auto& input_defs = node.InputDefs();
   std::vector<int64_t> input_shape;
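The opset-12 lowering above rests on one identity: `Softmax<13` with `axis=k` coerces `[d0,...,dn-1]` to the 2D shape `[d0*...*dk-1, dk*...*dn-1]`, applies softmax over the last axis, and reshapes back, which is exactly the reshape/softmax/reshape chain emitted. An illustrative sketch of the shape math (not ORT code; `CoercedShape` is a made-up name):

```cpp
#include <cstdint>
#include <vector>

// Mirrors TensorShape::SizeToDimension / SizeFromDimension for a given axis.
std::vector<int64_t> CoercedShape(const std::vector<int64_t>& shape, size_t axis) {
  int64_t size_to = 1, size_from = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    (i < axis ? size_to : size_from) *= shape[i];
  }
  return {size_to, size_from};
}
// e.g. shape {2, 3, 4, 5} with axis=1 -> {2, 60}: softmax normalizes 60
// elements per row, then the result is reshaped back to {2, 3, 4, 5}.
// When axis already is the last dimension, no reshape is needed, which is
// what the need_reshape flag above checks.
```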
@@ -51,8 +51,8 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
   auto calculate_remainder_and_chunk_size = [&](int32_t num_outputs) {
     // note: checked in IsOpSupportedImpl that ensures the dim value at splitting axis exists
     auto split_dim_size = data_shape[HandleNegativeAxis(axis, data_shape.size())];
-    uint64_t chunk_size = (split_dim_size + num_outputs - 1) / num_outputs;
-    uint64_t remainder = split_dim_size % chunk_size;
+    int64_t chunk_size = (split_dim_size + num_outputs - 1) / num_outputs;
+    int64_t remainder = split_dim_size % chunk_size;
     return std::make_tuple(remainder, chunk_size);
   };
 

@@ -106,20 +106,20 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
       // if "split" is explicitly provided as an input
       // const auto& split_tensor = *model_builder.GetInitializerTensors().at(input_defs[1]->Name());
       Initializer unpacked_tensor(*model_builder.GetConstantInitializer(input_defs[1]->Name()));
-      auto split_span = unpacked_tensor.DataAsSpan<uint64_t>();
+      auto split_span = unpacked_tensor.DataAsSpan<int64_t>();
       for (const auto& split_size : split_span) {
         coreml_splitnd->add_splitsizes(split_size);
       }
     } else if (node.SinceVersion() < 18) {
-      uint64_t num_outputs = narrow<uint64_t>(node.OutputDefs().size());
+      int64_t num_outputs = narrow<int64_t>(node.OutputDefs().size());
       coreml_splitnd->set_numsplits(num_outputs);
     } else {
       // note: for opset 18+ 'num_outputs' is a required attribute
-      uint64_t num_outputs = narrow<uint64_t>(helper.GetInt64("num_outputs").value());
+      int64_t num_outputs = narrow<int64_t>(helper.GetInt64("num_outputs").value());
       auto [remainder, chunk_size] = calculate_remainder_and_chunk_size(static_cast<int32_t>(num_outputs));
       if (remainder) {
         // uneven
-        auto split_sizes = InlinedVector<uint64_t>(num_outputs, chunk_size);
+        auto split_sizes = InlinedVector<int64_t>(num_outputs, chunk_size);
         split_sizes.back() = remainder;
         for (size_t i = 0; i < split_sizes.size(); i++) {
           coreml_splitnd->add_splitsizes(split_sizes[i]);

@@ -162,7 +162,7 @@ bool SplitOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPar
   }
 
   const auto split_shape = *input_defs[1]->Shape();
-  if (split_shape.dim_size() < 2) {
+  if (split_shape.dim(0).dim_value() < 2) {
     LOGS(logger, VERBOSE) << "CoreML Split must produce at least 2 outputs.";
     return false;
   }
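The `calculate_remainder_and_chunk_size` lambda above implements ONNX's opset-18 `num_outputs` semantics: ceiling-divide the split axis, with the last chunk absorbing the shortfall. A standalone sketch, illustrative only and not ORT code (`SplitSizes` is a made-up name):

```cpp
#include <cstdint>
#include <vector>

std::vector<int64_t> SplitSizes(int64_t split_dim_size, int64_t num_outputs) {
  const int64_t chunk_size = (split_dim_size + num_outputs - 1) / num_outputs;  // ceil division
  const int64_t remainder = split_dim_size % chunk_size;
  std::vector<int64_t> sizes(num_outputs, chunk_size);
  if (remainder) sizes.back() = remainder;  // uneven split: last chunk is smaller
  return sizes;
}
// e.g. SplitSizes(10, 3): chunk_size = 4, remainder = 2 -> {4, 4, 2}, which is
// what gets passed to CoreML's splitsizes when the axis dim is not divisible.
```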
@@ -5,10 +5,13 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/providers/common.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
 #include "core/providers/shared/utils/utils.h"
 #include "core/optimizer/initializer.h"
+#include "core/providers/cpu/tensor/unsqueeze.h"
 
 namespace onnxruntime {
 namespace coreml {

@@ -21,16 +24,16 @@ class SqueezeOpBuilder : public BaseOpBuilder {
 
   bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                          const logging::Logger& logger) const override;
+  bool SupportsMLProgram() const override { return true; }
 };
 
 namespace {
-Status GetAxes(ModelBuilder& model_builder, const Node& node, std::vector<int64_t>& axes) {
+void GetAxes(ModelBuilder& model_builder, const Node& node, TensorShapeVector& axes) {
   // Squeeze opset 13 use input as axes
   if (node.SinceVersion() > 12) {
     // If axes is not provided, return an empty axes as default to squeeze all
     if (node.InputDefs().size() > 1) {
-      const auto& initializers(model_builder.GetInitializerTensors());
-      const auto& axes_tensor = *initializers.at(node.InputDefs()[1]->Name());
+      const auto& axes_tensor = *model_builder.GetConstantInitializer(node.InputDefs()[1]->Name());
       Initializer unpacked_tensor(axes_tensor);
       auto raw_axes = unpacked_tensor.DataAsSpan<int64_t>();
       const auto size = SafeInt<size_t>(axes_tensor.dims()[0]);

@@ -39,10 +42,9 @@ Status GetAxes(ModelBuilder& model_builder, const Node& node, std::vector<int64_
     }
   } else {
     NodeAttrHelper helper(node);
-    axes = helper.Get("axes", std::vector<int64_t>());
+    auto axes_attr = helper.Get("axes", std::vector<int64_t>());
+    axes.assign(axes_attr.begin(), axes_attr.end());
   }
-
-  return Status::OK();
 }
 }  // namespace
 

@@ -54,38 +56,83 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const
 
 Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
                                                const Node& node,
-                                               const logging::Logger& /* logger */) const {
+                                               [[maybe_unused]] const logging::Logger& logger) const {
   std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
 
+  const auto& input_defs(node.InputDefs());
   auto* coreml_squeeze = layer->mutable_squeeze();
-  std::vector<int64_t> axes;
-  ORT_RETURN_IF_ERROR(GetAxes(model_builder, node, axes));
-  if (axes.empty()) {
-    coreml_squeeze->set_squeezeall(true);
-  } else {
-    *coreml_squeeze->mutable_axes() = {axes.cbegin(), axes.cend()};
-    coreml_squeeze->set_squeezeall(false);
+  TensorShapeVector axes;
+  GetAxes(model_builder, node, axes);
+  std::vector<int64_t> input_shape;
+  GetShape(*input_defs[0], input_shape, logger);
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
+
+    std::string_view coreml_op_type = node.OpType() == "Squeeze" ? "squeeze" : "reshape";
+    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
+    AddOperationInput(*op, "x", input_defs[0]->Name());
+
+    if (coreml_op_type == "squeeze") {
+      if (!axes.empty()) {
+        // coreml squeeze op does support negative axes
+        AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
+      }
+    } else {
+      TensorShapeVector output_shape = UnsqueezeBase::ComputeOutputShape(TensorShape(input_shape), axes);
+      AddOperationInput(*op, "shape", model_builder.AddConstant(op->type(), "shape", AsSpan(output_shape)));
+    }
+    AddOperationOutput(*op, *node.OutputDefs()[0]);
+    model_builder.AddOperation(std::move(op));
+  } else  // NOLINT
+#endif
+  {
+    if (axes.empty()) {
+      coreml_squeeze->set_squeezeall(true);
+    } else {
+      *coreml_squeeze->mutable_axes() = {axes.cbegin(), axes.cend()};
+      coreml_squeeze->set_squeezeall(false);
+    }
+
+    *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
+    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+
+    model_builder.AddLayer(std::move(layer));
   }
-
-  *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
-  *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
-
-  model_builder.AddLayer(std::move(layer));
   return Status::OK();
 }
 
 bool SqueezeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
-                                         const logging::Logger& /*logger*/) const {
+                                         const logging::Logger& logger) const {
   // Squeeze opset 13 uses input 1 as axes, if we have input 1 then it needs to be an initializer
-  const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors();
-  if (node.SinceVersion() > 12 && node.InputDefs().size() > 1) {
-    const auto& axes_name = node.InputDefs()[1]->Name();
-    if (!Contains(initializers, axes_name)) {
-      LOGS_DEFAULT(VERBOSE) << "Input axes of Squeeze must be known";
+  const auto& input_defs = node.InputDefs();
+  if (node.SinceVersion() > 12 && input_defs.size() > 1) {
+    const auto& axes_name = input_defs[1]->Name();
+    if (!input_params.graph_viewer.GetConstantInitializer(axes_name)) {
+      LOGS(logger, VERBOSE) << "Input axes must be known";
       return false;
     }
   }
 
+  if (node.OpType() == "Unsqueeze") {
+    if (!input_params.create_mlprogram) {
+      return false;
+    }
+
+    int64_t num_of_new_dims = 0;
+    if (node.SinceVersion() > 12) {
+      num_of_new_dims = node.InputDefs()[1]->Shape()->dim(0).dim_value();
+    } else {
+      NodeAttrHelper helper(node);
+      auto axes = helper.Get("axes", std::vector<int64_t>());
+      num_of_new_dims = static_cast<int64_t>(axes.size());
+    }
+
+    std::vector<int64_t> input_shape;
+    if (!GetShape(*input_defs[0], input_shape, logger) || input_shape.size() + num_of_new_dims > 5) {
+      LOGS(logger, VERBOSE) << "Unsqueeze to output shape with > 5 dimensions is not supported";
+      return false;
+    }
+  }
  return true;
 }
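The trick above is that `Unsqueeze` has no direct CoreML counterpart, but its output shape is fully determined by the input shape and the axes, so it lowers to a `reshape` whose target shape comes from the reused `UnsqueezeBase::ComputeOutputShape`. An illustrative sketch of that computation (not ORT code; `UnsqueezeShape` is a made-up name):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> UnsqueezeShape(const std::vector<int64_t>& input_shape,
                                    std::vector<int64_t> axes) {
  const int64_t out_rank = static_cast<int64_t>(input_shape.size() + axes.size());
  for (auto& a : axes) {
    if (a < 0) a += out_rank;  // normalize negative axes against the output rank
  }
  std::sort(axes.begin(), axes.end());
  std::vector<int64_t> out;
  size_t next_axis = 0, next_in = 0;
  for (int64_t i = 0; i < out_rank; ++i) {
    if (next_axis < axes.size() && axes[next_axis] == i) {
      out.push_back(1);  // insert a new size-1 dimension here
      ++next_axis;
    } else {
      out.push_back(input_shape[next_in++]);
    }
  }
  return out;
}
// e.g. UnsqueezeShape({3, 4}, {0, 3}) -> {1, 3, 4, 1}; the builder feeds this
// vector as the "shape" input of the reshape op. The 5-dimension output cap
// enforced in IsOpSupportedImpl above bounds out_rank.
```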
@@ -16,6 +16,8 @@ class UnaryOpBuilder : public BaseOpBuilder {
   Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
                                const logging::Logger& logger) const override;
   bool SupportsMLProgram() const override { return true; }
+  bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
+                         const logging::Logger& logger) const override;
 };
 
 Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,

@@ -32,6 +34,10 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
     coreml_op_type = "sqrt";
   } else if (op_type == "Reciprocal") {
     coreml_op_type = "inverse";
+  } else if (op_type == "Erf") {
+    coreml_op_type = "erf";
+  } else if (op_type == "Round") {
+    coreml_op_type = "round";
   } else {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                            "UnaryOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type);

@@ -74,6 +80,14 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   return Status::OK();
 }
 
+bool UnaryOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
+                                       const logging::Logger& /*logger*/) const {
+  if (!input_params.create_mlprogram && (node.OpType() == "Erf" || node.OpType() == "Round")) {
+    return false;
+  }
+  return true;
+}
+
 void CreateUnaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
   op_registrations.builders.push_back(std::make_unique<UnaryOpBuilder>());
   op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get());
@@ -24,13 +24,16 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
   CreateActivationOpBuilder("Gelu", op_registrations);
 
   // Unary ops
+  CreateUnaryOpBuilder("Erf", op_registrations);
   CreateUnaryOpBuilder("Reciprocal", op_registrations);
+  CreateUnaryOpBuilder("Round", op_registrations);
   CreateUnaryOpBuilder("Sqrt", op_registrations);
 
   // Binary elementwise ops
   CreateBinaryOpBuilder("Add", op_registrations);
   CreateBinaryOpBuilder("Div", op_registrations);
   CreateBinaryOpBuilder("Mul", op_registrations);
+  CreateBinaryOpBuilder("Max", op_registrations);
   CreateBinaryOpBuilder("Pow", op_registrations);
   CreateBinaryOpBuilder("Sub", op_registrations);

@@ -42,6 +45,9 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
 
   // Reduction ops
   CreateReductionOpBuilder("ReduceMean", op_registrations);
+  CreateReductionOpBuilder("ReduceMin", op_registrations);
+  CreateReductionOpBuilder("ReduceMax", op_registrations);
+  CreateReductionOpBuilder("ReduceProd", op_registrations);
   CreateReductionOpBuilder("ReduceSum", op_registrations);
 
   // Normalization ops

@@ -72,6 +78,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
   CreateSoftmaxOpBuilder("Softmax", op_registrations);
   CreateSqueezeOpBuilder("Squeeze", op_registrations);
   CreateTransposeOpBuilder("Transpose", op_registrations);
+  CreateSqueezeOpBuilder("Unsqueeze", op_registrations);
 
   return op_registrations;
 }
@@ -20,15 +20,6 @@ class UnsqueezeBase {
   };
 
   Status PrepareCompute(OpKernelContext* context, Prepare& p) const;
 
- protected:
-  UnsqueezeBase(const OpKernelInfo& info) {
-    size_t num_inputs = info.GetInputCount();
-    if (num_inputs == 1) {  // axes must be a valid attribute
-      ORT_ENFORCE(info.GetAttrs("axes", axes_).IsOK(), "Missing/Invalid 'axes' attribute value");
-    }
-  }
-
   static TensorShapeVector ComputeOutputShape(
       const TensorShape& input_shape,
       const TensorShapeVector& axes) {

@@ -59,6 +50,14 @@ class UnsqueezeBase {
     return output_shape;
   }
 
+ protected:
+  UnsqueezeBase(const OpKernelInfo& info) {
+    size_t num_inputs = info.GetInputCount();
+    if (num_inputs == 1) {  // axes must be a valid attribute
+      ORT_ENFORCE(info.GetAttrs("axes", axes_).IsOK(), "Missing/Invalid 'axes' attribute value");
+    }
+  }
+
   TensorShapeVector axes_;
 };
@@ -2271,6 +2271,21 @@ TEST(MathOpTest, Max_12_MLFloat16_Scalar1) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: Input batch size is inconsistent
 }
 
+TEST(MathOpTest, Max_12_MLFloat16_Scalar2) {
+  OpTester test("Max", 12);
+  test.AddInput<MLFloat16>("data_0", {1},
+                           MakeMLFloat16({-1.f}));
+  test.AddInput<MLFloat16>("data_1", {},
+                           MakeMLFloat16({2.f}));
+  test.AddInput<MLFloat16>("data_2", {1, 3},
+                           MakeMLFloat16({-2.f, -3.f, -4.f}));
+  test.AddInput<MLFloat16>("data_3", {1, 1, 3},
+                           MakeMLFloat16({-2.f, -3.f, -4.f}));
+  test.AddOutput<MLFloat16>("max", {1, 1, 3},
+                            MakeMLFloat16({2.f, 2.f, 2.f}));
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: Input batch size is inconsistent
+}
+
 TEST(MathOpTest, Max_13_Float16_MatrixVector) {
   TestFloat16MinMax("Max",
                     {4, 3},
@@ -1375,7 +1375,7 @@ TEST(ReductionOpTest, ReduceMax_double) {
   test.Run();
 }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
 TEST(ReductionOpTest, ReduceMax_half) {
   OpTester test("ReduceMax");
   test.AddAttribute("axes", std::vector<int64_t>{1, 2});

@@ -2158,7 +2158,7 @@ TEST(ReductionOpTest, ReduceMin_double) {
   test.Run();
 }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
 TEST(ReductionOpTest, ReduceMin_half) {
   OpTester test("ReduceMin");
   test.AddAttribute("axes", std::vector<int64_t>{0, 2});

@@ -2356,7 +2356,7 @@ TEST(ReductionOpTest, ReduceSum_int32) {
   test.Run();
 }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
 TEST(ReductionOpTest, ReduceSumHalfHalf) {
   OpTester test("ReduceSum");
   test.AddAttribute("keepdims", (int64_t)0);

@@ -5681,7 +5681,7 @@ TEST(ReductionOpTest, ReduceSum_RK_parallel) {
   test.AddOutput<float>("reduced", {32}, expected);
 
-  test.Run(OpTester::ExpectResult::kExpectSuccess);
+  // CoreML does not provide 1e-5 precision here (it's off by 1e-4)
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCoreMLExecutionProvider});
 }
 
 TEST(ReductionOpTest, ReduceSum_RK_keepdims) {
@@ -11,7 +11,7 @@ namespace test {
 
 // Disable TensorRT on the tests because of SegFault errors in the parser
 
-TEST(TensorOpTest, Unsqueeze_1) {
+TEST(UnsqueezeOpTest, Unsqueeze_1) {
   OpTester test("Unsqueeze");
 
   test.AddAttribute("axes", std::vector<int64_t>{1});

@@ -20,7 +20,7 @@ TEST(TensorOpTest, Unsqueeze_1) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
-TEST(TensorOpTest, Unsqueeze_1_int32) {
+TEST(UnsqueezeOpTest, Unsqueeze_1_int32) {
   OpTester test("Unsqueeze");
 
   test.AddAttribute("axes", std::vector<int64_t>{1});

@@ -29,7 +29,7 @@ TEST(TensorOpTest, Unsqueeze_1_int32) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
-TEST(TensorOpTest, Unsqueeze_2) {
+TEST(UnsqueezeOpTest, Unsqueeze_2) {
   OpTester test("Unsqueeze");
 
   test.AddAttribute("axes", std::vector<int64_t>{0, 4});

@@ -38,7 +38,7 @@ TEST(TensorOpTest, Unsqueeze_2) {
   test.Run();
 }
 
-TEST(TensorOpTest, Unsqueeze_3) {
+TEST(UnsqueezeOpTest, Unsqueeze_3) {
   OpTester test("Unsqueeze");
 
   test.AddAttribute("axes", std::vector<int64_t>{2, 1, 0});

@@ -47,7 +47,7 @@ TEST(TensorOpTest, Unsqueeze_3) {
   test.Run();
 }
 
-TEST(TensorOpTest, Unsqueeze_scalar) {
+TEST(UnsqueezeOpTest, Unsqueeze_scalar) {
   {
     OpTester test("Unsqueeze");

@@ -85,7 +85,7 @@ TEST(TensorOpTest, Unsqueeze_scalar) {
   run_test(true);
 }
 
-TEST(TensorOpTest, Unsqueeze_scalar_2) {
+TEST(UnsqueezeOpTest, Unsqueeze_scalar_2) {
   {
     OpTester test("Unsqueeze");

@@ -105,7 +105,7 @@ TEST(TensorOpTest, Unsqueeze_scalar_2) {
   run_test(true);
 }
 
-TEST(TensorOpTest, Unsqueeze_Duplicate) {
+TEST(UnsqueezeOpTest, Unsqueeze_Duplicate) {
   {
     OpTester test("Unsqueeze", 12);  // opset 1-12 has axes attribute

@@ -128,7 +128,7 @@ TEST(TensorOpTest, Unsqueeze_Duplicate) {
   }
 }
 
-TEST(TensorOpTest, Unsqueeze_OutOfRange) {
+TEST(UnsqueezeOpTest, Unsqueeze_OutOfRange) {
   {
     OpTester test("Unsqueeze", 12);  // opset 1-12 has axes attribute
     test.AddAttribute("axes", std::vector<int64_t>{4});

@@ -149,7 +149,7 @@ TEST(TensorOpTest, Unsqueeze_OutOfRange) {
   }
 }
 
-TEST(TensorOpTest, UnsqueezeNegAxis_3) {
+TEST(UnsqueezeOpTest, UnsqueezeNegAxis_3) {
   {
     OpTester test("Unsqueeze", 12);  // opset 1-12 has axes attribute
     test.AddAttribute("axes", std::vector<int64_t>{-4, 1, -6});

@@ -171,7 +171,7 @@ TEST(TensorOpTest, UnsqueezeNegAxis_3) {
   run_test(true);
 }
 
-TEST(TensorOpTest, Unsqueeze_1_int32_axes_input) {
+TEST(UnsqueezeOpTest, Unsqueeze_1_int32_axes_input) {
   auto run_test = [](bool axes_is_initializer) {
     OpTester test("Unsqueeze", 13);

@@ -185,7 +185,7 @@ TEST(TensorOpTest, Unsqueeze_1_int32_axes_input) {
   run_test(true);
 }
 
-TEST(TensorOpTest, Unsqueeze_3_axes_input) {
+TEST(UnsqueezeOpTest, Unsqueeze_3_axes_input) {
   auto run_test = [](bool axes_is_initializer) {
     OpTester test("Unsqueeze", 13);

@@ -200,7 +200,7 @@ TEST(TensorOpTest, Unsqueeze_3_axes_input) {
 }
 
 #if defined(USE_DNNL)
-TEST(TensorOpTest, Unsqueeze_3_axes_input_bfloat16) {
+TEST(UnsqueezeOpTest, Unsqueeze_3_axes_input_bfloat16) {
 #ifdef USE_DNNL
   if (!DnnlHasBF16Support()) {
     LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16";

@@ -218,7 +218,7 @@ TEST(TensorOpTest, Unsqueeze_3_axes_input_bfloat16) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
-TEST(TensorOpTest, UnsqueezeNegAxis_3_bfloat16) {
+TEST(UnsqueezeOpTest, UnsqueezeNegAxis_3_bfloat16) {
 #ifdef USE_DNNL
   if (!DnnlHasBF16Support()) {
     LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16";
@@ -13,6 +13,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>output_padding must have default values.|
 |ai.onnx:DepthToSpace|If 'mode' is 'CRD' the input must have a fixed shape.|
 |ai.onnx:Div||
+|ai.onnx:Erf||
 |ai.onnx:Gemm|Input B must be constant.|
 |ai.onnx:Gelu||
 |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|

@@ -24,17 +25,26 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:LeakyRelu||
 |ai.onnx:MatMul|Only support for transA == 0, alpha == 1.0 and beta == 1.0 is currently implemented.|
 |ai.onnx:MaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
+|ai.onnx:Max||
 |ai.onnx:Mul||
 |ai.onnx:Pow|Only supports cases when both inputs are fp32.|
 |ai.onnx:PRelu||
 |ai.onnx:Reciprocal|this ask for a `epislon` (default 1e-4) where onnx don't provide|
+|ai.onnx:ReduceSum||
+|ai.onnx:ReduceMean||
+|ai.onnx:ReduceMax||
 |ai.onnx:Relu||
 |ai.onnx:Reshape||
 |ai.onnx:Resize|See [resize_op_builder.cc](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc) implementation. There are too many permutations to describe the valid combinations.|
+|ai.onnx:Round||
+|ai.onnx:Shape||
 |ai.onnx:Slice|starts/ends/axes/steps must be constant initializers.|
 |ai.onnx:Split|If provided, `splits` must be constant.|
 |ai.onnx:Sub||
 |ai.onnx:Sigmoid||
+|ai.onnx:Softmax||
 |ai.onnx:Sqrt||
+|ai.onnx:Squeeze||
 |ai.onnx:Tanh||
 |ai.onnx:Transpose||
+|ai.onnx:Unsqueeze||