Add the MLlama Imaging Processing Support (#823)

* initial checkins for mllama image process * fix some tests * some fixings * add more image * More test assertions * parity test passed * code clean up * code refinement
2024-10-22 14:24:09 -07:00 · 2024-10-22 14:24:09 -07:00 · aa2c82fa67
--- a/include/custom_op/tensor_api.h
+++ b/include/custom_op/tensor_api.h
@ -557,7 +557,6 @@ class Tensor<std::string_view> : public TensorBase {
  std::unique_ptr<IStringTensorStorage<std::string_view>> storage_;
 };

-
 template<typename ...Args>
 class NamedArgumentDict{
 public:
--- a/onnxruntime_extensions/pp_api.py
+++ b/onnxruntime_extensions/pp_api.py
@ -72,8 +72,8 @@ class ImageProcessor:
        return image_pre_process(self.processor, images)

    @staticmethod
-    def to_numpy(result):
-        return tensor_result_get_at(result, 0)
+    def to_numpy(result, idx):
+        return tensor_result_get_at(result, idx)

    def __del__(self):
        if delete_object and self.processor:
--- a/shared/api/c_api_processor.cc
+++ b/shared/api/c_api_processor.cc
@ -95,5 +95,5 @@ extError_t ORTX_API_CALL OrtxImagePreProcess(OrtxProcessor* processor, OrtxRawIm
    *result = nullptr;
  }

-  return {};
+  return status.Code();
 }
--- a/shared/api/c_api_utils.cc
+++ b/shared/api/c_api_utils.cc
@ -160,26 +160,3 @@ extError_t ORTX_API_CALL OrtxGetTensorData(OrtxTensor* tensor, const void** data
  return extError_t();
 }

-extError_t ORTX_API_CALL OrtxGetTensorDataInt64(OrtxTensor* tensor, const int64_t** data, const int64_t** shape,
-                                                size_t* num_dims) {
-  const void* data_ptr{};
-  auto err = OrtxGetTensorData(tensor, &data_ptr, shape, num_dims);
-  *data = reinterpret_cast<const int64_t*>(data_ptr);  // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
-  return err;
-}
-
-extError_t ORTX_API_CALL OrtxGetTensorDataFloat(OrtxTensor* tensor, const float** data, const int64_t** shape,
-                                                size_t* num_dims) {
-  const void* data_ptr{};
-  auto err = OrtxGetTensorData(tensor, &data_ptr, shape, num_dims);
-  *data = reinterpret_cast<const float*>(data_ptr);  // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
-  return err;
-}
-
-extError_t ORTX_API_CALL OrtxGetTensorDataUint8(OrtxTensor* tensor, const uint8_t** data, const int64_t** shape,
-                                                size_t* num_dims) {
-  const void* data_ptr{};
-  auto err = OrtxGetTensorData(tensor, &data_ptr, shape, num_dims);
-  *data = reinterpret_cast<const uint8_t*>(data_ptr);  // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
-  return err;
-}
--- a/shared/api/c_api_utils.hpp
+++ b/shared/api/c_api_utils.hpp
@ -4,6 +4,7 @@
 #pragma once
 #include <vector>
 #include <fstream>
+#include <variant>

 #include "ortx_utils.h"
 #include "file_sys.h"
@ -258,4 +259,10 @@ std::tuple<std::unique_ptr<T[]>, size_t> LoadRawData(It begin, It end) {

  return std::make_tuple(std::move(raw_data), n);
 }
+
+using AttrType =
+  std::variant<std::string, double, int64_t, std::vector<std::string>, std::vector<double>, std::vector<int64_t>>;
+using AttrDict = std::unordered_map<std::string, AttrType>;
 }  // namespace ort_extensions
+
+namespace ortx = ort_extensions;
--- a/shared/api/image_processor.cc
+++ b/shared/api/image_processor.cc
@ -17,19 +17,28 @@
    #include "image_decoder.hpp"
  #endif
 #else
-#include "image_decoder.hpp"
+  #include "image_decoder.hpp"
 #endif
+
 #include "image_transforms.hpp"
 #include "image_transforms_phi_3.hpp"
+#include "image_transforms_mllama.hpp"
+
+namespace ort_extensions {
+std::tuple<std::unique_ptr<ImageRawData[]>, size_t>
+LoadRawImages(const std::initializer_list<const char*>& image_paths) {
+  return ort_extensions::LoadRawData<const char* const*, ImageRawData>(image_paths.begin(), image_paths.end());
+}
+
+std::tuple<std::unique_ptr<ImageRawData[]>, size_t>
+LoadRawImages(const char* image_paths[], size_t num_images) {
+  return ort_extensions::LoadRawData<const char* const*, ImageRawData>(image_paths, image_paths + num_images);
+}
+}  // namespace ort_extensions

 using namespace ort_extensions;
 using json = nlohmann::json;

-std::tuple<std::unique_ptr<ImageRawData[]>, size_t>
-ort_extensions::LoadRawImages(const std::initializer_list<const char*>& image_paths) {
-  return ort_extensions::LoadRawData<const char* const*, ImageRawData>(image_paths.begin(), image_paths.end());
-}
-
 Operation::KernelRegistry ImageProcessor::kernel_registry_ = {
    {"DecodeImage", []() { return CreateKernelInstance(&DecodeImage::Compute); }},
    {"Resize", []() { return CreateKernelInstance(&Resize::Compute); }},
@ -37,7 +46,9 @@ Operation::KernelRegistry ImageProcessor::kernel_registry_ = {
    {"Normalize", []() { return CreateKernelInstance(&Normalize::Compute); }},
    {"CenterCrop", []() { return CreateKernelInstance(&CenterCrop::Compute); }},
    {"ConvertRGB", []() { return CreateKernelInstance(convert_to_rgb); }},
+    {"Permute3D", []() { return CreateKernelInstance(&Permute3D::Compute); }},
    {"Phi3ImageTransform", []() { return CreateKernelInstance(phi3_hd_transform); }},
+    {"Llama3ImageTransform", []() { return CreateKernelInstance(&Llama3ImageTransform::Compute); }},
 };

 OrtxStatus ImageProcessor::Init(std::string_view processor_def) {
@ -189,7 +200,6 @@ OrtxStatus ImageProcessor::PreProcess(ort_extensions::span<ImageRawData> image_d
  operations_.back()->ResetTensors(allocator_);
  if (status.IsOk()) {
    r.SetTensors(std::move(img_result));
-    // r.SetTensorTypes({kOrtxFloat, kOrtxInt64, kOrtxInt64});
  }

  return status;
--- a/shared/api/image_processor.h
+++ b/shared/api/image_processor.h
@ -19,6 +19,9 @@ using ImageRawData = std::vector<uint8_t>;
 std::tuple<std::unique_ptr<ImageRawData[]>, size_t> LoadRawImages(
  const std::initializer_list<const char*>& image_paths);

+std::tuple<std::unique_ptr<ImageRawData[]>, size_t> LoadRawImages(
+  const char* image_paths[], size_t num_images);
+
 class ProcessorResult : public OrtxObjectImpl {
 public:
  ProcessorResult() : OrtxObjectImpl(kOrtxKindProcessorResult) {}
@ -26,6 +29,7 @@ class ProcessorResult : public OrtxObjectImpl {
  ortc::Tensor<int64_t>* image_sizes{};
  ortc::Tensor<int64_t>* num_img_tokens{};
 };
+
 class ImageProcessor : public OrtxObjectImpl {
 public:
  ImageProcessor();
--- a/shared/api/image_transforms.hpp
+++ b/shared/api/image_transforms.hpp
@ -3,9 +3,31 @@

 #pragma once

-#include "ocos.h"
+#include "ext_status.h"
+#include "op_def_struct.h"
 #include "image_resample.h"

+template <typename T>
+void DumpTensorToFile(const ortc::Tensor<T>& tensor, const char* name) {
+#if WIN32
+  auto tic = GetTickCount();
+  std::string dtype;
+  if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, std::byte>) {
+    dtype = "_u_";
+  } else {
+    dtype = "_f_";
+  }
+  dtype += std::to_string(tensor.Shape()[1]);
+  // use tic to be filename in a temp file name
+  auto filename = std::string("\\temp\\") + name + std::to_string(tic) + dtype + ".bin";
+  std::ofstream file(filename, std::ios::out | std::ios::binary);
+  if (file.is_open()) {
+    file.write(reinterpret_cast<const char*>(tensor.DataRaw()), tensor.SizeInBytes());
+    file.close();
+  }
+#endif
+}
+
 inline OrtxStatus convert_to_rgb(const ortc::Tensor<uint8_t>& input, ortc::Tensor<uint8_t>& output) {
  auto& dimensions = input.Shape();
  if (dimensions.size() != 3ULL || dimensions[2] != 3) {
@ -31,23 +53,13 @@ inline OrtxStatus convert_to_rgb(const ortc::Tensor<uint8_t>& input, ortc::Tenso
 }

 struct Resize {
-  template <typename DictT>
-  OrtxStatus Init(const DictT& attrs) {
-    for (const auto& [key, value] : attrs) {
-      if (key == "height") {
-        height_ = std::get<int64_t>(value);
-      } else if (key == "width") {
-        width_ = std::get<int64_t>(value);
-      } else if (key == "interpolation") {
-        interpolation_ = std::get<std::string>(value);
-        if (interpolation_ != "NEAREST" && interpolation_ != "LINEAR" && interpolation_ != "CUBIC") {
-          return {kOrtxErrorInvalidArgument, "[Resize]: Invalid interpolation method"};
-        }
-      } else {
-        return {kOrtxErrorInvalidArgument, "[Resize]: Invalid argument"};
-      }
-    }
-    return {};
+  static const std::unordered_map<std::string, int> InterpolationMethods() {
+    return {
+      {"NEAREST", IMAGING_TRANSFORM_NEAREST},
+      {"LINEAR", IMAGING_TRANSFORM_BILINEAR},
+      {"CUBIC", IMAGING_TRANSFORM_BICUBIC},
+      {"LANCZOS", IMAGING_TRANSFORM_LANCZOS}
+    };
  }

  OrtxStatus Compute(const ortc::Tensor<uint8_t>& input, ortc::Tensor<uint8_t>& output) {
@ -72,40 +84,57 @@ struct Resize {
      }
    }

-    int interp = IMAGING_TRANSFORM_NEAREST;
-    if (interpolation_ == "NEAREST") {
-      interp = IMAGING_TRANSFORM_NEAREST;
-    } else if (interpolation_ == "LINEAR") {
-      interp = IMAGING_TRANSFORM_BILINEAR;
-    } else if (interpolation_ == "CUBIC") {
-      interp = IMAGING_TRANSFORM_BICUBIC;
-    } else if (interpolation_ == "LANCZOS") {
-      interp = IMAGING_TRANSFORM_LANCZOS;
-    } else {
-      return {kOrtxErrorInvalidArgument, "[Resize]: Invalid interpolation method"};
+    int interp = InterpolationMethods().at(interpolation_);
+    float box[4] = {0.0f, 0.0f, static_cast<float>(w), static_cast<float>(h)};
+    auto [height, width] = std::make_tuple(height_, width_);
+
+    if (keep_aspect_ratio_) {
+      double scale = (std::max)(static_cast<double>(width) / w, static_cast<double>(height) / h);
+      width = static_cast<int64_t>(w * scale);
+      height = static_cast<int64_t>(h * scale);
    }

-    float box[4] = {0.0f, 0.0f, static_cast<float>(width_), static_cast<float>(height_)};
-
-    auto output_image = ImagingResample(rgb_image, static_cast<int>(width_), static_cast<int>(height_), interp, box);
-    // cv::resize(image, output_image, {static_cast<int32_t>(width_), static_cast<int32_t>(height_)}, 0.0, 0.0, interp);
+    auto output_image = ImagingResample(rgb_image, static_cast<int>(width), static_cast<int>(height), interp, box);
    ImagingDelete(rgb_image);

-    auto* p_output_image = output.Allocate({height_, width_, c});
-    for (auto i = height_ - height_; i < height_; ++i) {
-      for (auto j = width_ - width_; j < width_; ++j) {
-        auto c0_index = i * width_ * c + j * c;
+    auto* p_output_image = output.Allocate({height, width, c});
+    for (auto i = height - height; i < height; ++i) {
+      for (auto j = width - width; j < width; ++j) {
+      auto c0_index = i * width * c + j * c;
      std::memcpy(p_output_image + c0_index, output_image->image[i] + j * 4, c);
      }
    }
+//    DumpTensor(output);

    ImagingDelete(output_image);
    return {};
  }

+  template <typename DictT>
+  OrtxStatus Init(const DictT& attrs) {
+    for (const auto& [key, value] : attrs) {
+      if (key == "height") {
+        height_ = std::get<int64_t>(value);
+      } else if (key == "width") {
+        width_ = std::get<int64_t>(value);
+      } else if (key == "keep_aspect_ratio") {
+        keep_aspect_ratio_ = std::get<int64_t>(value) != 0;
+      } else if (key == "interpolation") {
+        interpolation_ = std::get<std::string>(value);
+        if (InterpolationMethods().find(interpolation_) == InterpolationMethods().end()) {
+          return {kOrtxErrorInvalidArgument, "[Resize]: Invalid interpolation method"};
+        }
+      } else {
+        return {kOrtxErrorInvalidArgument, "[Resize]: Invalid argument"};
+      }
+    }
+    return {};
+  }
+
 private:
  int64_t height_{256};
  int64_t width_{256};
+  bool keep_aspect_ratio_{true};
  std::string interpolation_{"CUBIC"};  // LINEAR, NEAREST, CUBIC
 };

@ -113,7 +142,7 @@ struct Rescale {
  template <typename DictT>
  OrtxStatus Init(const DictT& attrs) {
    for (const auto& [key, value] : attrs) {
-      if (key == "scale") {
+      if (key == "rescale_factor") {
        scale_ = static_cast<float>(std::get<double>(value));
      } else {
        return {kOrtxErrorInvalidArgument, "[Rescale]: Invalid argument"};
@ -139,7 +168,7 @@ struct Rescale {
      for (int64_t k = 0; k < w; ++k) {
        auto c0_index = j * w * c + k * c;
        for (int64_t l = 0; l < c; ++l) {
-          p_output_image[c0_index + l] = input_data[c0_index + l] * scale_;
+          p_output_image[c0_index + l] = static_cast<float>(input_data[c0_index + l]) * scale_;
        }
      }
    }
@ -220,7 +249,6 @@ struct CenterCrop {
  // s_h = torch.div((img_h - height), 2, rounding_mode='trunc')
  // s_w = torch.div((img_w - width), 2, rounding_mode='trunc')
  // x = img[:, :, s_h:s_h + height, s_w:s_w + width]
-
  OrtxStatus Compute(const ortc::Tensor<uint8_t>& input, ortc::Tensor<uint8_t>& output) {
    auto& dimensions = input.Shape();
    if (dimensions.size() != 3ULL) {
@ -252,3 +280,47 @@ struct CenterCrop {
  int64_t target_h_{224};
  int64_t target_w_{224};
 };
+
+struct Permute3D {
+
+  OrtxStatus Compute(const ortc::Tensor<float>& input, ortc::Tensor<float>& output) {
+    auto& dimensions = input.Shape();
+    if (dimensions.size() != 3ULL || dims_.size() != 3ULL) {
+      return {kOrtxErrorInvalidArgument, "[Permute]: Only 3D tensors are supported"};
+    }
+
+    auto* input_data = input.Data();
+    std::vector<int64_t> output_shape = {dimensions[dims_[0]], dimensions[dims_[1]], dimensions[dims_[2]]};
+    auto* p_output_image = output.Allocate(output_shape);
+
+    for (int64_t i = 0; i < dimensions[0]; ++i) {
+      for (int64_t j = 0; j < dimensions[1]; ++j) {
+        for (int64_t k = 0; k < dimensions[2]; ++k) {
+          auto c0_index = i * dimensions[1] * dimensions[2] + j * dimensions[2] + k;
+          auto c1_index = (dims_[0] == 0 ? i : (dims_[0] == 1 ? j : k)) * output_shape[1] * output_shape[2] +
+                          (dims_[1] == 0 ? i : (dims_[1] == 1 ? j : k)) * output_shape[2] +
+                          (dims_[2] == 0 ? i : (dims_[2] == 1 ? j : k));
+          p_output_image[c1_index] = input_data[c0_index];
+        }
+      }
+    }
+
+    return {};
+  }
+
+  template <typename DictT>
+  OrtxStatus Init(const DictT& attrs) {
+    for (const auto& [key, value] : attrs) {
+      if (key == "dims") {
+        dims_ = std::get<std::vector<int64_t>>(value);
+      } else {
+        return {kOrtxErrorInvalidArgument, "[Permute]: Invalid argument"};
+      }
+    }
+
+    return {};
+  }
+
+ private:
+  std::vector<int64_t> dims_{1, 2, 0};
+};
--- a/shared/api/image_transforms_mllama.hpp
+++ b/shared/api/image_transforms_mllama.hpp
@ -0,0 +1,390 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <vector>
+
+#include "ortx_processor.h"
+#include "c_api_utils.hpp"
+#include "image_resample.h"
+#include "image_transforms.hpp"
+
+struct Llama3ImageTransform {
+  static void SplitIntoTitles(const ortc::Tensor<float>& normalized_image, ortc::Tensor<float>& pixel_values,
+                              int64_t tile_height, int64_t tile_width) {
+    auto& shape = normalized_image.Shape();
+    int64_t image_height = shape[0];
+    int64_t image_width = shape[1];
+    int64_t num_channels = shape[2];
+
+    const int64_t image_1c_size = tile_height * tile_width;
+    assert(image_height % tile_height == 0);
+    int64_t num_tiles_height = static_cast<int64_t>(image_height / tile_height);
+    assert(image_width % tile_width == 0);
+    int64_t num_tiles_width = static_cast<int64_t>(image_width / tile_width);
+
+    auto p_normalized_image = normalized_image.Data();
+    // shape (num_tiles_width * num_tiles_height, num_channels, tile_height, tile_width)
+    float* output_pixel =
+        pixel_values.Allocate({num_tiles_height * num_tiles_width, num_channels, tile_height, tile_width});
+
+    // From (image_height, image_width, num_channels)
+    // Permute to (num_tiles_height, num_tiles_width, num_channels, tile_height, tile_width)
+    for (int64_t i = 0; i < num_tiles_height; ++i) {
+      for (int64_t j = 0; j < num_tiles_width; ++j) {
+        // convert to be channel first
+        for (int64_t k = 0; k < num_channels; ++k) {
+          auto sub_index = image_1c_size * (i * num_tiles_width + j) * num_channels + image_1c_size * k;
+          for (int64_t y = 0; y < tile_height; ++y) {
+            for (int64_t x = 0; x < tile_width; ++x) {
+              output_pixel[sub_index + y * tile_width + x] =
+                  p_normalized_image[(i * tile_height + y) * image_width * num_channels +
+                                     (j * tile_width + x) * num_channels + k];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  OrtxStatus Compute(const ortc::Tensor<uint8_t>& image, ortc::Tensor<float>& pixel_values,
+                     ortc::Tensor<int64_t>& aspect_ratio_ids, ortc::Tensor<int64_t>& aspect_ratio_mask,
+                     ortc::Tensor<int64_t>& num_tiles) {
+    auto& dimensions = image.Shape();
+    if (dimensions.size() != 3ULL) {
+      return {kOrtxErrorInvalidArgument, "[Llama3ImageTransform]: Only 3D decoded image tensors are supported"};
+    }
+
+    std::pair<int64_t, int64_t> aspect_ratio;
+    ortc::Tensor<uint8_t> resized_image(&ortx::CppAllocator::Instance());
+    OrtxStatus status = DoResize(image, resized_image, aspect_ratio);
+    if (!status.IsOk()) {
+      return status;
+    }
+
+    ortc::Tensor<uint8_t> padded_image(&ortx::CppAllocator::Instance());
+    status = DoPad(resized_image, aspect_ratio, padded_image);
+    if (!status.IsOk()) {
+      return status;
+    }
+    resized_image.Release();
+
+    ortc::Tensor<float> rescaled_image(&ortx::CppAllocator::Instance());
+    status = rescale_.Compute(padded_image, rescaled_image);
+    if (!status.IsOk()) {
+      return status;
+    }
+
+    ortc::Tensor<float> normalized_image(&ortx::CppAllocator::Instance());
+    status = normalize_.Compute(rescaled_image, normalized_image);
+    if (!status.IsOk()) {
+      return status;
+    }
+
+    // DumpTensorToFile(normalized_image, "normalized_image");
+
+    SplitIntoTitles(normalized_image, pixel_values, tile_size_.first, tile_size_.second);
+
+    std::vector<std::pair<int64_t, int64_t>> aspect_ratios = {aspect_ratio};
+    auto v_aspect_ratio_ids = ConvertAspectRatiosToIds(aspect_ratios, max_image_tiles_);
+    auto v_aspect_ratio_mask = BuildAspectRatioMask(aspect_ratios, max_image_tiles_);
+
+    auto p_ids = aspect_ratio_ids.Allocate({static_cast<int64_t>(v_aspect_ratio_ids.size())});
+    std::copy(v_aspect_ratio_ids.begin(), v_aspect_ratio_ids.end(), p_ids);
+
+    auto p_mask = aspect_ratio_mask.Allocate({static_cast<int64_t>(v_aspect_ratio_mask[0].size())});
+    std::copy(v_aspect_ratio_mask[0].begin(), v_aspect_ratio_mask[0].end(), p_mask);
+
+    auto p_num_tiles = num_tiles.Allocate({1});
+    p_num_tiles[0] = aspect_ratios[0].first * aspect_ratios[0].second;
+
+    return status;
+  }
+
+ private:
+  static std::vector<std::pair<int64_t, int64_t>> GetAllSupportedAspectRatios(int64_t max_image_tiles) {
+    std::vector<std::pair<int64_t, int64_t>> aspect_ratios;
+
+    for (int64_t width = 1; width <= max_image_tiles; ++width) {
+      for (int64_t height = 1; height <= max_image_tiles; ++height) {
+        if (width * height <= max_image_tiles) {
+          aspect_ratios.emplace_back(width, height);
+        }
+      }
+    }
+
+    return aspect_ratios;
+  }
+
+  /*
+    Calculates the new size of an image to fit within a canvas while maintaining aspect ratio.
+
+    This function calculates the optimal size for an image to fit within a canvas defined by
+    canvas_height and canvas_width, while ensuring that the image dimensions are not smaller than
+    tile_size. If the image is larger than the canvas, the returned size will fit within the canvas.
+    If the image already fits within the canvas, the size remains unchanged.
+    The aspect ratio of the original image is preserved.
+
+    Args:
+        image_height (`int`):
+            The height of the original image.
+        image_width (`int`):
+            The width of the original image.
+        canvas_height (`int`):
+            The height of the canvas.
+        canvas_width (`int`):
+            The width of the canvas.
+        tile_size (`int`):
+            The tile size.
+
+    Returns:
+        `Tuple[int, int]`: A tuple containing the new height and width of the image.
+
+  */
+  static std::tuple<int64_t, int64_t> GetImageSizeFitToCanvas(int64_t image_height, int64_t image_width,
+                                                              int64_t canvas_height, int64_t canvas_width,
+                                                              int64_t tile_size) {
+    // Set target image size in between `tile_size` and canvas_size
+    int64_t target_width = std::clamp(image_width, tile_size, canvas_width);
+    int64_t target_height = std::clamp(image_height, tile_size, canvas_height);
+
+    double scale_h = static_cast<double>(target_height) / image_height;
+    double scale_w = static_cast<double>(target_width) / image_width;
+
+    int64_t new_width, new_height;
+
+    if (scale_w < scale_h) {
+      new_width = target_width;
+      new_height = static_cast<int64_t>(std::round(image_height * scale_w));
+    } else {
+      new_height = target_height;
+      new_width = static_cast<int64_t>(std::round(image_width * scale_h));
+    }
+
+    return std::make_tuple(new_height, new_width);
+  }
+
+  static std::vector<std::vector<int64_t>> BuildAspectRatioMask(
+      const std::vector<std::pair<int64_t, int64_t>>& aspect_ratios, int64_t max_image_tiles) {
+    int64_t max_num_images = aspect_ratios.size();
+
+    // Initialize the 2D vector with zeros
+    std::vector<std::vector<int64_t>> aspect_ratio_mask(max_num_images, std::vector<int64_t>(max_image_tiles, 0));
+
+    // Set the first tile to 1 for all aspect ratios
+    for (int64_t j = 0; j < max_num_images; ++j) {
+      aspect_ratio_mask[j][0] = 1;
+    }
+
+    // Set the aspect ratio mask for the rest of the tiles
+    for (size_t j = 0; j < aspect_ratios.size(); ++j) {
+      int64_t num_tiles_w = aspect_ratios[j].first;
+      int64_t num_tiles_h = aspect_ratios[j].second;
+      int64_t num_tiles = num_tiles_w * num_tiles_h;
+      for (int64_t k = 0; k < num_tiles && k < max_image_tiles; ++k) {
+        aspect_ratio_mask[j][k] = 1;
+      }
+    }
+
+    return aspect_ratio_mask;
+  }
+
+  /*
+  Determines the best canvas based on image and tile size and maximum number of tiles.
+
+  First, calculates possible resolutions based on the maximum number of tiles and tile size.
+  For example for max_image_tiles=2, tile_size=100, possible tile arrangements are:
+  [(1, 1), (1, 2), (2, 1)] and corresponding canvas sizes are:
+  [(100, 100), (100, 200), (200, 100)]
+
+  For each possible resolution, calculates the scaling factors for
+  width and height, and selects the smallest one, which is the limiting side.
+  E.g. to match the canvas you can upscale height by 2x, and width by 1.5x,
+  therefore, the maximum upscaling you can do is min(2, 1.5) = 1.5.
+
+  If upscaling is possible (any of the scaling factors is greater than 1),
+  then picks the smallest upscaling factor > 1.
+
+  If upscaling is not possible, then picks the largest scaling factor <= 1, i.e.
+  reduce downscaling as much as possible.
+
+  If there are multiple resolutions with the same max scale, we pick the one with the lowest area,
+  to minimize padding. E.g., the same image can be upscaled to 224x224 and 224x448, but the latter
+  has more padding.
+
+  Args:
+      image_height (`int`):
+          The height of the image.
+      image_width (`int`):
+          The width of the image.
+      max_image_tiles (`int`):
+          The maximum number of tiles any image can be split into.
+      tile_size (`int`):
+          The tile size.
+
+  Returns:
+      `pair[int, int]`: The best canvas resolution [height, width] for the given image.
+  */
+  static std::pair<int64_t, int64_t> GetOptimalTiledCanvas(int64_t image_height, int64_t image_width,
+                                                           int64_t max_image_tiles, int64_t tile_size) {
+    {
+      auto possible_tile_arrangements = GetAllSupportedAspectRatios(max_image_tiles);
+      std::vector<std::pair<int, int>> possible_canvas_sizes;
+
+      for (const auto& arrangement : possible_tile_arrangements) {
+        possible_canvas_sizes.emplace_back(arrangement.first * tile_size, arrangement.second * tile_size);
+      }
+
+      std::vector<double> scales;
+      for (const auto& size : possible_canvas_sizes) {
+        double scale_h = static_cast<double>(size.first) / image_height;
+        double scale_w = static_cast<double>(size.second) / image_width;
+        scales.push_back(std::min(scale_h, scale_w));
+      }
+
+      double selected_scale = 0;
+      std::vector<double> upscaling_options;
+      for (double scale : scales) {
+        if (scale >= 1) {
+          upscaling_options.push_back(scale);
+        }
+      }
+
+      if (!upscaling_options.empty()) {
+        selected_scale = *std::min_element(upscaling_options.begin(), upscaling_options.end());
+      } else {
+        std::vector<double> downscaling_options;
+        for (double scale : scales) {
+          if (scale < 1) {
+            downscaling_options.push_back(scale);
+          }
+        }
+        selected_scale = *std::max_element(downscaling_options.begin(), downscaling_options.end());
+      }
+
+      std::vector<std::pair<int, int>> chosen_canvas;
+      for (size_t i = 0; i < scales.size(); ++i) {
+        if (std::abs(scales[i] - selected_scale) < 1e-9) {
+          chosen_canvas.push_back(possible_canvas_sizes[i]);
+        }
+      }
+
+      if (chosen_canvas.size() > 1) {
+        auto optimal_canvas = std::min_element(chosen_canvas.begin(), chosen_canvas.end(),
+                                               [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
+                                                 return (a.first * a.second) < (b.first * b.second);
+                                               });
+        return *optimal_canvas;
+      } else {
+        return chosen_canvas[0];
+      }
+    }
+  }
+
+  static std::vector<int64_t> ConvertAspectRatiosToIds(const std::vector<std::pair<int64_t, int64_t>>& aspect_ratios,
+                                                       int64_t max_image_tiles) {
+    int64_t max_num_images = aspect_ratios.size();
+
+    auto supported_aspect_ratios = GetAllSupportedAspectRatios(max_image_tiles);
+
+    // Initialize the 1D vector with zeros
+    std::vector<int64_t> aspect_ratios_ids(max_num_images, 0);
+
+    for (size_t j = 0; j < aspect_ratios.size(); ++j) {
+      const auto& ratio = aspect_ratios[j];
+      auto it = std::find(supported_aspect_ratios.begin(), supported_aspect_ratios.end(), ratio);
+      if (it != supported_aspect_ratios.end()) {
+        aspect_ratios_ids[j] = std::distance(supported_aspect_ratios.begin(), it) + 1;
+      }
+    }
+
+    return aspect_ratios_ids;
+  }
+
+  OrtxStatus DoPad(const ortc::Tensor<uint8_t>& image, const std::pair<int64_t, int64_t>& aspect_ratio,
+                   ortc::Tensor<uint8_t>& padded_image) const {
+    auto& dimensions = image.Shape();
+    auto [image_height, image_width] = std::make_tuple(dimensions[0], dimensions[1]);
+    auto [num_tiles_height, num_tiles_width] = aspect_ratio;
+    auto padded_height = num_tiles_height * tile_size_.first;
+    auto padded_width = num_tiles_width * tile_size_.second;
+    auto pad_size = std::make_pair(padded_height - image_height, padded_width - image_width);
+    auto channels = dimensions[2];
+    auto* padded_image_data = padded_image.Allocate({padded_height, padded_width, channels});
+    std::memset(padded_image_data, 0, padded_height * padded_width * channels);
+    auto* input_data = image.Data();
+    for (int64_t j = 0; j < image_height; ++j) {
+      std::memcpy(padded_image_data + j * padded_width * channels, input_data + j * image_width * channels,
+                  image_width * channels);
+    }
+
+    return {};
+  }
+
+  OrtxStatus DoResize(const ortc::Tensor<uint8_t>& image, ortc::Tensor<uint8_t>& resized_image,
+                      std::pair<int64_t, int64_t>& aspect_ratio) const {
+    auto& dimensions = image.Shape();
+    auto [image_height, image_width] = std::make_tuple(dimensions[0], dimensions[1]);
+    auto tile_size = tile_size_.first;
+    auto [canvas_height, canvas_width] = GetOptimalTiledCanvas(image_height, image_width, max_image_tiles_, tile_size);
+    auto num_tiles_height = canvas_height / tile_size;
+    auto num_tiles_width = canvas_width / tile_size;
+    aspect_ratio = std::make_pair(num_tiles_height, num_tiles_width);
+    auto [new_height, new_width] =
+        GetImageSizeFitToCanvas(image_height, image_width, canvas_height, canvas_width, tile_size);
+
+    Resize resizer;
+    std::unordered_map<std::string, ortx::AttrType> attrs = {{"height", new_height},
+                                                             {"width", new_width},
+                                                             {"interpolation", std::string("LINEAR")},
+                                                             {"keep_aspect_ratio", int64_t(0)}};
+    OrtxStatus status = resizer.Init(attrs);
+    if (!status.IsOk()) {
+      return status;
+    }
+
+    return resizer.Compute(image, resized_image);
+  }
+
+ public:
+  template <typename DictT>
+  OrtxStatus Init(const DictT& attrs) {
+    DictT normalizer_attrs;
+    DictT rescaler_attrs;
+    for (const auto& [key, value] : attrs) {
+      if (key.find("normalize/") == 0) {
+        normalizer_attrs[key.substr(10)] = value;
+      } else if (key.find("rescale/") == 0) {
+        rescaler_attrs[key.substr(8)] = value;
+      } else if (key == "max_image_tiles") {
+        max_image_tiles_ = std::get<int64_t>(value);
+      } else if (key == "size") {
+        auto tile_size = std::get<std::vector<int64_t>>(value);
+        if (tile_size.size() != 2) {
+          return {kOrtxErrorInvalidArgument, "[Llama3ImageTransform]: Invalid tile size"};
+        }
+        tile_size_ = std::make_pair(tile_size[0], tile_size[1]);
+      } else if (key == "interpolation") {
+        interpolation_ = std::get<std::string>(value);
+      } else {
+        return {kOrtxErrorInvalidArgument, "[Llama3ImageTransform]: Invalid argument"};
+      }
+    }
+
+    OrtxStatus status = normalize_.Init(normalizer_attrs);
+    if (!status.IsOk()) {
+      return status;
+    }
+
+    return rescale_.Init(rescaler_attrs);
+  }
+
+ private:
+  int64_t max_image_tiles_{};
+  std::pair<int64_t, int64_t> tile_size_{};
+  std::string interpolation_{};
+
+  Rescale rescale_;
+  Normalize normalize_;
+};
--- a/shared/api/image_transforms_phi_3.hpp
+++ b/shared/api/image_transforms_phi_3.hpp
@ -3,7 +3,8 @@

 #pragma once

-#include "ocos.h"
+#include "ext_status.h"
+#include "op_def_struct.h"
 #include "image_resample.h"

 constexpr int max_crops = 16;
--- a/shared/api/runner.hpp
+++ b/shared/api/runner.hpp
@ -28,10 +28,6 @@ class KernelDef {
  virtual TensorArgs AllocateOutput(ortc::IAllocator* allocator) const = 0;
  virtual OrtxStatus Apply(TensorArgs& inputs, TensorArgs& output) const = 0;

-  using AttrType =
-      std::variant<std::string, double, int64_t, std::vector<std::string>, std::vector<double>, std::vector<int64_t>>;
-  using AttrDict = std::unordered_map<std::string, AttrType>;
-
  template <typename... Args>
  using tuple_function_args = std::tuple<typename std::remove_reference<Args>::type*...>;

@ -85,7 +81,6 @@ class KernelDef {

  template <typename T, typename... Args>
  static auto CastOutputAllType(TensorArgs::iterator tensor, T& arg, Args&... args) {
-    // return std::make_tuple(static_cast<T&>(*tensor), CastOutputAllType(args...));
    return std::tuple_cat(CastOutputImpl<T>(tensor), CastOutputAllType(tensor + 1, args...));
  }

@ -278,8 +273,6 @@ class Operation {

 private:
  std::vector<std::unique_ptr<ortc::TensorBase>> outputs_;
-
- private:
  const KernelRegistry* kernel_registry_;

  std::unique_ptr<KernelDef> kernel_;
@ -294,9 +287,10 @@ class OrtxRunner {

  template <typename IT, typename OT>  // batch input/output container
  OrtxStatus Run(IT& input_seq, OT& output_seq) {
-    for (size_t i = 0; i < input_seq.size(); ++i) {
-      auto& input = *(input_seq.begin() + i);
+    size_t i = 0;
    Operation* last_op = nullptr;
+    for (; i < input_seq.size(); ++i) {
+      auto& input = *(input_seq.begin() + i);
      // sequentially apply the operations
      for (auto& op : ops_) {
        if (last_op != nullptr) {
@ -305,7 +299,7 @@ class OrtxRunner {
        auto [status, ts_output] = op->Apply(allocator_, input);
        if (status.IsOk()) {
          if (op == ops_.back()) {
-            output_seq.push_back(ts_output);
+            output_seq.push_back(std::move(ts_output));
          } else {
            input = ts_output;
          }
@ -317,9 +311,70 @@ class OrtxRunner {
      }
    }

+    if (last_op != nullptr) {
+      last_op->ResetTensors(allocator_);
+    }
+
    return {};
  }

+  static bool IsGreaterShape(const std::vector<int64_t>& lhs, const std::vector<int64_t>& rhs) {
+    if (lhs.size() != rhs.size()) {
+      return lhs.size() > rhs.size();
+    }
+
+    for (size_t i = 0; i < lhs.size(); ++i) {
+      if (lhs[i] != rhs[i]) {
+        return lhs[i] > rhs[i];
+      }
+    }
+
+    return false;
+  }
+
+  static void CopyOrPadTensor(const std::vector<int64_t>::const_iterator dest_shape_begin,
+                              const std::vector<int64_t>::const_iterator dest_shape_end,
+                              const std::vector<int64_t>::const_iterator src_shape_begin,
+                              const std::vector<int64_t>::const_iterator src_shape_end,
+                              std::byte* dest, const std::byte* src, size_t element_size) {
+    // no broadcasting here
+    assert(dest_shape_begin != dest_shape_end && src_shape_begin != src_shape_end);
+    assert(dest_shape_end - dest_shape_begin == src_shape_end - src_shape_begin);
+
+    if ((dest_shape_begin + 1) == dest_shape_end) {
+      std::memcpy(dest, src, element_size * (*src_shape_begin));
+      if (*dest_shape_begin > *src_shape_begin) {
+        std::memset(dest + *src_shape_begin * element_size, 0, (*dest_shape_begin - *src_shape_begin) * element_size);
+      }
+      return;
+    }
+
+    int64_t dest_chunk_size = 1;
+    int64_t src_chunk_size = 1;
+    for (auto iter = dest_shape_begin + 1; iter != dest_shape_end; ++iter) {
+      dest_chunk_size *= *iter;
+    }
+
+    for (auto iter = src_shape_begin + 1; iter != src_shape_end; ++iter) {
+      src_chunk_size *= *iter;
+    }
+
+    for (int64_t i = 0; i < *dest_shape_begin; ++i) {
+      if (i < *src_shape_begin) {
+        if (dest_chunk_size == src_chunk_size) {
+          std::memcpy(dest + i * dest_chunk_size * element_size, src + i * src_chunk_size * element_size,
+                      dest_chunk_size * element_size);
+        } else {
+          CopyOrPadTensor(dest_shape_begin + 1, dest_shape_end, src_shape_begin + 1, src_shape_end,
+                          dest + i * dest_chunk_size * element_size, src + i * src_chunk_size * element_size,
+                          element_size);
+        }
+      } else {
+        std::memset(dest + i * dest_chunk_size * element_size, 0, dest_chunk_size * element_size);
+      }
+    }
+  }
+
  static OrtxStatus StackTensors(const std::vector<TensorArgs>& arg_lists, std::vector<TensorPtr>& outputs,
                                 ortc::IAllocator* allocator) {
    if (arg_lists.empty()) {
@ -332,21 +387,35 @@ class OrtxRunner {
      std::vector<ortc::TensorBase*> ts_ptrs;
      ts_ptrs.reserve(arg_lists.size());
      std::vector<int64_t> shape = arg_lists[0][axis]->Shape();
+      size_t element_size = arg_lists[0][axis]->SizeInBytes() / arg_lists[0][axis]->NumberOfElement();
+      bool is_same_shape = true;
      for (auto& ts : arg_lists) {
        if (shape != ts[axis]->Shape()) {
+          is_same_shape = false;
+          auto dtype = ts[axis]->Type();
+          if (dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 && dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
            return {kOrtxErrorInvalidArgument, "[StackTensors]: shapes of tensors to stack are not the same."};
          }
+          if (IsGreaterShape(ts[axis]->Shape(), shape)) {
+            shape = ts[axis]->Shape();
+          }
+        }
        ts_ptrs.push_back(ts[axis]);
      }

      std::vector<int64_t> output_shape = shape;
      output_shape.insert(output_shape.begin(), batch_size);
      std::byte* tensor_buf = outputs[axis]->AllocateRaw(output_shape);
+      auto ts_size = outputs[axis]->SizeInBytes() / batch_size;
      for (size_t i = 0; i < batch_size; ++i) {
        auto ts = ts_ptrs[i];
        const std::byte* ts_buff = reinterpret_cast<const std::byte*>(ts->DataRaw());
-        auto ts_size = ts->SizeInBytes();
+        if (is_same_shape /* || ts->Shape() == std::vector<int64_t>(output_shape.begin() + 1, output_shape.end()) */) {
          std::memcpy(tensor_buf + i * ts_size, ts_buff, ts_size);
+        } else {
+          CopyOrPadTensor(output_shape.begin() + 1, output_shape.end(), ts->Shape().begin(), ts->Shape().end(),
+                          tensor_buf + i * ts_size, reinterpret_cast<const std::byte*>(ts->DataRaw()), element_size);
+        }
      }
    }

--- a/test/data/processor/clip_image.json
+++ b/test/data/processor/clip_image.json
@ -11,20 +11,14 @@
          }
        }
      },
-      {
-        "operation": {
-          "name": "convert_to_rgb",
-          "type": "ConvertRGB"
-        }
-      },
      {
        "operation": {
          "name": "resize",
          "type": "Resize",
          "attrs": {
            "interpolation": "CUBIC",
-            "width": 256,
-            "height": 256
+            "width": 224,
+            "height": 224
          }
        }
      },
@ -53,6 +47,15 @@
            "std": [0.229, 0.224, 0.225]
          }
        }
+      },
+      {
+        "operation": {
+          "name": "to_channel_first",
+          "type": "Permute3D",
+          "attrs": {
+            "dims": [2, 0, 1]
+          }
+        }
      }
    ]
  }
--- a/test/data/processor/image_to_numpy.py
+++ b/test/data/processor/image_to_numpy.py
@ -30,7 +30,7 @@ img_proc = ImageProcessor(R"""

 img_name = "australia.jpg"
 result = img_proc.pre_process(os.path.dirname(__file__) + "/" + img_name)
-np_img = img_proc.to_numpy(result)
+np_img = img_proc.to_numpy(result, 0)
 print(np_img.shape, np_img.dtype)

 # can save the image back to disk
--- a/test/data/processor/load_ortx_tsdump.py
+++ b/test/data/processor/load_ortx_tsdump.py
@ -0,0 +1,46 @@
+import re
+import os
+import numpy as np
+
+from PIL import Image
+
+dumping_file_path = "C:\\temp\\normalized_image826885234_f_560.bin"
+
+
+def regen_image(arr):
+    mean = np.array([0.48145466, 0.4578275, 0.40821073])
+    std = np.array([0.26862954, 0.26130258, 0.27577711])
+
+    # Reverse normalization
+    array = arr * std + mean
+
+    # Clip the values to [0, 1] range
+    array = np.clip(array, 0, 1)
+
+    # Convert to [0, 255] range and uint8 type
+    array = (array * 255).astype(np.uint8)
+    return array
+
+
+filename = os.path.basename(dumping_file_path)
+res = re.search(r".+(\d+)_([u|f])_(\d+)", filename)
+dtype = np.uint8 if res[2] == 'u' else np.float32
+# load the binary raw data from the file
+with open(dumping_file_path, 'rb') as file:
+    raw_data = np.fromfile(file, dtype=dtype)
+
+
+image_width = int(res[3])
+image_height = int(raw_data.size / image_width) // 3
+raw_data = raw_data.reshape((image_height, image_width, 3))
+
+# from bgr to rgb
+# raw_data = raw_data[:, :, ::-1]
+
+# save the image to disk
+if dtype == np.float32:
+    raw_data = regen_image(raw_data)
+
+img = Image.fromarray(raw_data)
+img.save(dumping_file_path + ".png")
+img.show()
--- a/test/data/processor/mllama/llama_3_image.json
+++ b/test/data/processor/mllama/llama_3_image.json
@ -0,0 +1,43 @@
+{
+  "processor": {
+    "name": "MllamaImageProcessor",
+    "transforms": [
+      {
+        "operation": {
+          "name": "decode_image",
+          "domain": "com.microsoft.extensions",
+          "type": "DecodeImage",
+          "attrs": {
+            "color_space": "BGR"
+          }
+        }
+      },
+      {
+        "operation": {
+          "name": "llama3_image_transform",
+          "domain": "com.microsoft.extensions",
+          "type": "Llama3ImageTransform",
+          "attrs": {
+            "max_image_tiles": 4,
+            "size": [
+              560,
+              560
+            ],
+            "interpolation": "LINEAR",
+            "rescale/rescale_factor": 0.00392156862745098,
+            "normalize/mean": [
+              0.48145466,
+              0.4578275,
+              0.40821073
+            ],
+            "normalize/std": [
+              0.26862954,
+              0.26130258,
+              0.27577711
+            ]
+          }
+        }
+      }
+    ]
+  }
+}
--- a/test/pp_api_test/test_processor.cc
+++ b/test/pp_api_test/test_processor.cc
@ -12,10 +12,12 @@

 using namespace ort_extensions;

+const char* test_image_paths[] = {"data/processor/standard_s.jpg", "data/processor/australia.jpg", "data/processor/exceltable.png"};
+const size_t test_image_count = sizeof(test_image_paths) / sizeof(test_image_paths[0]);

 TEST(ProcessorTest, TestPhi3VImageProcessing) {
-  auto [input_data, n_data] = ort_extensions::LoadRawImages(
-      {"data/processor/standard_s.jpg", "data/processor/australia.jpg", "data/processor/exceltable.png"});
+  auto [input_data, n_data] = ort_extensions::LoadRawImages(test_image_paths, test_image_count);
+      // {"data/processor/standard_s.jpg", "data/processor/australia.jpg", "data/processor/exceltable.png"});

  auto proc = OrtxObjectPtr<ImageProcessor>(OrtxCreateProcessor, "data/processor/phi_3_image.json");
  ortc::Tensor<float>* pixel_values;
@ -43,11 +45,9 @@ TEST(ProcessorTest, TestPhi3VImageProcessing) {
  proc->ClearOutputs(&r);
 }

-TEST(ProcessorTest, TestClipImageProcessing) {
-  const char* images_path[] = {"data/processor/standard_s.jpg", "data/processor/australia.jpg",
-                               "data/processor/exceltable.png"};
-  OrtxObjectPtr<OrtxRawImages> raw_images;
-  extError_t err = OrtxLoadImages(ort_extensions::ptr(raw_images), images_path, 3, nullptr);
+TEST(ProcessorTest, TestCLIPImageProcessing) {
+  OrtxObjectPtr<OrtxRawImages> raw_images{};
+  extError_t err = OrtxLoadImages(ort_extensions::ptr(raw_images), test_image_paths, test_image_count, nullptr);
  ASSERT_EQ(err, kOrtxOK);

  OrtxObjectPtr<OrtxProcessor> processor;
@ -72,3 +72,53 @@ TEST(ProcessorTest, TestClipImageProcessing) {
  ASSERT_EQ(err, kOrtxOK);
  ASSERT_EQ(num_dims, 4);
 }
+
+TEST(ProcessorTest, TestMLlamaImageProcessing) {
+  OrtxObjectPtr<OrtxRawImages> raw_images{};
+  extError_t err = OrtxLoadImages(ort_extensions::ptr(raw_images), test_image_paths, test_image_count, nullptr);
+  ASSERT_EQ(err, kOrtxOK);
+
+  OrtxObjectPtr<OrtxProcessor> processor;
+  err = OrtxCreateProcessor(ort_extensions::ptr(processor), "data/processor/mllama/llama_3_image.json");
+  if (err != kOrtxOK) {
+    std::cout << "Error: " << OrtxGetLastErrorMessage() << std::endl;
+  }
+  ASSERT_EQ(err, kOrtxOK);
+
+  OrtxObjectPtr<OrtxTensorResult> result;
+  err = OrtxImagePreProcess(processor.get(), raw_images.get(), ort_extensions::ptr(result));
+  ASSERT_EQ(err, kOrtxOK);
+
+  OrtxTensor* tensor;
+  err = OrtxTensorResultGetAt(result.get(), 0, &tensor);
+  ASSERT_EQ(err, kOrtxOK);
+
+  const float* data{};
+  const int64_t* shape{};
+  size_t num_dims;
+  err = OrtxGetTensorData(tensor, reinterpret_cast<const void**>(&data), &shape, &num_dims);
+  ASSERT_EQ(err, kOrtxOK);
+  ASSERT_EQ(num_dims, 5);
+
+  err = OrtxTensorResultGetAt(result.get(), 1, &tensor);
+  ASSERT_EQ(err, kOrtxOK);
+  const int64_t* int_data{};
+  err = OrtxGetTensorData(tensor, reinterpret_cast<const void**>(&int_data), &shape, &num_dims);
+  ASSERT_EQ(err, kOrtxOK);
+  ASSERT_EQ(num_dims, 2);
+  ASSERT_EQ(std::vector<int64_t>(int_data, int_data + 3), std::vector<int64_t>({6, 6, 1}));
+
+  err = OrtxTensorResultGetAt(result.get(), 2, &tensor);
+  ASSERT_EQ(err, kOrtxOK);
+  err = OrtxGetTensorData(tensor, reinterpret_cast<const void**>(&int_data), &shape, &num_dims);
+  ASSERT_EQ(err, kOrtxOK);
+  ASSERT_EQ(num_dims, 2);
+  ASSERT_EQ(std::vector<int64_t>(shape, shape + num_dims), std::vector<int64_t>({3, 4}));
+
+  err = OrtxTensorResultGetAt(result.get(), 3, &tensor);
+  ASSERT_EQ(err, kOrtxOK);
+  err = OrtxGetTensorData(tensor, reinterpret_cast<const void**>(&int_data), &shape, &num_dims);
+  ASSERT_EQ(err, kOrtxOK);
+  ASSERT_EQ(num_dims, 2);
+  ASSERT_EQ(std::vector<int64_t>(int_data, int_data + 3), std::vector<int64_t>({4, 4, 1}));
+}
--- a/test/test_pp_api.py
+++ b/test/test_pp_api.py
@ -0,0 +1,119 @@
+import os
+import tempfile
+import requests
+import unittest
+import numpy as np
+
+from PIL import Image
+
+
+is_pp_api_available = False
+try:
+    from transformers import AutoImageProcessor
+    from onnxruntime_extensions import pp_api
+    is_pp_api_available = True
+except ImportError:
+    pass
+
+
+def regen_image(arr):
+    mean = np.array([0.48145466, 0.4578275, 0.40821073])
+    std = np.array([0.26862954, 0.26130258, 0.27577711])
+
+    # Reverse normalization
+    array = arr * std + mean
+
+    # Clip the values to [0, 1] range
+    array = np.clip(array, 0, 1)
+
+    # Convert to [0, 255] range and uint8 type
+    array = (array * 255).astype(np.uint8)
+
+    # Convert NumPy array to PIL Image
+    image = Image.fromarray(array)
+    return image
+
+
+@unittest.skipIf(not is_pp_api_available, "pp_api is not available")
+class TestPPAPI(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        if os.path.exists("/temp"):
+            cls.temp_dir = "/temp"
+        elif os.path.exists("/tmp"):
+            cls.temp_dir = "/tmp"
+        else:
+            cls.temp_dir = tempfile.mkdtemp()
+            print(f"Created temp dir: {cls.temp_dir}")
+        cls.token_id = os.environ.get("HF_TOKEN", None)
+
+    def test_CLIP_image_processing(self):
+        model_id = "openai/clip-vit-large-patch14"
+        image_list = ["test/data/processor/australia.jpg",
+                      "test/data/processor/passport.png",
+                      "test/data/processor/exceltable.png"]
+        (image, image2, image3) = [Image.open(f) for f in image_list]
+
+        processor = AutoImageProcessor.from_pretrained(model_id)
+        inputs = processor.preprocess(
+            [image, image2, image3], return_tensors="np")
+        print({k: v.shape if k == "pixel_values" else v for k, v in inputs.items()})
+
+        expected_images = inputs["pixel_values"]
+        for i in range(len(expected_images)):
+            expected = expected_images[i]
+            e_image = regen_image(np.transpose(expected, (1, 2, 0)))
+            e_image.save(f"{self.temp_dir}/CLIP_e_{i}.png")
+
+        ort_processor = pp_api.ImageProcessor(
+            "test/data/processor/clip_image.json")
+        inputs = ort_processor.pre_process(image_list)
+        print(ort_processor.to_numpy(inputs, 0).shape)
+        actual_images = ort_processor.to_numpy(inputs, 0)
+        for i in range(len(actual_images)):
+            actual = actual_images[i]
+            a_image = regen_image(np.transpose(actual, (1, 2, 0)))
+            a_image.save(f"{self.temp_dir}/CLIP_a_{i}.png")
+
+    def test_llama3_2_image_processing(self):
+        model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+        url = ("https://huggingface.co/datasets/huggingface/"
+               "documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")
+        # save the image to a file in self.temp_dir
+        with open(f"{self.temp_dir}/rabbit.jpg", "wb") as f:
+            f.write(requests.get(url).content)
+
+        # image = Image.open(requests.get(url, stream=True).raw)
+        image_list = [f"{self.temp_dir}/rabbit.jpg",
+                      "test/data/processor/passport.png",
+                      "test/data/processor/exceltable.png"]
+        (image, image2, image3) = [Image.open(f) for f in image_list]
+
+        processor = AutoImageProcessor.from_pretrained(model_id, token=TestPPAPI.token_id)
+        inputs = processor.preprocess(
+            [image, image2, image3], return_tensors="np")
+        print({k: v.shape if k == "pixel_values" else v for k, v in inputs.items()})
+
+        ort_processor = pp_api.ImageProcessor(
+            "test/data/processor/mllama/llama_3_image.json")
+        ort_inputs = ort_processor.to_numpy(ort_processor.pre_process(image_list), 0)
+        print(ort_inputs.shape)
+
+        for idx in range(len(image_list)):
+            expected_images = inputs["pixel_values"][0][idx]
+            for i in range(len(expected_images)):
+                expected = expected_images[i]
+                e_image = regen_image(np.transpose(expected, (1, 2, 0)))
+                e_image.save(f"{self.temp_dir}/e_{idx}_{i}.png")
+
+            actual_images = ort_inputs[idx]
+            for i in range(len(actual_images)):
+                actual = actual_images[i]
+                a_image = regen_image(np.transpose(actual, (1, 2, 0)))
+                a_image.save(f"{self.temp_dir}/a_{idx}_{i}.png")
+
+
+if __name__ == '__main__':
+    unittest.main()