From 1b807949032fcfdd276e9b2c2fb5fe5f768a9bd1 Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Wed, 4 Sep 2024 16:50:05 -0700 Subject: [PATCH] Remove OpenCV dependency from C_API mode (#800) * Remove OpenCV dependency from C_API model * fix build on Windows * switch ci build flag * try to fix the macOS build issue * more fixing * fix the macOS build issue * list jpeg source * verified on MacOS * update the pp_api too * avoid the codecs library conflicts * Add the unit tests * move the codec test * add the missing dl lib for extensions test * refine the code * a smaller fixing for Windows Python --- .pipelines/ci.yml | 6 +- .pyproject/cmdclass.py | 3 + CMakeLists.txt | 10 +- MANIFEST.in | 8 -- build.sh | 2 +- cmake/ext_imgcodecs.cmake | 131 +++++++++++++++++++++++ cmake/ext_tests.cmake | 4 + cmake/presets/ort_genai.cmake | 2 - include/ortx_utils.h | 11 ++ onnxruntime_extensions/pp_api.py | 8 ++ pyop/py_c_api.cc | 15 ++- shared/api/c_api_utils.cc | 27 ++++- shared/api/c_api_utils.hpp | 60 ++++++++--- shared/api/image_decoder.hpp | 145 ++++++++++++++++++++++++++ shared/api/image_processor.cc | 4 +- test/data/processor/image_to_numpy.py | 41 ++++++++ test/data/processor/proctest.py | 76 ++++++++++++++ test/pp_api_test/test_imgcodec.cc | 81 ++++++++++++++ 18 files changed, 589 insertions(+), 45 deletions(-) create mode 100644 cmake/ext_imgcodecs.cmake create mode 100644 shared/api/image_decoder.hpp create mode 100644 test/data/processor/image_to_numpy.py create mode 100644 test/data/processor/proctest.py create mode 100644 test/pp_api_test/test_imgcodec.cc diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml index e39ef96bd..a54f1d47a 100644 --- a/.pipelines/ci.yml +++ b/.pipelines/ci.yml @@ -197,7 +197,7 @@ stages: # compiled as only one operator selected. - bash: | set -e -x -u - ./build.sh -DOCOS_ENABLE_C_API=ON + ./build.sh -DOCOS_ENABLE_C_API=ON -DOCOS_ENABLE_CV2=OFF -DOCOS_ENABLE_VISION=OFF -DOCOS_ENABLE_OPENCV_CODECS=OFF cd out/Linux/RelWithDebInfo ctest -C RelWithDebInfo --output-on-failure displayName: Build ort-extensions with API enabled and run tests @@ -281,7 +281,7 @@ stages: # compiled as only one operator selected. 
- bash: | set -e -x -u - ./build.sh -DOCOS_ENABLE_C_API=ON + ./build.sh -DOCOS_ENABLE_C_API=ON -DOCOS_ENABLE_CV2=OFF -DOCOS_ENABLE_VISION=OFF -DOCOS_ENABLE_OPENCV_CODECS=OFF cd out/Darwin/RelWithDebInfo ctest -C RelWithDebInfo --output-on-failure displayName: Build ort-extensions with API enabled and run tests @@ -431,7 +431,7 @@ stages: steps: - script: | - call .\build.bat -DOCOS_ENABLE_C_API=ON + call .\build.bat -DOCOS_ENABLE_C_API=ON -DOCOS_ENABLE_CV2=OFF -DOCOS_ENABLE_VISION=OFF -DOCOS_ENABLE_OPENCV_CODECS=OFF cd out\Windows ctest -C RelWithDebInfo --output-on-failure displayName: Build ort-extensions with API enabled and run tests diff --git a/.pyproject/cmdclass.py b/.pyproject/cmdclass.py index 3608dfc7a..3d2d78f00 100644 --- a/.pyproject/cmdclass.py +++ b/.pyproject/cmdclass.py @@ -212,6 +212,9 @@ def build_cmake(self, extension): '-DOCOS_ENABLE_VISION=OFF'] if self.pp_api: + if not self.no_opencv: + raise RuntimeError( + "Cannot enable PP C API Python Wrapper without disabling OpenCV.") cmake_args += ['-DOCOS_ENABLE_C_API=ON'] if self.no_azure is not None: diff --git a/CMakeLists.txt b/CMakeLists.txt index 12e54a52d..cbceb4b34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,7 +206,6 @@ if(NOT PROJECT_IS_TOP_LEVEL AND ONNXRUNTIME_ROOT) set(_ONNXRUNTIME_EMBEDDED TRUE) endif() - if (OCOS_ENABLE_SELECTED_OPLIST OR OCOS_BUILD_PRESET) disable_all_operators() if(OCOS_ENABLE_SELECTED_OPLIST) @@ -737,9 +736,12 @@ if(OCOS_ENABLE_C_API) file(GLOB audio_TARGET_SRC "shared/api/c_api_feature_extraction.*" "shared/api/speech_*") list(APPEND _TARGET_LIB_SRC ${audio_TARGET_SRC}) endif() - if(OCOS_ENABLE_CV2) + if(OCOS_ENABLE_DLIB) + include(ext_imgcodecs) file(GLOB cv2_TARGET_SRC "shared/api/c_api_processor.*" "shared/api/image_*.*") list(APPEND _TARGET_LIB_SRC ${cv2_TARGET_SRC}) + target_include_directories(ocos_operators PUBLIC ${libPNG_SOURCE_DIR} ${libJPEG_SOURCE_DIR}) + target_link_libraries(ocos_operators PUBLIC ${PNG_LIBRARY} ${JPEG_LIBRARY} ${ZLIB_LIBRARY}) endif() endif() @@ -852,8 +854,8 @@ target_link_libraries(ortcustomops PUBLIC ocos_operators) if(OCOS_BUILD_SHARED_LIB) file(GLOB shared_TARGET_SRC "shared/*.cc" "shared/*.h") if (OCOS_ENABLE_C_API) - if (NOT _HAS_TOKENIZER OR NOT OCOS_ENABLE_CV2 OR NOT OCOS_ENABLE_AUDIO) - message(FATAL_ERROR "Shared library build requires GPT2_TOKENIZER, CV2 and AUDIO to be enabled.") + if (NOT _HAS_TOKENIZER OR NOT OCOS_ENABLE_AUDIO) + message(FATAL_ERROR "Shared library build requires GPT2_TOKENIZER, AUDIO to be enabled.") endif() list(APPEND shared_TARGET_SRC "shared/extensions_c.def") else() diff --git a/MANIFEST.in b/MANIFEST.in index 43d7ac613..9d81ae414 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,11 +5,3 @@ recursive-include include *.* recursive-include operators *.* recursive-include pyop *.* recursive-include shared *.* -prune ci_build -prune docs -prune test -prune _subbuild -prune out -exclude *.bat -exclude *.yaml -exclude *.git* diff --git a/build.sh b/build.sh index a6b310ed0..ad44194b3 100755 --- a/build.sh +++ b/build.sh @@ -1,7 +1,7 @@ #!/bin/bash # The example build script to build the source in Linux-like platform -set -e -x -u +set -e -u cuda_arch='' if [[ $@ == *"DOCOS_USE_CUDA=ON"* && $@ != *"DCMAKE_CUDA_ARCHITECTURES"* ]]; then diff --git a/cmake/ext_imgcodecs.cmake b/cmake/ext_imgcodecs.cmake new file mode 100644 index 000000000..70f8adcb7 --- /dev/null +++ b/cmake/ext_imgcodecs.cmake @@ -0,0 +1,131 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +set(_IMGCODEC_ROOT_DIR ${dlib_SOURCE_DIR}/dlib/external) + +# ---------------------------------------------------------------------------- +# project libpng +# +# ---------------------------------------------------------------------------- +set (PNG_LIBRARY "libpng_static_c") +set (libPNG_SOURCE_DIR ${_IMGCODEC_ROOT_DIR}/libpng) +set (zlib_SOURCE_DIR ${_IMGCODEC_ROOT_DIR}/zlib) + +if(NOT WIN32) + find_library(M_LIBRARY + NAMES m + PATHS /usr/lib /usr/local/lib + ) + if(NOT M_LIBRARY) + message(STATUS "math lib 'libm' not found; floating point support disabled") + endif() +else() + # not needed on windows + set(M_LIBRARY "") +endif() + +set(lib_srcs + ${libPNG_SOURCE_DIR}/arm/arm_init.c + ${libPNG_SOURCE_DIR}/arm/filter_neon_intrinsics.c + ${libPNG_SOURCE_DIR}/arm/palette_neon_intrinsics.c + ${libPNG_SOURCE_DIR}//png.c + ${libPNG_SOURCE_DIR}//pngerror.c + ${libPNG_SOURCE_DIR}//pngget.c + ${libPNG_SOURCE_DIR}//pngmem.c + ${libPNG_SOURCE_DIR}//pngpread.c + ${libPNG_SOURCE_DIR}//pngread.c + ${libPNG_SOURCE_DIR}//pngrio.c + ${libPNG_SOURCE_DIR}//pngrtran.c + ${libPNG_SOURCE_DIR}//pngrutil.c + ${libPNG_SOURCE_DIR}//pngset.c + ${libPNG_SOURCE_DIR}//pngtrans.c + ${libPNG_SOURCE_DIR}//pngwio.c + ${libPNG_SOURCE_DIR}//pngwrite.c + ${libPNG_SOURCE_DIR}//pngwtran.c + ${libPNG_SOURCE_DIR}//pngwutil.c + ${zlib_SOURCE_DIR}/adler32.c + ${zlib_SOURCE_DIR}/compress.c + ${zlib_SOURCE_DIR}/crc32.c + ${zlib_SOURCE_DIR}/deflate.c + ${zlib_SOURCE_DIR}/gzclose.c + ${zlib_SOURCE_DIR}/gzlib.c + ${zlib_SOURCE_DIR}/gzread.c + ${zlib_SOURCE_DIR}/gzwrite.c + ${zlib_SOURCE_DIR}/infback.c + ${zlib_SOURCE_DIR}/inffast.c + ${zlib_SOURCE_DIR}/inflate.c + ${zlib_SOURCE_DIR}/inftrees.c + ${zlib_SOURCE_DIR}/trees.c + ${zlib_SOURCE_DIR}/uncompr.c + ${zlib_SOURCE_DIR}/zutil.c +) + +add_library(${PNG_LIBRARY} STATIC EXCLUDE_FROM_ALL ${lib_srcs}) +target_include_directories(${PNG_LIBRARY} BEFORE PRIVATE ${zlib_SOURCE_DIR}) + +if(MSVC) + target_compile_definitions(${PNG_LIBRARY} PRIVATE -D_CRT_SECURE_NO_DEPRECATE) +else() + target_compile_options(${PNG_LIBRARY} PRIVATE -Wno-deprecated-non-prototype) +endif() + +# ---------------------------------------------------------------------------- +# project libjpeg +# +# ---------------------------------------------------------------------------- +set(JPEG_LIBRARY "libjpeg_static_c") +set(libJPEG_SOURCE_DIR ${_IMGCODEC_ROOT_DIR}/libjpeg) + +set(lib_srcs + ${libJPEG_SOURCE_DIR}/jaricom.c + ${libJPEG_SOURCE_DIR}/jcapimin.c + ${libJPEG_SOURCE_DIR}/jcapistd.c + ${libJPEG_SOURCE_DIR}/jcarith.c + ${libJPEG_SOURCE_DIR}/jccoefct.c + ${libJPEG_SOURCE_DIR}/jccolor.c + ${libJPEG_SOURCE_DIR}/jcdctmgr.c + ${libJPEG_SOURCE_DIR}/jchuff.c + ${libJPEG_SOURCE_DIR}/jcinit.c + ${libJPEG_SOURCE_DIR}/jcmainct.c + ${libJPEG_SOURCE_DIR}/jcmarker.c + ${libJPEG_SOURCE_DIR}/jcmaster.c + ${libJPEG_SOURCE_DIR}/jcomapi.c + ${libJPEG_SOURCE_DIR}/jcparam.c + ${libJPEG_SOURCE_DIR}/jcprepct.c + ${libJPEG_SOURCE_DIR}/jcsample.c + ${libJPEG_SOURCE_DIR}/jdapimin.c + ${libJPEG_SOURCE_DIR}/jdapistd.c + ${libJPEG_SOURCE_DIR}/jdarith.c + ${libJPEG_SOURCE_DIR}/jdatadst.c + ${libJPEG_SOURCE_DIR}/jdatasrc.c + ${libJPEG_SOURCE_DIR}/jdcoefct.c + ${libJPEG_SOURCE_DIR}/jdcolor.c + ${libJPEG_SOURCE_DIR}/jddctmgr.c + ${libJPEG_SOURCE_DIR}/jdhuff.c + ${libJPEG_SOURCE_DIR}/jdinput.c + ${libJPEG_SOURCE_DIR}/jdmainct.c + ${libJPEG_SOURCE_DIR}/jdmarker.c + ${libJPEG_SOURCE_DIR}/jdmaster.c + ${libJPEG_SOURCE_DIR}/jdmerge.c + ${libJPEG_SOURCE_DIR}/jdpostct.c + ${libJPEG_SOURCE_DIR}/jdsample.c + ${libJPEG_SOURCE_DIR}/jerror.c + 
${libJPEG_SOURCE_DIR}/jfdctflt.c + ${libJPEG_SOURCE_DIR}/jfdctfst.c + ${libJPEG_SOURCE_DIR}/jfdctint.c + ${libJPEG_SOURCE_DIR}/jidctflt.c + ${libJPEG_SOURCE_DIR}/jidctfst.c + ${libJPEG_SOURCE_DIR}/jidctint.c + ${libJPEG_SOURCE_DIR}/jmemmgr.c + ${libJPEG_SOURCE_DIR}/jmemnobs.c + ${libJPEG_SOURCE_DIR}/jquant1.c + ${libJPEG_SOURCE_DIR}/jquant2.c + ${libJPEG_SOURCE_DIR}/jutils.c + ) +file(GLOB lib_hdrs ${libJPEG_SOURCE_DIR}/*.h) +add_library(${JPEG_LIBRARY} STATIC EXCLUDE_FROM_ALL ${lib_srcs} ${lib_hdrs}) + +if(NOT MSVC) + set_source_files_properties(jcdctmgr.c PROPERTIES COMPILE_FLAGS "-O1") +endif() +target_compile_definitions(${JPEG_LIBRARY} PRIVATE -DNO_MKTEMP) diff --git a/cmake/ext_tests.cmake b/cmake/ext_tests.cmake index 436125b81..b292279b8 100644 --- a/cmake/ext_tests.cmake +++ b/cmake/ext_tests.cmake @@ -189,6 +189,10 @@ if (OCOS_BUILD_SHARED_LIB) list(APPEND extensions_test_libraries stdc++fs -pthread) endif() + if (NOT MSVC) + list(APPEND extensions_test_libraries ${CMAKE_DL_LIBS}) + endif() + add_test_target(TARGET extensions_test TEST_SOURCES ${shared_TEST_SRC} LIBRARIES ${extensions_test_libraries} diff --git a/cmake/presets/ort_genai.cmake b/cmake/presets/ort_genai.cmake index e1ecb5e98..2ed162bf5 100644 --- a/cmake/presets/ort_genai.cmake +++ b/cmake/presets/ort_genai.cmake @@ -3,8 +3,6 @@ set(OCOS_ENABLE_GPT2_TOKENIZER ON CACHE INTERNAL "" FORCE) set(OCOS_ENABLE_C_API ON CACHE INTERNAL "" FORCE) -set(OCOS_ENABLE_CV2 ON CACHE INTERNAL "" FORCE) -set(OCOS_ENABLE_OPENCV_CODECS ON CACHE INTERNAL "" FORCE) set(OCOS_ENABLE_DLIB ON CACHE INTERNAL "" FORCE) set(OCOS_ENABLE_MATH ON CACHE INTERNAL "" FORCE) set(OCOS_ENABLE_AUDIO ON CACHE INTERNAL "" FORCE) diff --git a/include/ortx_utils.h b/include/ortx_utils.h index 0e6b951d7..e533650b8 100644 --- a/include/ortx_utils.h +++ b/include/ortx_utils.h @@ -105,6 +105,17 @@ extError_t ORTX_API_CALL OrtxTensorResultGetAt(OrtxTensorResult* result, size_t */ extError_t ORTX_API_CALL OrtxGetTensorType(OrtxTensor* tensor, extDataType_t* type); +/** + * @brief Retrieves the size of each element in the given tensor. + * + * This function calculates the size of each element in the specified tensor and stores it in the provided size variable. + * + * @param tensor A pointer to the OrtxTensor object. + * @param size A pointer to a size_t variable to store the size of each element. + * @return An extError_t value indicating the success or failure of the operation. 
+ */
+extError_t ORTX_API_CALL OrtxGetTensorSizeOfElement(OrtxTensor* tensor, size_t* size);
+
 /** \brief Get the data from the tensor
  *
  * \param tensor The tensor object
diff --git a/onnxruntime_extensions/pp_api.py b/onnxruntime_extensions/pp_api.py
index f30b742fd..4e3c05595 100644
--- a/onnxruntime_extensions/pp_api.py
+++ b/onnxruntime_extensions/pp_api.py
@@ -65,8 +65,16 @@ def __init__(self, processor_json):
         self.processor = create_processor(processor_json)
 
     def pre_process(self, images):
+        if isinstance(images, str):
+            images = [images]
+        if isinstance(images, list):
+            images = load_images(images)
         return image_pre_process(self.processor, images)
 
+    @staticmethod
+    def to_numpy(result):
+        return tensor_result_get_at(result, 0)
+
     def __del__(self):
         if delete_object and self.processor:
             delete_object(self.processor)
diff --git a/pyop/py_c_api.cc b/pyop/py_c_api.cc
index d1854072b..c2f57b561 100644
--- a/pyop/py_c_api.cc
+++ b/pyop/py_c_api.cc
@@ -85,15 +85,12 @@ void AddGlobalMethodsCApi(pybind11::module& m) {
         const int64_t* shape{};
         size_t num_dims;
         const void* data{};
-        size_t elem_size = 0;
-        if (tensor_type == extDataType_t::kOrtxInt64 || tensor_type == extDataType_t::kOrtxFloat) {
+        size_t elem_size = 1;
+        if (tensor_type == extDataType_t::kOrtxInt64 ||
+            tensor_type == extDataType_t::kOrtxFloat ||
+            tensor_type == extDataType_t::kOrtxUint8) {
           OrtxGetTensorData(tensor, reinterpret_cast<const void**>(&data), &shape, &num_dims);
-          elem_size = 4;
-          if (tensor_type == extDataType_t::kOrtxInt64) {
-            elem_size = 8;
-          }
-        } else if (tensor_type == extDataType_t::kOrtxUnknownType) {
-          throw std::runtime_error("Failed to get tensor type");
+          OrtxGetTensorSizeOfElement(tensor, &elem_size);
         } else if (tensor_type == extDataType_t::kOrtxUnknownType) {
           throw std::runtime_error("unsupported tensor type");
         }
@@ -108,6 +105,8 @@ void AddGlobalMethodsCApi(pybind11::module& m) {
           obj = py::array_t<float>(npy_dims);
         } else if (tensor_type == extDataType_t::kOrtxInt64) {
           obj = py::array_t<int64_t>(npy_dims);
+        } else if (tensor_type == extDataType_t::kOrtxUint8) {
+          obj = py::array_t<uint8_t>(npy_dims);
         }
         void* out_ptr = obj.mutable_data();
diff --git a/shared/api/c_api_utils.cc b/shared/api/c_api_utils.cc
index 9db7b1bde..feebe4448 100644
--- a/shared/api/c_api_utils.cc
+++ b/shared/api/c_api_utils.cc
@@ -103,7 +103,6 @@ extError_t ORTX_API_CALL OrtxTensorResultGetAt(OrtxTensorResult* result, size_t
   auto tensor_ptr = std::make_unique<TensorObject>();
   tensor_ptr->SetTensor(ts);
-  tensor_ptr->SetTensorType(result_ptr->GetTensorType(index));
   *tensor = static_cast<OrtxTensor*>(tensor_ptr.release());
   return extError_t();
 }
@@ -124,6 +123,24 @@ extError_t ORTX_API_CALL OrtxGetTensorType(OrtxTensor* tensor, extDataType_t* ty
   return extError_t();
 }
 
+extError_t ORTX_API_CALL OrtxGetTensorSizeOfElement(OrtxTensor* tensor, size_t* size) {
+  if (tensor == nullptr || size == nullptr) {
+    ReturnableStatus::last_error_message_ = "Invalid argument";
+    return kOrtxErrorInvalidArgument;
+  }
+
+  auto tensor_impl = static_cast<TensorObject*>(tensor);
+  if (tensor_impl->ortx_kind() != extObjectKind_t::kOrtxKindTensor) {
+    ReturnableStatus::last_error_message_ = "Invalid argument";
+    return kOrtxErrorInvalidArgument;
+  }
+
+  auto tb = tensor_impl->GetTensor();
+  assert(tb != nullptr);
+  *size = tb->SizeInBytes() / tb->NumberOfElement();
+  return extError_t();
+}
+
 extError_t ORTX_API_CALL OrtxGetTensorData(OrtxTensor* tensor, const void** data, const int64_t** shape,
                                            size_t* num_dims) {
   if (tensor == nullptr) {
@@ -158,3 +175,11 @@ extError_t ORTX_API_CALL OrtxGetTensorDataFloat(OrtxTensor* tensor, const float*
   *data = reinterpret_cast<const float*>(data_ptr);  // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
   return err;
 }
+
+extError_t ORTX_API_CALL OrtxGetTensorDataUint8(OrtxTensor* tensor, const uint8_t** data, const int64_t** shape,
+                                                size_t* num_dims) {
+  const void* data_ptr{};
+  auto err = OrtxGetTensorData(tensor, &data_ptr, shape, num_dims);
+  *data = reinterpret_cast<const uint8_t*>(data_ptr);  // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
+  return err;
+}
diff --git a/shared/api/c_api_utils.hpp b/shared/api/c_api_utils.hpp
index 46bd79ab3..37e749d1b 100644
--- a/shared/api/c_api_utils.hpp
+++ b/shared/api/c_api_utils.hpp
@@ -99,15 +99,56 @@ class TensorObject : public OrtxObjectImpl {
   ~TensorObject() override = default;
 
   void SetTensor(ortc::TensorBase* tensor) { tensor_ = tensor; }
-  void SetTensorType(extDataType_t type) { tensor_type_ = type; }
 
-  [[nodiscard]] extDataType_t GetTensorType() const { return tensor_type_; }
+  static extDataType_t GetDataType(ONNXTensorElementDataType dt) {
+    if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+      return extDataType_t::kOrtxFloat;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8) {
+      return extDataType_t::kOrtxUint8;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8) {
+      return extDataType_t::kOrtxInt8;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16) {
+      return extDataType_t::kOrtxUint16;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16) {
+      return extDataType_t::kOrtxInt16;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) {
+      return extDataType_t::kOrtxInt32;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) {
+      return extDataType_t::kOrtxInt64;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) {
+      return extDataType_t::kOrtxString;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL) {
+      return extDataType_t::kOrtxBool;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) {
+      return extDataType_t::kOrtxFloat16;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) {
+      return extDataType_t::kOrtxDouble;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32) {
+      return extDataType_t::kOrtxUint32;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64) {
+      return extDataType_t::kOrtxUint64;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64) {
+      return extDataType_t::kOrtxComplex64;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128) {
+      return extDataType_t::kOrtxComplex128;
+    } else if (dt == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) {
+      return extDataType_t::kOrtxBFloat16;
+    } else {
+      return extDataType_t::kOrtxUnknownType;
+    }
+  }
+
+  [[nodiscard]] extDataType_t GetTensorType() const {
+    if (tensor_ == nullptr) {
+      return extDataType_t::kOrtxUnknownType;
+    }
+    return GetDataType(tensor_->Type());
+  }
 
   [[nodiscard]] ortc::TensorBase* GetTensor() const { return tensor_; }
 
  private:
   ortc::TensorBase* tensor_{};
-  extDataType_t tensor_type_{extDataType_t::kOrtxUnknownType};
 };
 
 class TensorResult : public OrtxObjectImpl {
@@ -116,13 +157,8 @@ class TensorResult : public OrtxObjectImpl {
   ~TensorResult() override = default;
 
   void SetTensors(std::vector<std::unique_ptr<ortc::TensorBase>>&& tensors) { tensors_ = std::move(tensors); }
-  void SetTensorTypes(const std::vector<extDataType_t>& types) { tensor_types_ = types; }
   [[nodiscard]] size_t NumTensors() const { return tensors_.size(); }
-
-  [[nodiscard]] const std::vector<extDataType_t>& tensor_types() const { return tensor_types_; }
-
   [[nodiscard]] const std::vector<std::unique_ptr<ortc::TensorBase>>& tensors() const { return tensors_; }
-
   [[nodiscard]] std::vector<ortc::TensorBase*> GetTensors() const {
     std::vector<ortc::TensorBase*> ts;
     ts.reserve(tensors_.size());
@@ -139,16 +175,8 @@ class TensorResult : public OrtxObjectImpl {
     return nullptr;
   }
 
-  extDataType_t GetTensorType(size_t i) const {
-    if (i < tensor_types_.size()) {
-      return tensor_types_[i];
-    }
-    return extDataType_t::kOrtxUnknownType;
-  }
-
  private:
   std::vector<std::unique_ptr<ortc::TensorBase>> tensors_;
-  std::vector<extDataType_t> tensor_types_;
 };
 
 struct ReturnableStatus {
diff --git a/shared/api/image_decoder.hpp b/shared/api/image_decoder.hpp
new file mode 100644
index 000000000..bc588e539
--- /dev/null
+++ b/shared/api/image_decoder.hpp
@@ -0,0 +1,145 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstring>
+
+#include "png.h"
+#include "jpeglib.h"
+#include "op_def_struct.h"
+#include "ext_status.h"
+
+class JMemorySourceManager : public jpeg_source_mgr {
+ public:
+  // Constructor
+  JMemorySourceManager(const uint8_t* encoded_image_data, const int64_t encoded_image_data_len) {
+    // Initialize source fields
+    next_input_byte = reinterpret_cast<const JOCTET*>(encoded_image_data);
+    bytes_in_buffer = static_cast<size_t>(encoded_image_data_len);
+    init_source = &JMemorySourceManager::initSource;
+    fill_input_buffer = &JMemorySourceManager::fillInputBuffer;
+    skip_input_data = &JMemorySourceManager::skipInputData;
+    resync_to_restart = jpeg_resync_to_restart;
+    term_source = &JMemorySourceManager::termSource;
+  }
+
+  // Initialize source (no-op)
+  static void initSource(j_decompress_ptr cinfo) {
+    // No initialization needed
+  }
+
+  // Fill input buffer (not used here, always return FALSE)
+  static boolean fillInputBuffer(j_decompress_ptr cinfo) {
+    return FALSE;  // Buffer is managed manually
+  }
+
+  // Skip input data
+  static void skipInputData(j_decompress_ptr cinfo, long num_bytes) {
+    JMemorySourceManager* srcMgr = reinterpret_cast<JMemorySourceManager*>(cinfo->src);
+    if (num_bytes > 0) {
+      size_t bytes_to_skip = static_cast<size_t>(num_bytes);
+      while (bytes_to_skip > srcMgr->bytes_in_buffer) {
+        bytes_to_skip -= srcMgr->bytes_in_buffer;
+        if (srcMgr->fillInputBuffer(cinfo)) {
+          // Error: buffer ran out
+          srcMgr->extError = kOrtxErrorCorruptData;
+        }
+      }
+      srcMgr->next_input_byte += bytes_to_skip;
+      srcMgr->bytes_in_buffer -= bytes_to_skip;
+    }
+  }
+
+  // Terminate source (no-op)
+  static void termSource(j_decompress_ptr cinfo) {
+    // No cleanup needed
+  }
+
+  extError_t extError{kOrtxOK};  // Error handler
+};
+
+inline OrtxStatus image_decoder(const ortc::Tensor<uint8_t>& input, ortc::Tensor<uint8_t>& output) {
+  const auto& dimensions = input.Shape();
+  if (dimensions.size() != 1ULL) {
+    return {kOrtxErrorInvalidArgument, "[ImageDecoder]: Only raw image formats are supported."};
+  }
+
+  // Get data & the length
+  const uint8_t* encoded_image_data = input.Data();
+  const int64_t encoded_image_data_len = input.NumberOfElement();
+
+  // check it's a PNG image or JPEG image
+  if (encoded_image_data_len < 8) {
+    return {kOrtxErrorInvalidArgument, "[ImageDecoder]: Invalid image data."};
+  }
+
+  OrtxStatus status{};
+  if (png_sig_cmp(encoded_image_data, 0, 8) == 0) {
+    // Decode the PNG image
+    png_image image;
+    std::memset(&image, 0, sizeof(image));  // Use std::memset for clarity
+    image.version = PNG_IMAGE_VERSION;
+
+    if (png_image_begin_read_from_memory(&image, encoded_image_data, static_cast<size_t>(encoded_image_data_len)) ==
+        0) {
+      return {kOrtxErrorInvalidArgument, "[ImageDecoder]: Failed to read PNG image."};
+    }
+
+    image.format = PNG_FORMAT_RGB;  // Ensure you have the appropriate format
+    const int height = image.height;
+    const int width = image.width;
+    const int channels =
+        PNG_IMAGE_PIXEL_CHANNELS(image.format);  // Calculates the number of channels based on format
+
+    std::vector<int64_t> output_dimensions{height, width, channels};
+
+    uint8_t* decoded_image_data = output.Allocate(output_dimensions);
+    if (decoded_image_data == nullptr) {
+      return {kOrtxErrorInvalidArgument, "[ImageDecoder]: Failed to allocate memory for decoded image data."};
+    }
+
+    if (png_image_finish_read(&image, nullptr, decoded_image_data, 0, nullptr) == 0) {
+      return {kOrtxErrorInvalidArgument, "[ImageDecoder]: Failed to decode PNG image."};
+    }
+  } else {
+    // Initialize JPEG decompression object
+    jpeg_decompress_struct cinfo;
+    jpeg_error_mgr jerr;
+    cinfo.err = jpeg_std_error(&jerr);
+    jpeg_create_decompress(&cinfo);
+
+    // Set up the custom memory source manager
+    JMemorySourceManager srcManager(encoded_image_data, encoded_image_data_len);
+    cinfo.src = &srcManager;
+
+    // Read the JPEG header to get image info
+    jpeg_read_header(&cinfo, TRUE);
+
+    // Start decompression
+    jpeg_start_decompress(&cinfo);
+
+    // Allocate memory for the image
+    std::vector<int64_t> output_dimensions{cinfo.output_height, cinfo.output_width, cinfo.output_components};
+    uint8_t* imageBuffer = output.Allocate(output_dimensions);
+
+    // Read the image data
+    int row_stride = cinfo.output_width * cinfo.output_components;
+    while (cinfo.output_scanline < cinfo.output_height) {
+      uint8_t* row_ptr = imageBuffer + (cinfo.output_scanline * row_stride);
+      jpeg_read_scanlines(&cinfo, &row_ptr, 1);
+      if (srcManager.extError != kOrtxOK) {
+        break;
+      }
+    }
+
+    if (srcManager.extError != kOrtxOK) {
+      status = {srcManager.extError, "[ImageDecoder]: Failed to decode JPEG image."};
+    }
+
+    // Finish decompression
+    jpeg_finish_decompress(&cinfo);
+    jpeg_destroy_decompress(&cinfo);
+  }
+
+  return status;
+}
diff --git a/shared/api/image_processor.cc b/shared/api/image_processor.cc
index 833cc236d..8fe9dc0bd 100644
--- a/shared/api/image_processor.cc
+++ b/shared/api/image_processor.cc
@@ -8,7 +8,7 @@
 #include "image_processor.h"
 #include "c_api_utils.hpp"
 
-#include "cv2/imgcodecs/imdecode.hpp"
+#include "image_decoder.hpp"
 #include "image_transforms.hpp"
 #include "image_transforms_phi_3.hpp"
 
@@ -179,7 +179,7 @@ OrtxStatus ImageProcessor::PreProcess(ort_extensions::span<ImageRawData> image_d
   operations_.back()->ResetTensors(allocator_);
   if (status.IsOk()) {
     r.SetTensors(std::move(img_result));
-    r.SetTensorTypes({kOrtxFloat, kOrtxInt64, kOrtxInt64});
+    // r.SetTensorTypes({kOrtxFloat, kOrtxInt64, kOrtxInt64});
   }
 
   return status;
diff --git a/test/data/processor/image_to_numpy.py b/test/data/processor/image_to_numpy.py
new file mode 100644
index 000000000..9e14940da
--- /dev/null
+++ b/test/data/processor/image_to_numpy.py
@@ -0,0 +1,41 @@
+import os
+import tempfile
+from PIL import Image
+
+from onnxruntime_extensions.pp_api import ImageProcessor
+
+img_proc = ImageProcessor(R"""
+{
+  "processor": {
+    "name": "image_processing",
+    "transforms": [
+      {
+        "operation": {
+          "name": "decode_image",
+          "type": "DecodeImage",
+          "attrs": {
+            "color_space": "BGR"
+          }
+        }
+      },
+      {
+        "operation": {
+          "name": "convert_to_rgb",
+          "type": "ConvertRGB"
+        }
+      }
+    ]
+  }
+}""")
+
+img_name = "australia.jpg"
+result = img_proc.pre_process(os.path.dirname(__file__) + "/" + img_name)
+np_img = img_proc.to_numpy(result)
+print(np_img.shape, np_img.dtype)
+
+# can save the image back to disk
+img_rgb = np_img[0]
+img_bgr = img_rgb[..., ::-1]
+output_name = tempfile.gettempdir() + "/" + img_name
+Image.fromarray(img_bgr).save(output_name)
+print(output_name)
diff --git
a/test/data/processor/proctest.py b/test/data/processor/proctest.py new file mode 100644 index 000000000..9c807dc55 --- /dev/null +++ b/test/data/processor/proctest.py @@ -0,0 +1,76 @@ +import os +import tempfile +from PIL import Image +from transformers import AutoProcessor +from onnxruntime_extensions.pp_api import create_processor, load_images, image_pre_process, tensor_result_get_at + +import numpy as np + + +def regen_image(arr): + mean = np.array([0.48145466, 0.4578275, 0.40821073]) + std = np.array([0.26862954, 0.26130258, 0.27577711]) + + # Reverse normalization + array = arr * std + mean + + # Clip the values to [0, 1] range + array = np.clip(array, 0, 1) + + # Convert to [0, 255] range and uint8 type + array = (array * 255).astype(np.uint8) + + # Convert NumPy array to PIL Image + image = Image.fromarray(array) + return image + + +test_image = "test/data/processor/passport.png" +# test_image = "/temp/passport_s.png" +# test_image = "/temp/passport_s2.png" +model_id = "microsoft/Phi-3-vision-128k-instruct" + +processor = create_processor("test/data/processor/phi_3_image.json") +images = load_images([test_image]) +c_out = image_pre_process(processor, images) +# print(tensor_result_get_at(c_out, 0)) +# print(tensor_result_get_at(c_out, 1)) + +image = Image.open(test_image) +processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) +messages = [ + {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"}, + {"role": "assistant", "content": "The chart displays the percentage of respondents who agree with various statements about their preparedness for meetings. It shows five categories: 'Having clear and pre-defined goals for meetings', 'Knowing where to find the information I need for a meeting', 'Understanding my exact role and responsibilities when I'm invited', 'Having tools to manage admin tasks like note-taking or summarization', and 'Having more focus time to sufficiently prepare for meetings'. Each category has an associated bar indicating the level of agreement, measured on a scale from 0% to 100%."}, + {"role": "user", "content": "Provide insightful questions to spark discussion."} +] +prompt = processor.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) + + +inputs = processor(prompt, [image], return_tensors="pt") +# print(inputs["pixel_values"].numpy()) +# print(inputs["image_sizes"]) + +np.testing.assert_allclose( + inputs["image_sizes"].numpy(), tensor_result_get_at(c_out, 1)) +# np.testing.assert_allclose(inputs["pixel_values"].numpy(), tensor_result_get_at(c_out, 0), rtol=1e-1) + +if os.path.exists("/temp"): + temp_dir = "/temp" +else: + temp_dir = tempfile.mkdtemp() + print(f"Created temp dir: {temp_dir}") + +for i in range(17): + expected = inputs["pixel_values"].numpy()[0, i] + actual = tensor_result_get_at(c_out, 0)[0, i] + e_image = regen_image(expected.transpose(1, 2, 0)) + a_image = regen_image(actual.transpose(1, 2, 0)) + e_image.save(f"{temp_dir}/e_{i}.png") + a_image.save(f"{temp_dir}/a_{i}.png") + + try: + np.testing.assert_allclose(inputs["pixel_values"].numpy( + )[0, i], tensor_result_get_at(c_out, 0)[0, i], rtol=1e-2) + except AssertionError as e: + print(str(e)) diff --git a/test/pp_api_test/test_imgcodec.cc b/test/pp_api_test/test_imgcodec.cc new file mode 100644 index 000000000..87450cd7d --- /dev/null +++ b/test/pp_api_test/test_imgcodec.cc @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "shared/api/c_api_utils.hpp"
+#include "shared/api/image_decoder.hpp"
+
+using namespace ort_extensions;
+
+TEST(ImgDecoderTest, TestPngDecoder) {
+  std::vector<uint8_t> png_data;
+  std::filesystem::path png_path = "data/processor/exceltable.png";
+  std::ifstream png_file(png_path, std::ios::binary);
+  ASSERT_TRUE(png_file.is_open());
+  png_file.seekg(0, std::ios::end);
+  png_data.resize(png_file.tellg());
+  png_file.seekg(0, std::ios::beg);
+  png_file.read(reinterpret_cast<char*>(png_data.data()), png_data.size());
+  png_file.close();
+
+  ortc::Tensor<uint8_t> png_tensor({static_cast<int64_t>(png_data.size())}, png_data.data());
+  ortc::Tensor<uint8_t> out_tensor{&CppAllocator::Instance()};
+  auto status = image_decoder(png_tensor, out_tensor);
+  ASSERT_TRUE(status.IsOk());
+
+  ASSERT_EQ(out_tensor.Shape(), std::vector<int64_t>({206, 487, 3}));
+  auto out_range = out_tensor.Data() + 0;
+  ASSERT_EQ(std::vector<uint8_t>(out_range, out_range + 12),
+            std::vector<uint8_t>({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+  out_range = out_tensor.Data() + 477 * 3;
+  ASSERT_EQ(std::vector<uint8_t>(out_range, out_range + 12),
+            std::vector<uint8_t>({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+  out_range = out_tensor.Data() + 243 * 206 * 3;
+  ASSERT_EQ(std::vector<uint8_t>(out_range, out_range + 12),
+            std::vector<uint8_t>({217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217}));
+
+  out_range = out_tensor.Data() + 485 * 206 * 3;
+  ASSERT_EQ(std::vector<uint8_t>(out_range, out_range + 12),
+            std::vector<uint8_t>({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+}
+
+TEST(ImageDecoderTest, TestJpegDecoder) {
+  std::vector<uint8_t> jpeg_data;
+  std::filesystem::path jpeg_path = "data/processor/australia.jpg";
+  std::ifstream jpeg_file(jpeg_path, std::ios::binary);
+  ASSERT_TRUE(jpeg_file.is_open());
+  jpeg_file.seekg(0, std::ios::end);
+  jpeg_data.resize(jpeg_file.tellg());
+  jpeg_file.seekg(0, std::ios::beg);
+  jpeg_file.read(reinterpret_cast<char*>(jpeg_data.data()), jpeg_data.size());
+  jpeg_file.close();
+
+  ortc::Tensor<uint8_t> jpeg_tensor({static_cast<int64_t>(jpeg_data.size())}, jpeg_data.data());
+  ortc::Tensor<uint8_t> out_tensor{&CppAllocator::Instance()};
+  auto status = image_decoder(jpeg_tensor, out_tensor);
+  ASSERT_TRUE(status.IsOk());
+
+  ASSERT_EQ(out_tensor.Shape(), std::vector<int64_t>({876, 1300, 3}));
+  auto out_range = out_tensor.Data() + 0;
+  ASSERT_EQ(std::vector<uint8_t>(out_range, out_range + 12),
+            std::vector<uint8_t>({48, 14, 5, 48, 14, 5, 48, 14, 5, 48, 14, 5}));
+
+  out_range = out_tensor.Data() + 1296 * 3;
+  ASSERT_EQ(std::vector<uint8_t>(out_range, out_range + 12),
+            std::vector<uint8_t>({221, 237, 224, 225, 236, 219, 218, 222, 199, 203, 202, 174}));
+
+  out_range = out_tensor.Data() + 438 * 1300 * 3;
+  ASSERT_EQ(std::vector<uint8_t>(out_range, out_range + 12),
+            std::vector<uint8_t>({84, 68, 55, 86, 70, 55, 92, 77, 58, 101, 86, 65}));
+
+  out_range = out_tensor.Data() + 875 * 1300 * 3 + 1296 * 3;
+  ASSERT_EQ(std::vector<uint8_t>(out_range, out_range + 12),
+            std::vector<uint8_t>({208, 210, 197, 204, 206, 193, 198, 200, 187, 194, 196, 183}));
+}
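
Editorial note: the sketch below is not part of the patch. It shows how a caller might combine the new OrtxGetTensorSizeOfElement with the OrtxGetTensorType and OrtxGetTensorData calls declared in include/ortx_utils.h, replacing the hard-coded elem_size = 4/8 logic this change removes from pyop/py_c_api.cc. The helper name TensorByteSize is illustrative only, and kOrtxOK is assumed to be the library's success code.

#include <cstddef>
#include <cstdint>

#include "ortx_utils.h"

// Returns the payload size of `tensor` in bytes, or 0 if any query fails.
inline size_t TensorByteSize(OrtxTensor* tensor) {
  extDataType_t type = extDataType_t::kOrtxUnknownType;
  if (OrtxGetTensorType(tensor, &type) != kOrtxOK || type == extDataType_t::kOrtxUnknownType) {
    return 0;
  }

  // Element width now comes from the API instead of a per-type switch.
  size_t elem_size = 0;
  if (OrtxGetTensorSizeOfElement(tensor, &elem_size) != kOrtxOK) {
    return 0;
  }

  const void* data{};
  const int64_t* shape{};
  size_t num_dims = 0;
  if (OrtxGetTensorData(tensor, &data, &shape, &num_dims) != kOrtxOK) {
    return 0;
  }

  // Multiply the dimensions to get the element count, then scale by element width.
  size_t num_elems = 1;
  for (size_t i = 0; i < num_dims; ++i) {
    num_elems *= static_cast<size_t>(shape[i]);
  }
  return num_elems * elem_size;
}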