Merge branch 'main' of https://github.com/microsoft/onnxruntime-extensions into sayanshaw/chat-template
microsoft · Mar 5, 2025 · 8bf98ef · 8bf98ef
2 parents 9955f0a + 4c3ae1b
commit 8bf98ef
Show file tree

Hide file tree

Showing 8 changed files with 38 additions and 37 deletions.
diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt
@@ -45,7 +45,7 @@ blingfire 0831265c1aca95ca02eca5bf1155e4251e545328
 
 _____
 
-dlib v19.24.6
+dlib v19.24.7
 
 Boost Software License - Version 1.0 - August 17th, 2003
 
@@ -122,7 +122,7 @@ Viatcheslav Ostapenko <[email protected]>
 
 _____
 
-nlohmann/json v3.10.5
+nlohmann/json v3.11.3
 
 MIT License
 

diff --git a/cgmanifest.json b/cgmanifest.json
@@ -34,7 +34,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "v19.24.6",
+          "commitHash": "v19.24.7",
           "repositoryUrl": "https://github.com/davisking/dlib.git"
         }
       }

diff --git a/cmake/externals/dlib.cmake b/cmake/externals/dlib.cmake
@@ -1,7 +1,7 @@
 FetchContent_Declare(
     dlib
-    URL https://github.com/davisking/dlib/archive/d3520131a6e0e0fb62bc556b0222b45d99caf905.zip
-    URL_HASH SHA1=26b0eb3063da744a11144ae620b61fd1fb90fb39
+    URL https://github.com/davisking/dlib/archive/refs/tags/v19.24.7.zip
+    URL_HASH SHA1=6c63ea576e2b525751b0dead27c6c1139c5100ae
     DOWNLOAD_EXTRACT_TIMESTAMP TRUE
     SOURCE_SUBDIR  not_set
 )

diff --git a/operators/tokenizer/bpe_kernels.cc b/operators/tokenizer/bpe_kernels.cc
@@ -656,13 +656,16 @@ void JsonFastTokenizer::LoadSpmModelParams(const json& tok_json) {
       for (const auto& step : *decoders_node) {
         std::string type = step.value("type", "");
         if (type == "Replace") {
-          std::string target = step.value("/pattern/String"_json_pointer, "");
+            std::string target = "";
+            if (step.contains("pattern")) {
+              target = step["pattern"].value("String", "");
+            }
           if (target == spm_escaped_space) {
             json_conf_.spm_model_ = true;
           }
         }
         else if (type == "Strip") {
-          std::string content = step.value("/content"_json_pointer, "");
+          std::string content = step.value("content", "");
           if (content == " ") {
             json_conf_.add_dummy_prefix_ = true;
           }

diff --git a/operators/tokenizer/tokenizer_jsconfig.hpp b/operators/tokenizer/tokenizer_jsconfig.hpp
@@ -45,7 +45,6 @@ class TokenJsonConfig final {
   TokenJsonConfig() {}
   ~TokenJsonConfig() {}
   using json = nlohmann::json;
-  using json_pointer = nlohmann::json_pointer<std::string>;
   std::shared_ptr<json> added_tokens_decoder;
 
  public:

diff --git a/shared/api/runner.hpp b/shared/api/runner.hpp
@@ -590,7 +590,8 @@ class OrtxRunner {
         if (shape != ts[axis]->Shape()) {
           is_same_shape = false;
           auto dtype = ts[axis]->Type();
-          if (dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 && dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+          if (dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 &&
+            dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL) {
             return {kOrtxErrorInvalidArgument, "[StackTensors]: shapes of tensors to stack are not the same."};
           }
           if (IsGreaterShape(ts[axis]->Shape(), shape)) {

diff --git a/shared/api/speech_features.hpp b/shared/api/speech_features.hpp
@@ -51,16 +51,12 @@ class SpeechFeatures {
     return stft_norm_.Compute(pcm, n_fft_, hop_length_, {fft_win_.data(), fft_win_.size()}, n_fft_, stft_norm);
   }
 
-  OrtxStatus SpeechLibSTFTNorm(const ortc::Tensor<float>& pcm,
-                               ortc::Tensor<float>& stft_norm,
-                               ortc::Tensor<int64_t>& audio_frames) {
-    constexpr int64_t feat_stride = 1;
+  OrtxStatus SpeechLibSTFTNorm(const ortc::Tensor<float>& pcm, ortc::Tensor<float>& stft_norm) {
     const float preemphasis = 0.97f;
     // # Spec 1: SpeechLib cut remaining sample insufficient for a hop
     // n_batch = (wav.shape[0] - win_length) // hop_length + 1
     auto pcm_length = pcm.Shape()[1];
     auto n_batch = (pcm_length - frame_length_) / hop_length_ + 1;
-    audio_frames.Allocate({1})[0] = n_batch * feat_stride;
     auto pcm_data = pcm.Data();
     dlib::matrix<float> dm_x = dlib::mat(pcm_data, 1, pcm_length);
 
@@ -609,26 +605,24 @@ class Phi4AudioEmbed {
   OrtxStatus Compute(const ortc::Tensor<float>& pcm,
                      const ortc::Tensor<int64_t>& sr,
                      ortc::Tensor<float>& ts_logmel,
-                     ortc::Tensor<int64_t>& audio_frames,
+                     ortc::Tensor<bool>& audio_attention_mask,
                      ortc::Tensor<int64_t>& embeded_size) {
     int64_t sr_val = sr.Data()[0];
     ortc::Tensor<float> stft_norm(&CppAllocator::Instance());
-    ortc::Tensor<int64_t> num_audio_frames(&CppAllocator::Instance());
     SpeechFeatures stft_normal;
     stft_normal.Init(sr_val == 8000? stft_normal_8k_attrs_: stft_normal_attrs_);
-    auto status = stft_normal.SpeechLibSTFTNorm(pcm, stft_norm, num_audio_frames);
+    auto status = stft_normal.SpeechLibSTFTNorm(pcm, stft_norm);
     if (!status.IsOk()) {
       return status;
     }
-
-    SpeechLibLogMel logmel;
-    // already checked in Init
 
     // Currently we only support 8k and 16k Hz sampling rate.
     if (sr_val != 8000 && sr_val != 16000){
-      return OrtxStatus(kOrtxErrorNotImplemented, "Currently only 8k and 16k Hz sampling rate is supported. Please resample your audio file with unsupported audio sampling rate: " + sr_val);
+      return {kOrtxErrorInvalidArgument, "Only 8k and 16k Hz target sampling rate is supported."};
     }
 
+    SpeechLibLogMel logmel;
+    // attributes already are verified in Init method
     logmel.Init(sr_val == 8000 ? logmel_8k_attrs_: logmel_attrs_);
     status = logmel.Compute(stft_norm, ts_logmel);
     if (!status.IsOk()) {
@@ -648,10 +642,13 @@ class Phi4AudioEmbed {
 
         return result
     */
+    auto audio_frames = ts_logmel.Shape()[0];
     auto embedded_size_data = embeded_size.Allocate({1});
-    embedded_size_data[0] = std::ceil(static_cast<float>(ts_logmel.Shape()[0]) / audio_compression_rate_);
+    embedded_size_data[0] = std::ceil(static_cast<float>(audio_frames) / audio_compression_rate_);
 
-    audio_frames.Allocate({1})[0] = num_audio_frames.Data()[0];
+    constexpr int64_t feat_stride = 1;
+    auto attention = audio_attention_mask.Allocate({audio_frames * feat_stride});
+    std::memset(attention, 1, audio_frames * feat_stride * sizeof(bool));
     return status;
   }
 

diff --git a/test/pp_api_test/test_feature_extraction.cc b/test/pp_api_test/test_feature_extraction.cc
@@ -62,16 +62,17 @@ TEST(ExtractorTest, TestPhi4AudioFeatureExtraction) {
   ASSERT_EQ(std::vector<int64_t>(shape, shape + num_dims), std::vector<int64_t>({3, 1344, 80}));
 
   tensor.reset();
-  const int64_t* audio_frames{};
-  const int64_t* audio_frames_shape{};
-  size_t audio_frames_num_dims;
+  const bool* audio_attention_mask{};
+  const int64_t* audio_mask_shape{};
+  size_t audio_mask_dims;
   err = OrtxTensorResultGetAt(result.get(), 1, tensor.ToBeAssigned());
   ASSERT_EQ(err, kOrtxOK);
-  err = OrtxGetTensorData(tensor.get(), reinterpret_cast<const void**>(&audio_frames), &audio_frames_shape, &audio_frames_num_dims);
+  err = OrtxGetTensorData(tensor.get(), reinterpret_cast<const void**>(&audio_attention_mask), &audio_mask_shape, &audio_mask_dims);
   ASSERT_EQ(err, kOrtxOK);
-  ASSERT_EQ(std::vector<int64_t>(audio_frames_shape, audio_frames_shape + audio_frames_num_dims), std::vector<int64_t>({3, 1}));
-  const size_t num_elements = std::accumulate(audio_frames_shape, audio_frames_shape + audio_frames_num_dims, 1, std::multiplies<size_t>());
-  ASSERT_EQ(std::vector<int64_t>(audio_frames, audio_frames + num_elements), std::vector<int64_t>({1098, 1332, 1344}));
+  ASSERT_EQ(std::vector<int64_t>(audio_mask_shape, audio_mask_shape + audio_mask_dims), std::vector<int64_t>({3, 1344}));
+  ASSERT_EQ(std::count(audio_attention_mask + 0 * 1344, audio_attention_mask + 1 * 1344, true), 1098);
+  ASSERT_EQ(std::count(audio_attention_mask + 1 * 1344, audio_attention_mask + 2 * 1344, true), 1332);
+  ASSERT_EQ(std::count(audio_attention_mask + 2 * 1344, audio_attention_mask + 3 * 1344, true), 1344);
 
   tensor.reset();
   err = OrtxTensorResultGetAt(result.get(), 2, tensor.ToBeAssigned());
@@ -109,16 +110,16 @@ TEST(ExtractorTest, TestPhi4AudioFeatureExtraction8k) {
   ASSERT_EQ(std::vector<int64_t>(shape, shape + num_dims), std::vector<int64_t>({1, 2938, 80}));
 
   tensor.reset();
-  const int64_t* audio_frames{};
-  const int64_t* audio_frames_shape{};
-  size_t audio_frames_num_dims;
+  const bool* audio_attention_mask{};
+  const int64_t* audio_mask_shape{};
+  size_t audio_mask_dims{};
   err = OrtxTensorResultGetAt(result.get(), 1, tensor.ToBeAssigned());
   ASSERT_EQ(err, kOrtxOK);
-  err = OrtxGetTensorData(tensor.get(), reinterpret_cast<const void**>(&audio_frames), &audio_frames_shape, &audio_frames_num_dims);
+  err = OrtxGetTensorData(tensor.get(), reinterpret_cast<const void**>(&audio_attention_mask), &audio_mask_shape, &audio_mask_dims);
   ASSERT_EQ(err, kOrtxOK);
-  ASSERT_EQ(std::vector<int64_t>(audio_frames_shape, audio_frames_shape + audio_frames_num_dims), std::vector<int64_t>({1, 1}));
-  const size_t num_elements = std::accumulate(audio_frames_shape, audio_frames_shape + audio_frames_num_dims, 1, std::multiplies<size_t>());
-  ASSERT_EQ(std::vector<int64_t>(audio_frames, audio_frames + num_elements), std::vector<int64_t>({2938}));
+  ASSERT_EQ(std::vector<int64_t>(audio_mask_shape, audio_mask_shape + audio_mask_dims), std::vector<int64_t>({1, 2938}));
+  const size_t num_elements = std::count(audio_attention_mask, audio_attention_mask + 2938, true);
+  ASSERT_EQ(num_elements, 2938);
 
   tensor.reset();
   err = OrtxTensorResultGetAt(result.get(), 2, tensor.ToBeAssigned());