
Commit

Merge branch 'main' into snnn-patch-1
wenbingl authored Feb 27, 2025
2 parents 7adfde4 + f63857d commit 67eff96
Showing 12 changed files with 414 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .pipelines/ci.yml
@@ -740,7 +740,7 @@ stages:
    steps:
    - script: |
        cd $(Build.BinariesDirectory)
-       git clone https://github.com/emscripten-core/emsdk --depth 1 --branch 3.1.74
+       git clone https://github.com/emscripten-core/emsdk --depth 1 --branch 4.0.3
        emsdk/emsdk install latest
        emsdk/emsdk activate latest
      displayName: Setup emscripten pipeline
4 changes: 2 additions & 2 deletions cmake/externals/dlib.cmake
@@ -1,7 +1,7 @@
FetchContent_Declare(
  dlib
- URL https://github.com/davisking/dlib/archive/refs/tags/v19.24.6.zip
- URL_HASH SHA1=59b1fb4e9909697c646e4f74e94871dacf49f0bf
+ URL https://github.com/davisking/dlib/archive/d3520131a6e0e0fb62bc556b0222b45d99caf905.zip
+ URL_HASH SHA1=26b0eb3063da744a11144ae620b61fd1fb90fb39
  DOWNLOAD_EXTRACT_TIMESTAMP TRUE
  SOURCE_SUBDIR not_set
)
26 changes: 26 additions & 0 deletions operators/tokenizer/bpe_kernels.cc
@@ -682,6 +682,32 @@ void JsonFastTokenizer::UpdateTokenizer(const TokenJsonConfig& config, const jso
    }
  }

+  std::shared_ptr<json> added_tokens_decoder = config.added_tokens_decoder;
+
+  // Add any tokens from added_tokens_decoder that are missing from added_tokens_
+  if (added_tokens_decoder && !added_tokens_decoder->empty()) {
+    for (const auto& [id_str, token] : added_tokens_decoder->items()) {
+      int id = std::stoi(id_str);  // convert the key (token ID) from string to integer
+
+      // Check whether this token is already present in added_tokens_
+      auto existing_token = added_tokens_.find(ustring(token.value("content", "")));
+      if (existing_token == added_tokens_.end()) {  // token doesn't exist yet
+        // Prepare a new token, populating its ID from the added_tokens_decoder key
+        AddedToken added_token;
+        added_token.id_ = id;
+        added_token.content_ = token.value("content", "");
+        added_token.lstrip_ = token.value("lstrip", false);
+        added_token.normalized_ = token.value("normalized", false);
+        added_token.rstrip_ = token.value("rstrip", false);
+        added_token.single_word_ = token.value("single_word", false);
+        added_token.special_ = token.value("special", false);
+
+        // Add the new token to added_tokens_
+        added_tokens_.emplace(ustring(added_token.content_), added_token);
+      }
+    }
+  }
+
  // iterate the added_tokens_ map and set the special tokens
  for (const auto& [key, added_token] : added_tokens_) {
    if (added_token.content_ == config.bos_token_) {
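
A minimal standalone sketch of the same merge, using nlohmann::json (already a dependency here) but with a plain std::map and a simplified AddedToken standing in for the repo's internal types; those stand-ins are assumptions for illustration, not the actual classes:

#include <iostream>
#include <map>
#include <string>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

struct AddedToken {  // simplified stand-in for the repo's AddedToken
  int id_ = 0;
  std::string content_;
  bool lstrip_ = false, normalized_ = false, rstrip_ = false;
  bool single_word_ = false, special_ = false;
};

int main() {
  // added_tokens_decoder keys are token IDs serialized as strings.
  json decoder = json::parse(R"({
    "32007": {"content": "<|end|>", "lstrip": false, "normalized": false,
              "rstrip": false, "single_word": false, "special": true}
  })");

  std::map<std::string, AddedToken> added_tokens;  // tokens already known
  for (const auto& [id_str, tok] : decoder.items()) {
    std::string content = tok.value("content", "");
    if (added_tokens.find(content) == added_tokens.end()) {
      AddedToken t;
      t.id_ = std::stoi(id_str);  // string key -> integer ID
      t.content_ = content;
      t.lstrip_ = tok.value("lstrip", false);
      t.normalized_ = tok.value("normalized", false);
      t.rstrip_ = tok.value("rstrip", false);
      t.single_word_ = tok.value("single_word", false);
      t.special_ = tok.value("special", false);
      added_tokens.emplace(content, t);
    }
  }
  std::cout << added_tokens["<|end|>"].id_ << "\n";  // prints 32007
}
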
6 changes: 5 additions & 1 deletion operators/tokenizer/tokenizer_jsconfig.hpp
@@ -46,6 +46,7 @@ class TokenJsonConfig final {
  ~TokenJsonConfig() {}
  using json = nlohmann::json;
  using json_pointer = nlohmann::json_pointer<std::string>;
+  std::shared_ptr<json> added_tokens_decoder;

 public:
  OrtxStatus AppendModuleJson(json& json_config) {
@@ -116,7 +117,7 @@ class TokenJsonConfig final {
        vocab_stream = std::make_unique<std::istringstream>(vocab_str);
      }
    } else {
-      auto ifs = std::make_unique<std::ifstream>(vocab_path_);
+      auto ifs = std::make_unique<std::ifstream>(path(vocab_path_.data()).open());
      if (!ifs->is_open()) {
        return OrtxStatus(extError_t::kOrtxErrorInvalidArgument, vocab_path_ + ": does not exist.");
      }
@@ -191,6 +192,9 @@
      return OrtxStatus(kOrtxErrorInvalidArgument, "Failed to parse config json.");
    }

+    // Store added_tokens_decoder so UpdateTokenizer can add any missed tokens into added_tokens_
+    added_tokens_decoder = std::make_shared<json>(json_config.value("added_tokens_decoder", json::object()));
+
    auto module_cfg = tok_dir / "tokenizer_module.json";
    if (module_cfg.exists()) {
      std::ifstream module_ifs = module_cfg.open();
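
A note on the json::value() accessor used for the new field: it returns the mapped value when the key exists and the supplied default otherwise, so a tokenizer config with no added_tokens_decoder section loads as an empty object instead of throwing. A small standalone sketch (not repo code):

#include <cassert>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
  json with = json::parse(R"({"added_tokens_decoder": {"32007": {"content": "<|end|>"}}})");
  json without = json::parse(R"({"model_max_length": 4096})");

  // value() falls back to the supplied default when the key is absent.
  json a = with.value("added_tokens_decoder", json::object());
  json b = without.value("added_tokens_decoder", json::object());

  assert(a.size() == 1);  // the section was present
  assert(b.empty());      // absent section -> empty object, no exception
}
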
8 changes: 5 additions & 3 deletions shared/api/c_api_processor.cc
@@ -31,21 +31,23 @@ struct RawImagesObject : public OrtxObjectImpl {
  size_t num_images{};
};

-extError_t ORTX_API_CALL
-OrtxCreateRawImages(OrtxRawImages** images, const void* data[], const int64_t sizes[], size_t num_images) {
+extError_t ORTX_API_CALL OrtxCreateRawImages(OrtxRawImages** images, const void* data[], const int64_t sizes[],
+                                             size_t num_images) {
  if (images == nullptr || data == nullptr || sizes == nullptr) {
    ReturnableStatus::last_error_message_ = "Invalid argument";
    return kOrtxErrorInvalidArgument;
  }

  auto images_obj = std::make_unique<RawImagesObject>();
  images_obj->images = std::make_unique<ImageRawData[]>(num_images);
+  images_obj->num_images = num_images;
  for (size_t i = 0; i < num_images; ++i) {
    images_obj->images[i].resize(static_cast<size_t>(sizes[i]));
    std::copy_n(static_cast<const uint8_t*>(data[i]), sizes[i], images_obj->images[i].data());
  }

-  return {};
+  *images = static_cast<OrtxRawImages*>(images_obj.release());
+  return extError_t();
}

extError_t ORTX_API_CALL OrtxLoadImages(OrtxRawImages** images, const char** image_paths, size_t num_images,
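
With this change the function finally hands the constructed object back through the out-parameter; previously it returned success while leaving *images untouched. A hedged usage sketch based only on the signature shown in this diff; the header name, the success check, and the OrtxDispose() teardown call are assumptions, not verified against the repo:

#include <cstdint>
#include <vector>

#include "ortx_processor.h"  // assumed header exposing OrtxCreateRawImages

int main() {
  std::vector<uint8_t> bytes = {0xFF, 0xD8};  // placeholder image bytes
  const void* data[1] = {bytes.data()};
  const int64_t sizes[1] = {static_cast<int64_t>(bytes.size())};

  OrtxRawImages* images = nullptr;
  extError_t err = OrtxCreateRawImages(&images, data, sizes, /*num_images=*/1);
  if (err != extError_t{}) {  // success returns extError_t(), i.e. zero, per the diff
    return 1;
  }
  // Before this commit, `images` could still be null here even on success.
  // ... pass `images` to an image processor ...
  OrtxDispose(reinterpret_cast<OrtxObject**>(&images));  // assumed teardown call
  return 0;
}
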
1 change: 0 additions & 1 deletion shared/api/runner.hpp
@@ -447,7 +447,6 @@ class ExecutionPlan {
        status = PrepareInput(*operations_[n + 1], ts_output, ts_input, ts_lookup_table);
      }

-      size_t i = 0;
      for (size_t i = 0; i < ts_output.size(); i++) {
        auto& out_tensor = ts_output[i];
        std::string tensor_name = op->GetOpName() + ":" + std::to_string(i);
13 changes: 10 additions & 3 deletions shared/api/speech_features.hpp
@@ -51,12 +51,16 @@ class SpeechFeatures {
    return stft_norm_.Compute(pcm, n_fft_, hop_length_, {fft_win_.data(), fft_win_.size()}, n_fft_, stft_norm);
  }

-  OrtxStatus SpeechLibSTFTNorm(const ortc::Tensor<float>& pcm, ortc::Tensor<float>& stft_norm) {
+  OrtxStatus SpeechLibSTFTNorm(const ortc::Tensor<float>& pcm,
+                               ortc::Tensor<float>& stft_norm,
+                               ortc::Tensor<int64_t>& audio_frames) {
+    constexpr int64_t feat_stride = 1;
    const float preemphasis = 0.97f;
    // Spec 1: SpeechLib cuts remaining samples insufficient for a hop
    // n_batch = (wav.shape[0] - win_length) // hop_length + 1
    auto pcm_length = pcm.Shape()[1];
    auto n_batch = (pcm_length - frame_length_) / hop_length_ + 1;
+    audio_frames.Allocate({1})[0] = n_batch * feat_stride;
    auto pcm_data = pcm.Data();
    dlib::matrix<float> dm_x = dlib::mat(pcm_data, 1, pcm_length);
@@ -605,13 +609,14 @@
  OrtxStatus Compute(const ortc::Tensor<float>& pcm,
                     const ortc::Tensor<int64_t>& sr,
                     ortc::Tensor<float>& ts_logmel,
+                    ortc::Tensor<int64_t>& audio_frames,
                     ortc::Tensor<int64_t>& embeded_size) {
-
    int64_t sr_val = sr.Data()[0];
    ortc::Tensor<float> stft_norm(&CppAllocator::Instance());
+    ortc::Tensor<int64_t> num_audio_frames(&CppAllocator::Instance());
    SpeechFeatures stft_normal;
    stft_normal.Init(sr_val == 8000? stft_normal_8k_attrs_: stft_normal_attrs_);
-    auto status = stft_normal.SpeechLibSTFTNorm(pcm, stft_norm);
+    auto status = stft_normal.SpeechLibSTFTNorm(pcm, stft_norm, num_audio_frames);
    if (!status.IsOk()) {
      return status;
    }
@@ -639,6 +644,8 @@
    */
    auto embedded_size_data = embeded_size.Allocate({1});
    embedded_size_data[0] = std::ceil(static_cast<float>(ts_logmel.Shape()[0]) / audio_compression_rate_);
+
+    audio_frames.Allocate({1})[0] = num_audio_frames.Data()[0];
    return status;
  }

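
The value written to audio_frames follows the SpeechLib convention in the comment above: integer division drops any tail shorter than a full hop, and feat_stride is 1 in this diff. A standalone arithmetic sketch; the 400-sample window and 160-sample hop are assumed illustrative 16 kHz values (25 ms window / 10 ms hop), not taken from this commit:

#include <cstdint>
#include <iostream>

int main() {
  const int64_t frame_length = 400;  // assumed 25 ms window at 16 kHz
  const int64_t hop_length = 160;    // assumed 10 ms hop at 16 kHz
  const int64_t feat_stride = 1;     // matches the constexpr in the diff

  const int64_t pcm_length = 16000;  // one second of audio
  // Integer division discards the tail that cannot fill a full hop.
  const int64_t n_batch = (pcm_length - frame_length) / hop_length + 1;
  std::cout << n_batch * feat_stride << "\n";  // (16000 - 400) / 160 + 1 = 98
}
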
189 changes: 189 additions & 0 deletions test/data/added-tokens/tokenizer.json
@@ -0,0 +1,189 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32003,
"content": "<|placeholder2|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32004,
"content": "<|placeholder3|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32005,
"content": "<|placeholder4|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32006,
"content": "<|system|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32007,
"content": "<|end|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32008,
"content": "<|placeholder5|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32009,
"content": "<|placeholder6|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32010,
"content": "<|user|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Prepend",
"prepend": ""
},
{
"type": "Replace",
"pattern": {
"String": " "
},
"content": ""
}
]
},
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {}
},
"decoder": {
"type": "Sequence",
"decoders": [
{
"type": "Replace",
"pattern": {
"String": ""
},
"content": " "
},
{
"type": "ByteFallback"
},
{
"type": "Fuse"
},
{
"type": "Strip",
"content": " ",
"start": 1,
"stop": 0
}
]
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": true,
"byte_fallback": true,
"vocab": {
"<unk>": 0,
"<s>": 1,
"</s>": 2,
"<0x0A>": 13
},
"merges": {}
}
}