
Commit

Merge branch 'main' into snnn-patch-1
wenbingl authored Feb 27, 2025
2 parents 7adfde4 + f63857d commit 67eff96
Showing 12 changed files with 414 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .pipelines/ci.yml
@@ -740,7 +740,7 @@ stages:
    steps:
    - script: |
        cd $(Build.BinariesDirectory)
-       git clone https://github.com/emscripten-core/emsdk --depth 1 --branch 3.1.74
+       git clone https://github.com/emscripten-core/emsdk --depth 1 --branch 4.0.3
        emsdk/emsdk install latest
        emsdk/emsdk activate latest
      displayName: Setup emscripten pipeline
4 changes: 2 additions & 2 deletions cmake/externals/dlib.cmake
@@ -1,7 +1,7 @@
FetchContent_Declare(
  dlib
- URL https://github.com/davisking/dlib/archive/refs/tags/v19.24.6.zip
- URL_HASH SHA1=59b1fb4e9909697c646e4f74e94871dacf49f0bf
+ URL https://github.com/davisking/dlib/archive/d3520131a6e0e0fb62bc556b0222b45d99caf905.zip
+ URL_HASH SHA1=26b0eb3063da744a11144ae620b61fd1fb90fb39
  DOWNLOAD_EXTRACT_TIMESTAMP TRUE
  SOURCE_SUBDIR not_set
)
26 changes: 26 additions & 0 deletions operators/tokenizer/bpe_kernels.cc
@@ -682,6 +682,32 @@ void JsonFastTokenizer::UpdateTokenizer(const TokenJsonConfig& config, const jso
    }
  }

+  std::shared_ptr<json> added_tokens_decoder = config.added_tokens_decoder;
+
+  // Add any tokens from added_tokens_decoder that are missing from added_tokens_
+  if (added_tokens_decoder && !added_tokens_decoder->empty()) {
+    for (const auto& [id_str, token] : added_tokens_decoder->items()) {
+      int id = std::stoi(id_str);  // convert the key (token ID) from string to integer
+
+      // Check whether this token is already present in added_tokens_
+      auto existing_token = added_tokens_.find(ustring(token.value("content", "")));
+      if (existing_token == added_tokens_.end()) {  // token doesn't exist yet
+        // Prepare a new token, populating its ID from the added_tokens_decoder key
+        AddedToken added_token;
+        added_token.id_ = id;
+        added_token.content_ = token.value("content", "");
+        added_token.lstrip_ = token.value("lstrip", false);
+        added_token.normalized_ = token.value("normalized", false);
+        added_token.rstrip_ = token.value("rstrip", false);
+        added_token.single_word_ = token.value("single_word", false);
+        added_token.special_ = token.value("special", false);
+
+        // Add the new token to added_tokens_
+        added_tokens_.emplace(ustring(added_token.content_), added_token);
+      }
+    }
+  }
+
  // iterate the added_tokens_ map and set the special tokens
  for (const auto& [key, added_token] : added_tokens_) {
    if (added_token.content_ == config.bos_token_) {
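
A minimal standalone sketch of the same merge, using nlohmann::json (already a dependency here) but with a plain std::map and a simplified AddedToken standing in for the repo's internal types; those stand-ins are assumptions for illustration, not the actual classes:

#include <iostream>
#include <map>
#include <string>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

struct AddedToken {  // simplified stand-in for the repo's AddedToken
  int id_ = 0;
  std::string content_;
  bool lstrip_ = false, normalized_ = false, rstrip_ = false;
  bool single_word_ = false, special_ = false;
};

int main() {
  // added_tokens_decoder keys are token IDs serialized as strings.
  json decoder = json::parse(R"({
    "32007": {"content": "<|end|>", "lstrip": false, "normalized": false,
              "rstrip": false, "single_word": false, "special": true}
  })");

  std::map<std::string, AddedToken> added_tokens;  // tokens already known
  for (const auto& [id_str, tok] : decoder.items()) {
    std::string content = tok.value("content", "");
    if (added_tokens.find(content) == added_tokens.end()) {
      AddedToken t;
      t.id_ = std::stoi(id_str);  // string key -> integer ID
      t.content_ = content;
      t.lstrip_ = tok.value("lstrip", false);
      t.normalized_ = tok.value("normalized", false);
      t.rstrip_ = tok.value("rstrip", false);
      t.single_word_ = tok.value("single_word", false);
      t.special_ = tok.value("special", false);
      added_tokens.emplace(content, t);
    }
  }
  std::cout << added_tokens["<|end|>"].id_ << "\n";  // prints 32007
}
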
6 changes: 5 additions & 1 deletion operators/tokenizer/tokenizer_jsconfig.hpp
@@ -46,6 +46,7 @@ class TokenJsonConfig final {
  ~TokenJsonConfig() {}
  using json = nlohmann::json;
  using json_pointer = nlohmann::json_pointer<std::string>;
+  std::shared_ptr<json> added_tokens_decoder;

 public:
  OrtxStatus AppendModuleJson(json& json_config) {
@@ -116,7 +117,7 @@ class TokenJsonConfig final {
        vocab_stream = std::make_unique<std::istringstream>(vocab_str);
      }
    } else {
-      auto ifs = std::make_unique<std::ifstream>(vocab_path_);
+      auto ifs = std::make_unique<std::ifstream>(path(vocab_path_.data()).open());
      if (!ifs->is_open()) {
        return OrtxStatus(extError_t::kOrtxErrorInvalidArgument, vocab_path_ + ": does not exist.");
      }
@@ -191,6 +192,9 @@
      return OrtxStatus(kOrtxErrorInvalidArgument, "Failed to parse config json.");
    }

+    // Store added_tokens_decoder so UpdateTokenizer can add any missed tokens into added_tokens_
+    added_tokens_decoder = std::make_shared<json>(json_config.value("added_tokens_decoder", json::object()));
+
    auto module_cfg = tok_dir / "tokenizer_module.json";
    if (module_cfg.exists()) {
      std::ifstream module_ifs = module_cfg.open();
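
A note on the json::value() accessor used for the new field: it returns the mapped value when the key exists and the supplied default otherwise, so a tokenizer config with no added_tokens_decoder section loads as an empty object instead of throwing. A small standalone sketch (not repo code):

#include <cassert>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
  json with = json::parse(R"({"added_tokens_decoder": {"32007": {"content": "<|end|>"}}})");
  json without = json::parse(R"({"model_max_length": 4096})");

  // value() falls back to the supplied default when the key is absent.
  json a = with.value("added_tokens_decoder", json::object());
  json b = without.value("added_tokens_decoder", json::object());

  assert(a.size() == 1);  // the section was present
  assert(b.empty());      // absent section -> empty object, no exception
}
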
8 changes: 5 additions & 3 deletions shared/api/c_api_processor.cc
@@ -31,21 +31,23 @@ struct RawImagesObject : public OrtxObjectImpl {
  size_t num_images{};
};

-extError_t ORTX_API_CALL
-OrtxCreateRawImages(OrtxRawImages** images, const void* data[], const int64_t sizes[], size_t num_images) {
+extError_t ORTX_API_CALL OrtxCreateRawImages(OrtxRawImages** images, const void* data[], const int64_t sizes[],
+                                             size_t num_images) {
  if (images == nullptr || data == nullptr || sizes == nullptr) {
    ReturnableStatus::last_error_message_ = "Invalid argument";
    return kOrtxErrorInvalidArgument;
  }

  auto images_obj = std::make_unique<RawImagesObject>();
  images_obj->images = std::make_unique<ImageRawData[]>(num_images);
+  images_obj->num_images = num_images;
  for (size_t i = 0; i < num_images; ++i) {
    images_obj->images[i].resize(static_cast<size_t>(sizes[i]));
    std::copy_n(static_cast<const uint8_t*>(data[i]), sizes[i], images_obj->images[i].data());
  }

-  return {};
+  *images = static_cast<OrtxRawImages*>(images_obj.release());
+  return extError_t();
}

extError_t ORTX_API_CALL OrtxLoadImages(OrtxRawImages** images, const char** image_paths, size_t num_images,
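
With this change the function finally hands the constructed object back through the out-parameter; previously it returned success while leaving *images untouched. A hedged usage sketch based only on the signature shown in this diff; the header name, the success check, and the OrtxDispose() teardown call are assumptions, not verified against the repo:

#include <cstdint>
#include <vector>

#include "ortx_processor.h"  // assumed header exposing OrtxCreateRawImages

int main() {
  std::vector<uint8_t> bytes = {0xFF, 0xD8};  // placeholder image bytes
  const void* data[1] = {bytes.data()};
  const int64_t sizes[1] = {static_cast<int64_t>(bytes.size())};

  OrtxRawImages* images = nullptr;
  extError_t err = OrtxCreateRawImages(&images, data, sizes, /*num_images=*/1);
  if (err != extError_t{}) {  // success returns extError_t(), i.e. zero, per the diff
    return 1;
  }
  // Before this commit, `images` could still be null here even on success.
  // ... pass `images` to an image processor ...
  OrtxDispose(reinterpret_cast<OrtxObject**>(&images));  // assumed teardown call
  return 0;
}
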
1 change: 0 additions & 1 deletion shared/api/runner.hpp
@@ -447,7 +447,6 @@ class ExecutionPlan {
        status = PrepareInput(*operations_[n + 1], ts_output, ts_input, ts_lookup_table);
      }

-      size_t i = 0;
      for (size_t i = 0; i < ts_output.size(); i++) {
        auto& out_tensor = ts_output[i];
        std::string tensor_name = op->GetOpName() + ":" + std::to_string(i);
13 changes: 10 additions & 3 deletions shared/api/speech_features.hpp
@@ -51,12 +51,16 @@ class SpeechFeatures {
    return stft_norm_.Compute(pcm, n_fft_, hop_length_, {fft_win_.data(), fft_win_.size()}, n_fft_, stft_norm);
  }

-  OrtxStatus SpeechLibSTFTNorm(const ortc::Tensor<float>& pcm, ortc::Tensor<float>& stft_norm) {
+  OrtxStatus SpeechLibSTFTNorm(const ortc::Tensor<float>& pcm,
+                               ortc::Tensor<float>& stft_norm,
+                               ortc::Tensor<int64_t>& audio_frames) {
+    constexpr int64_t feat_stride = 1;
    const float preemphasis = 0.97f;
    // Spec 1: SpeechLib cuts remaining samples insufficient for a hop
    // n_batch = (wav.shape[0] - win_length) // hop_length + 1
    auto pcm_length = pcm.Shape()[1];
    auto n_batch = (pcm_length - frame_length_) / hop_length_ + 1;
+    audio_frames.Allocate({1})[0] = n_batch * feat_stride;
    auto pcm_data = pcm.Data();
    dlib::matrix<float> dm_x = dlib::mat(pcm_data, 1, pcm_length);
@@ -605,13 +609,14 @@
  OrtxStatus Compute(const ortc::Tensor<float>& pcm,
                     const ortc::Tensor<int64_t>& sr,
                     ortc::Tensor<float>& ts_logmel,
+                    ortc::Tensor<int64_t>& audio_frames,
                     ortc::Tensor<int64_t>& embeded_size) {
-
    int64_t sr_val = sr.Data()[0];
    ortc::Tensor<float> stft_norm(&CppAllocator::Instance());
+    ortc::Tensor<int64_t> num_audio_frames(&CppAllocator::Instance());
    SpeechFeatures stft_normal;
    stft_normal.Init(sr_val == 8000? stft_normal_8k_attrs_: stft_normal_attrs_);
-    auto status = stft_normal.SpeechLibSTFTNorm(pcm, stft_norm);
+    auto status = stft_normal.SpeechLibSTFTNorm(pcm, stft_norm, num_audio_frames);
    if (!status.IsOk()) {
      return status;
    }
@@ -639,6 +644,8 @@
    */
    auto embedded_size_data = embeded_size.Allocate({1});
    embedded_size_data[0] = std::ceil(static_cast<float>(ts_logmel.Shape()[0]) / audio_compression_rate_);
+
+    audio_frames.Allocate({1})[0] = num_audio_frames.Data()[0];
    return status;
  }

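
The value written to audio_frames follows the SpeechLib convention in the comment above: integer division drops any tail shorter than a full hop, and feat_stride is 1 in this diff. A standalone arithmetic sketch; the 400-sample window and 160-sample hop are assumed illustrative 16 kHz values (25 ms window / 10 ms hop), not taken from this commit:

#include <cstdint>
#include <iostream>

int main() {
  const int64_t frame_length = 400;  // assumed 25 ms window at 16 kHz
  const int64_t hop_length = 160;    // assumed 10 ms hop at 16 kHz
  const int64_t feat_stride = 1;     // matches the constexpr in the diff

  const int64_t pcm_length = 16000;  // one second of audio
  // Integer division discards the tail that cannot fill a full hop.
  const int64_t n_batch = (pcm_length - frame_length) / hop_length + 1;
  std::cout << n_batch * feat_stride << "\n";  // (16000 - 400) / 160 + 1 = 98
}
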
189 changes: 189 additions & 0 deletions test/data/added-tokens/tokenizer.json
@@ -0,0 +1,189 @@
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32003,
"content": "<|placeholder2|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32004,
"content": "<|placeholder3|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32005,
"content": "<|placeholder4|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32006,
"content": "<|system|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32007,
"content": "<|end|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32008,
"content": "<|placeholder5|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32009,
"content": "<|placeholder6|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32010,
"content": "<|user|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Prepend",
"prepend": ""
},
{
"type": "Replace",
"pattern": {
"String": " "
},
"content": ""
}
]
},
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {}
},
"decoder": {
"type": "Sequence",
"decoders": [
{
"type": "Replace",
"pattern": {
"String": ""
},
"content": " "
},
{
"type": "ByteFallback"
},
{
"type": "Fuse"
},
{
"type": "Strip",
"content": " ",
"start": 1,
"stop": 0
}
]
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": true,
"byte_fallback": true,
"vocab": {
"<unk>": 0,
"<s>": 1,
"</s>": 2,
"<0x0A>": 13
},
"merges": {}
}
}