Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…sions into sayanshaw/chat-template
  • Loading branch information
Sayan Shaw committed Mar 5, 2025
2 parents 9955f0a + 4c3ae1b commit 8bf98ef
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 37 deletions.
4 changes: 2 additions & 2 deletions ThirdPartyNotices.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ blingfire 0831265c1aca95ca02eca5bf1155e4251e545328

_____

dlib v19.24.6
dlib v19.24.7

Boost Software License - Version 1.0 - August 17th, 2003

Expand Down Expand Up @@ -122,7 +122,7 @@ Viatcheslav Ostapenko <[email protected]>

_____

nlohmann/json v3.10.5
nlohmann/json v3.11.3

MIT License

Expand Down
2 changes: 1 addition & 1 deletion cgmanifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
"component": {
"type": "git",
"git": {
"commitHash": "v19.24.6",
"commitHash": "v19.24.7",
"repositoryUrl": "https://github.com/davisking/dlib.git"
}
}
Expand Down
4 changes: 2 additions & 2 deletions cmake/externals/dlib.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Declare the dlib dependency for FetchContent: the sources are downloaded as a
# zip archive from GitHub and checked against the given SHA1 hash.
# NOTE(review): this span comes from a rendered diff, so it lists two
# URL/URL_HASH pairs (a commit-pinned archive and the v19.24.7 tag archive);
# presumably only the v19.24.7 pair remains in the committed file -- confirm.
# NOTE(review): SOURCE_SUBDIR not_set presumably names a non-existent directory
# so that dlib's own CMakeLists.txt is not added to this build -- confirm.
FetchContent_Declare(
dlib
URL https://github.com/davisking/dlib/archive/d3520131a6e0e0fb62bc556b0222b45d99caf905.zip
URL_HASH SHA1=26b0eb3063da744a11144ae620b61fd1fb90fb39
URL https://github.com/davisking/dlib/archive/refs/tags/v19.24.7.zip
URL_HASH SHA1=6c63ea576e2b525751b0dead27c6c1139c5100ae
DOWNLOAD_EXTRACT_TIMESTAMP TRUE
SOURCE_SUBDIR not_set
)
Expand Down
7 changes: 5 additions & 2 deletions operators/tokenizer/bpe_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -656,13 +656,16 @@ void JsonFastTokenizer::LoadSpmModelParams(const json& tok_json) {
for (const auto& step : *decoders_node) {
std::string type = step.value("type", "");
if (type == "Replace") {
std::string target = step.value("/pattern/String"_json_pointer, "");
std::string target = "";
if (step.contains("pattern")) {
target = step["pattern"].value("String", "");
}
if (target == spm_escaped_space) {
json_conf_.spm_model_ = true;
}
}
else if (type == "Strip") {
std::string content = step.value("/content"_json_pointer, "");
std::string content = step.value("content", "");
if (content == " ") {
json_conf_.add_dummy_prefix_ = true;
}
Expand Down
1 change: 0 additions & 1 deletion operators/tokenizer/tokenizer_jsconfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ class TokenJsonConfig final {
TokenJsonConfig() {}
~TokenJsonConfig() {}
using json = nlohmann::json;
using json_pointer = nlohmann::json_pointer<std::string>;
std::shared_ptr<json> added_tokens_decoder;

public:
Expand Down
3 changes: 2 additions & 1 deletion shared/api/runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,8 @@ class OrtxRunner {
if (shape != ts[axis]->Shape()) {
is_same_shape = false;
auto dtype = ts[axis]->Type();
if (dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 && dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
if (dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 &&
dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && dtype != ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL) {
return {kOrtxErrorInvalidArgument, "[StackTensors]: shapes of tensors to stack are not the same."};
}
if (IsGreaterShape(ts[axis]->Shape(), shape)) {
Expand Down
25 changes: 11 additions & 14 deletions shared/api/speech_features.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,12 @@ class SpeechFeatures {
return stft_norm_.Compute(pcm, n_fft_, hop_length_, {fft_win_.data(), fft_win_.size()}, n_fft_, stft_norm);
}

OrtxStatus SpeechLibSTFTNorm(const ortc::Tensor<float>& pcm,
ortc::Tensor<float>& stft_norm,
ortc::Tensor<int64_t>& audio_frames) {
constexpr int64_t feat_stride = 1;
OrtxStatus SpeechLibSTFTNorm(const ortc::Tensor<float>& pcm, ortc::Tensor<float>& stft_norm) {
const float preemphasis = 0.97f;
// # Spec 1: SpeechLib cut remaining sample insufficient for a hop
// n_batch = (wav.shape[0] - win_length) // hop_length + 1
auto pcm_length = pcm.Shape()[1];
auto n_batch = (pcm_length - frame_length_) / hop_length_ + 1;
audio_frames.Allocate({1})[0] = n_batch * feat_stride;
auto pcm_data = pcm.Data();
dlib::matrix<float> dm_x = dlib::mat(pcm_data, 1, pcm_length);

Expand Down Expand Up @@ -609,26 +605,24 @@ class Phi4AudioEmbed {
OrtxStatus Compute(const ortc::Tensor<float>& pcm,
const ortc::Tensor<int64_t>& sr,
ortc::Tensor<float>& ts_logmel,
ortc::Tensor<int64_t>& audio_frames,
ortc::Tensor<bool>& audio_attention_mask,
ortc::Tensor<int64_t>& embeded_size) {
int64_t sr_val = sr.Data()[0];
ortc::Tensor<float> stft_norm(&CppAllocator::Instance());
ortc::Tensor<int64_t> num_audio_frames(&CppAllocator::Instance());
SpeechFeatures stft_normal;
stft_normal.Init(sr_val == 8000? stft_normal_8k_attrs_: stft_normal_attrs_);
auto status = stft_normal.SpeechLibSTFTNorm(pcm, stft_norm, num_audio_frames);
auto status = stft_normal.SpeechLibSTFTNorm(pcm, stft_norm);
if (!status.IsOk()) {
return status;
}

SpeechLibLogMel logmel;
// already checked in Init

// Currently we only support 8k and 16k Hz sampling rate.
if (sr_val != 8000 && sr_val != 16000){
return OrtxStatus(kOrtxErrorNotImplemented, "Currently only 8k and 16k Hz sampling rate is supported. Please resample your audio file with unsupported audio sampling rate: " + sr_val);
return {kOrtxErrorInvalidArgument, "Only 8k and 16k Hz target sampling rate is supported."};
}

SpeechLibLogMel logmel;
// attributes already are verified in Init method
logmel.Init(sr_val == 8000 ? logmel_8k_attrs_: logmel_attrs_);
status = logmel.Compute(stft_norm, ts_logmel);
if (!status.IsOk()) {
Expand All @@ -648,10 +642,13 @@ class Phi4AudioEmbed {
return result
*/
auto audio_frames = ts_logmel.Shape()[0];
auto embedded_size_data = embeded_size.Allocate({1});
embedded_size_data[0] = std::ceil(static_cast<float>(ts_logmel.Shape()[0]) / audio_compression_rate_);
embedded_size_data[0] = std::ceil(static_cast<float>(audio_frames) / audio_compression_rate_);

audio_frames.Allocate({1})[0] = num_audio_frames.Data()[0];
constexpr int64_t feat_stride = 1;
auto attention = audio_attention_mask.Allocate({audio_frames * feat_stride});
std::memset(attention, 1, audio_frames * feat_stride * sizeof(bool));
return status;
}

Expand Down
29 changes: 15 additions & 14 deletions test/pp_api_test/test_feature_extraction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,17 @@ TEST(ExtractorTest, TestPhi4AudioFeatureExtraction) {
ASSERT_EQ(std::vector<int64_t>(shape, shape + num_dims), std::vector<int64_t>({3, 1344, 80}));

tensor.reset();
const int64_t* audio_frames{};
const int64_t* audio_frames_shape{};
size_t audio_frames_num_dims;
const bool* audio_attention_mask{};
const int64_t* audio_mask_shape{};
size_t audio_mask_dims;
err = OrtxTensorResultGetAt(result.get(), 1, tensor.ToBeAssigned());
ASSERT_EQ(err, kOrtxOK);
err = OrtxGetTensorData(tensor.get(), reinterpret_cast<const void**>(&audio_frames), &audio_frames_shape, &audio_frames_num_dims);
err = OrtxGetTensorData(tensor.get(), reinterpret_cast<const void**>(&audio_attention_mask), &audio_mask_shape, &audio_mask_dims);
ASSERT_EQ(err, kOrtxOK);
ASSERT_EQ(std::vector<int64_t>(audio_frames_shape, audio_frames_shape + audio_frames_num_dims), std::vector<int64_t>({3, 1}));
const size_t num_elements = std::accumulate(audio_frames_shape, audio_frames_shape + audio_frames_num_dims, 1, std::multiplies<size_t>());
ASSERT_EQ(std::vector<int64_t>(audio_frames, audio_frames + num_elements), std::vector<int64_t>({1098, 1332, 1344}));
ASSERT_EQ(std::vector<int64_t>(audio_mask_shape, audio_mask_shape + audio_mask_dims), std::vector<int64_t>({3, 1344}));
ASSERT_EQ(std::count(audio_attention_mask + 0 * 1344, audio_attention_mask + 1 * 1344, true), 1098);
ASSERT_EQ(std::count(audio_attention_mask + 1 * 1344, audio_attention_mask + 2 * 1344, true), 1332);
ASSERT_EQ(std::count(audio_attention_mask + 2 * 1344, audio_attention_mask + 3 * 1344, true), 1344);

tensor.reset();
err = OrtxTensorResultGetAt(result.get(), 2, tensor.ToBeAssigned());
Expand Down Expand Up @@ -109,16 +110,16 @@ TEST(ExtractorTest, TestPhi4AudioFeatureExtraction8k) {
ASSERT_EQ(std::vector<int64_t>(shape, shape + num_dims), std::vector<int64_t>({1, 2938, 80}));

tensor.reset();
const int64_t* audio_frames{};
const int64_t* audio_frames_shape{};
size_t audio_frames_num_dims;
const bool* audio_attention_mask{};
const int64_t* audio_mask_shape{};
size_t audio_mask_dims{};
err = OrtxTensorResultGetAt(result.get(), 1, tensor.ToBeAssigned());
ASSERT_EQ(err, kOrtxOK);
err = OrtxGetTensorData(tensor.get(), reinterpret_cast<const void**>(&audio_frames), &audio_frames_shape, &audio_frames_num_dims);
err = OrtxGetTensorData(tensor.get(), reinterpret_cast<const void**>(&audio_attention_mask), &audio_mask_shape, &audio_mask_dims);
ASSERT_EQ(err, kOrtxOK);
ASSERT_EQ(std::vector<int64_t>(audio_frames_shape, audio_frames_shape + audio_frames_num_dims), std::vector<int64_t>({1, 1}));
const size_t num_elements = std::accumulate(audio_frames_shape, audio_frames_shape + audio_frames_num_dims, 1, std::multiplies<size_t>());
ASSERT_EQ(std::vector<int64_t>(audio_frames, audio_frames + num_elements), std::vector<int64_t>({2938}));
ASSERT_EQ(std::vector<int64_t>(audio_mask_shape, audio_mask_shape + audio_mask_dims), std::vector<int64_t>({1, 2938}));
const size_t num_elements = std::count(audio_attention_mask, audio_attention_mask + 2938, true);
ASSERT_EQ(num_elements, 2938);

tensor.reset();
err = OrtxTensorResultGetAt(result.get(), 2, tensor.ToBeAssigned());
Expand Down

0 comments on commit 8bf98ef

Please sign in to comment.