From cf81a9d80abcff697dbf3060a2f031689986e653 Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Thu, 6 Mar 2025 01:17:56 +0000
Subject: [PATCH] Fix phi-4 regex pattern handling in the tokenizer

---
 operators/tokenizer/bpe_utils.hpp | 31 ++++++++++++++++++-------------
 test/test_pp_api.py               |  4 ++--
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/operators/tokenizer/bpe_utils.hpp b/operators/tokenizer/bpe_utils.hpp
index 42727459..6e75c603 100644
--- a/operators/tokenizer/bpe_utils.hpp
+++ b/operators/tokenizer/bpe_utils.hpp
@@ -191,14 +191,14 @@ class PreTokenizerWithRegEx {
     return {};
   }
 
-  // "\s+(?!\S)|\s+)"
+  // "\s+(?!\S)|\s+"
   std::u32string_view Match_GPT2_Pattern_4() {
     if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {
       size_t i = 1;
       for (; i < m_text.size(); ++i) {
         if (!IsZ(m_text[i])) break;
       }
-      if ((i > 1) && (i != m_text.size())) {  // \s+(?!\S)
+      if ((i > 1) && (i != m_text.size())) {  // ?!\S
         i--;
         std::u32string_view res = m_text.substr(0, i);
         m_text = m_text.substr(i);
@@ -504,28 +504,29 @@ class PreTokenizerWithRegEx {
   OrtxStatus Compile(const std::string& regex) {
     // NOTES: to avoid the short pattern shadowing the longer one, the longer pattern should be placed first
     auto patterns = std::vector<std::pair<std::string, RegexMatchFunc>>{
-        {R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
-         &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
         {R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
          &PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
         {R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
          &PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
+        {R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
+         &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
         {R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
-        {R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
-        {R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
-        {R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
+        {R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
         {R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
-        {R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
-        {R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
+        {R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
+        {R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
         {R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
-        {R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
+        {R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
         {R"([\p{L}]+|[\p{N}])", &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
         {R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
-        {R"(?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
+        {R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
+        {R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
+        {R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
         {R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
     };
 
     std::string regex_compound = regex;
+    std::map<size_t, RegexMatchFunc> patterns_map;  // using an ordered map to keep matchers in original-regex order
     for (const auto& [pattern, func] : patterns) {
       auto pos = regex_compound.find(pattern);
       if (pos != std::string::npos) {
@@ -539,8 +540,9 @@ class PreTokenizerWithRegEx {
           continue;
         }
       }
-
-      activated_matchers_.push_back(func);
+      auto original_pos = regex.find(pattern);
+      assert(original_pos != std::string::npos);
+      patterns_map[original_pos] = func;
       std::string regex_prefix;
       auto pattern_size = pattern.size();
       if (pos > 0) {  // remove the '|' at the end of the prefix
@@ -557,6 +559,9 @@ class PreTokenizerWithRegEx {
         regex_compound = regex_prefix + regex_compound.substr(pos + pattern_size);
       }
     }
+    for (const auto& [_, func] : patterns_map) {
+      activated_matchers_.push_back(func);
+    }
 
     if (regex_compound.size() > 0) {
       try {
diff --git a/test/test_pp_api.py b/test/test_pp_api.py
index fdd82a66..59575619 100644
--- a/test/test_pp_api.py
+++ b/test/test_pp_api.py
@@ -10,7 +10,7 @@
 # os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 is_pp_api_available = False
 hf_token_id = None
-phi4_model_local_path = None
+phi4_model_local_path = "microsoft/Phi-4-multimodal-instruct"
 try:
     from transformers import AutoImageProcessor, AutoTokenizer
     from onnxruntime_extensions import pp_api
@@ -201,7 +201,7 @@ def test_Qwen_QVQ_tokenizer(self):
     def test_Phi4_tokenizer(self):
         model_id = phi4_model_local_path
         test_sentence = ['<|user|>\n' + self.tokenizer_test_sentence]
-        hf_enc = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+        hf_enc = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
         inputs = hf_enc(test_sentence)["input_ids"]
         tokenizer = pp_api.Tokenizer(model_id)
         ortx_inputs = tokenizer.tokenize(test_sentence)
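
Note: the core of this fix is `patterns_map`, keyed by each sub-pattern's offset in the original regex. Candidate patterns are still probed in the hand-ordered list (longer patterns first, so a short pattern cannot shadow a longer one that contains it), but the matchers are now activated in the order their sub-patterns appear in the tokenizer's own compound regex, which is what phi-4's pattern ordering requires. The following is a minimal standalone sketch of that technique, not the library code: the `Matcher` alias and the sample compound regex are illustrative stand-ins for the member-function pointers in bpe_utils.hpp, and the '|'-stripping is simplified relative to the patch.

#include <cassert>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Stand-in for the PreTokenizerWithRegEx member-function pointers.
  using Matcher = const char*;

  // Candidate sub-patterns, longest first, so a shorter pattern cannot
  // match inside (and corrupt) a longer one still present in the compound.
  const std::vector<std::pair<std::string, Matcher>> patterns = {
      {R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", "LLAMA3_Pattern_4"},
      {R"( ?[^\s\p{L}\p{N}]+)", "GPT2_Pattern_3"},
      {R"(\s+(?!\S)|\s+)", "GPT2_Pattern_4"},
  };

  // A compound regex as it might arrive from a tokenizer config; the GPT-2
  // whitespace alternative appears BEFORE the punctuation alternative.
  const std::string regex = R"(\s+(?!\S)|\s+| ?[^\s\p{L}\p{N}]+[\r\n]*)";

  std::string regex_compound = regex;
  std::map<size_t, Matcher> patterns_map;  // key: offset in the ORIGINAL regex
  for (const auto& [pattern, matcher] : patterns) {
    auto pos = regex_compound.find(pattern);
    if (pos == std::string::npos) continue;
    // Record the matcher under the pattern's offset in the original string,
    // not in the shrinking regex_compound, so the activation order mirrors
    // the source regex.
    auto original_pos = regex.find(pattern);
    assert(original_pos != std::string::npos);
    patterns_map[original_pos] = matcher;
    // Strip the recognized alternative plus one adjoining '|'.
    std::string prefix = regex_compound.substr(0, pos);
    if (!prefix.empty() && prefix.back() == '|') prefix.pop_back();
    auto tail = pos + pattern.size();
    if (prefix.empty() && tail < regex_compound.size() &&
        regex_compound[tail] == '|') {
      ++tail;
    }
    regex_compound = prefix + regex_compound.substr(tail);
  }

  // std::map iterates in key order, i.e. original-regex order: this prints
  // GPT2_Pattern_4 (offset 0) before LLAMA3_Pattern_4 (offset 14), even
  // though the candidate list probed LLAMA3_Pattern_4 first.
  for (const auto& [offset, matcher] : patterns_map) {
    std::cout << offset << ": " << matcher << "\n";
  }
  return 0;
}

With the pre-patch `activated_matchers_.push_back(func)` inside the probe loop, the matchers were activated in candidate-list order instead, which broke compound regexes, such as phi-4's, whose alternatives are ordered differently from the probe list.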