Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix phi-4 regex pattern handling issue in the tokenizer #905

Merged
merged 2 commits into from
Mar 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 18 additions & 13 deletions operators/tokenizer/bpe_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,14 @@ class PreTokenizerWithRegEx {
return {};
}

// "\s+(?!\S)|\s+)"
// "\s+(?!\S)|\s+"
std::u32string_view Match_GPT2_Pattern_4() {
if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {
size_t i = 1;
for (; i < m_text.size(); ++i) {
if (!IsZ(m_text[i])) break;
}
if ((i > 1) && (i != m_text.size())) { //\s+(?!\S)
if ((i > 1) && (i != m_text.size())) { // ?!\S
i--;
std::u32string_view res = m_text.substr(0, i);
m_text = m_text.substr(i);
Expand Down Expand Up @@ -504,28 +504,29 @@ class PreTokenizerWithRegEx {
OrtxStatus Compile(const std::string& regex) {
// NOTES: to avoid the short pattern shadowing the longer one, the longer pattern should be placed first
auto patterns = std::vector<std::tuple<std::string_view, RegexMatchFunc>>{
{R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
&PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
&PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
&PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
{R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
&PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
{R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
{R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
{R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
{R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
{R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
{R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
{R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
{R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
{R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
{R"([\p{L}]+|[\p{N}])", &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
{R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
{R"(?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
{R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
{R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
{R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
{R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
};

std::string regex_compound = regex;
std::map<size_t, RegexMatchFunc> patterns_map; // using a map for ordered pattern matchers
for (const auto& [pattern, func] : patterns) {
auto pos = regex_compound.find(pattern);
if (pos != std::string::npos) {
Expand All @@ -539,8 +540,9 @@ class PreTokenizerWithRegEx {
continue;
}
}

activated_matchers_.push_back(func);
auto original_pos = regex.find(pattern);
assert(original_pos != std::string::npos);
patterns_map[original_pos] = func;
std::string regex_prefix;
auto pattern_size = pattern.size();
if (pos > 0) { // remove the '|' at the end of the prefix
Expand All @@ -557,6 +559,9 @@ class PreTokenizerWithRegEx {
regex_compound = regex_prefix + regex_compound.substr(pos + pattern_size);
}
}
for (const auto& [_, func] : patterns_map) {
activated_matchers_.push_back(func);
}

if (regex_compound.size() > 0) {
try {
Expand Down
4 changes: 2 additions & 2 deletions test/test_pp_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
is_pp_api_available = False
hf_token_id = None
phi4_model_local_path = None
phi4_model_local_path = "microsoft/Phi-4-multimodal-instruct"
try:
from transformers import AutoImageProcessor, AutoTokenizer
from onnxruntime_extensions import pp_api
Expand Down Expand Up @@ -201,7 +201,7 @@ def test_Qwen_QVQ_tokenizer(self):
def test_Phi4_tokenizer(self):
model_id = phi4_model_local_path
test_sentence = ['<|user|>\n' + self.tokenizer_test_sentence]
hf_enc = AutoTokenizer.from_pretrained(model_id, use_fast=True)
hf_enc = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
inputs = hf_enc(test_sentence)["input_ids"]
tokenizer = pp_api.Tokenizer(model_id)
ortx_inputs = tokenizer.tokenize(test_sentence)
Expand Down