Fix phi-4 regex pattern handling in the tokenizer (#905)
wenbingl authored Mar 7, 2025
1 parent d50723a commit bfeb3dd
Showing 2 changed files with 20 additions and 15 deletions.
31 changes: 18 additions & 13 deletions operators/tokenizer/bpe_utils.hpp
@@ -191,14 +191,14 @@ class PreTokenizerWithRegEx {
     return {};
   }
 
-  // "\s+(?!\S)|\s+)"
+  // "\s+(?!\S)|\s+"
   std::u32string_view Match_GPT2_Pattern_4() {
     if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {
       size_t i = 1;
       for (; i < m_text.size(); ++i) {
         if (!IsZ(m_text[i])) break;
       }
-      if ((i > 1) && (i != m_text.size())) { //\s+(?!\S)
+      if ((i > 1) && (i != m_text.size())) { // ?!\S
         i--;
         std::u32string_view res = m_text.substr(0, i);
         m_text = m_text.substr(i);
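
For context, Match_GPT2_Pattern_4 implements the "\s+(?!\S)|\s+" rule: a whitespace run that is followed by a non-whitespace character gives up its last character, so that character can prefix the next token. Below is a minimal standalone sketch of that back-off, assuming plain ASCII whitespace in place of the tokenizer's Unicode IsZ() class (match_whitespace is an illustrative name, not part of the library):

#include <iostream>
#include <string_view>

// Returns the matched whitespace prefix and advances `text` past it.
std::string_view match_whitespace(std::string_view& text) {
  size_t i = 0;
  while (i < text.size() && (text[i] == ' ' || text[i] == '\t' || text[i] == '\n')) ++i;
  if (i == 0) return {};
  // "\s+(?!\S)": if non-whitespace input follows the run, leave the last
  // whitespace character behind so it can prefix the next token.
  if (i > 1 && i != text.size()) --i;
  std::string_view matched = text.substr(0, i);
  text = text.substr(i);
  return matched;
}

int main() {
  std::string_view text = "   foo";
  std::cout << '[' << match_whitespace(text) << "][" << text << "]\n";  // prints: [  ][ foo]
}
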
@@ -504,28 +504,29 @@ class PreTokenizerWithRegEx {
   OrtxStatus Compile(const std::string& regex) {
     // NOTES: to avoid the short pattern shadowing the longer one, the longer pattern should be placed first
     auto patterns = std::vector<std::tuple<std::string_view, RegexMatchFunc>>{
-        {R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
-         &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
         {R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
          &PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
         {R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
          &PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
+        {R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
+         &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
         {R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
-        {R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
-        {R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
-        {R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
-        {R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
         {R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
-        {R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
+        {R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
+        {R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
+        {R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
         {R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
-        {R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
-        {R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
         {R"([\p{L}]+|[\p{N}])", &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
         {R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
+        {R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
+        {R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
+        {R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
+        {R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
         {R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
     };
 
     std::string regex_compound = regex;
+    std::map<size_t, RegexMatchFunc> patterns_map;  // use a map to keep the pattern matchers ordered
     for (const auto& [pattern, func] : patterns) {
       auto pos = regex_compound.find(pattern);
       if (pos != std::string::npos) {
@@ -539,8 +540,9 @@ class PreTokenizerWithRegEx {
             continue;
           }
         }
-
-        activated_matchers_.push_back(func);
+        auto original_pos = regex.find(pattern);
+        assert(original_pos != std::string::npos);
+        patterns_map[original_pos] = func;
         std::string regex_prefix;
         auto pattern_size = pattern.size();
         if (pos > 0) {  // remove the '|' at the end of the prefix
@@ -557,6 +559,9 @@ class PreTokenizerWithRegEx {
         regex_compound = regex_prefix + regex_compound.substr(pos + pattern_size);
       }
     }
+    for (const auto& [_, func] : patterns_map) {
+      activated_matchers_.push_back(func);
+    }
 
     if (regex_compound.size() > 0) {
       try {
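The net effect of the hunks above: Compile still scans the hardcoded pattern table, but instead of activating matchers in table order it records where each recognized sub-pattern occurs in the model's original regex (patterns_map, keyed by that position) and then activates them in that order. Below is a minimal sketch of the idea, using illustrative stand-in names rather than the real onnxruntime-extensions types:

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  using Matcher = const char*;  // stand-in for RegexMatchFunc
  // Hardcoded table order: GPT2_Pattern_4 deliberately listed first.
  std::vector<std::pair<std::string, Matcher>> patterns = {
      {R"(\s+(?!\S)|\s+)", "GPT2_Pattern_4"},
      {R"('s|'t|'re|'ve|'m|'ll|'d)", "GPT2_Pattern_1"},
  };
  // The model's own regex: the contraction alternative comes first.
  std::string regex = R"('s|'t|'re|'ve|'m|'ll|'d|\s+(?!\S)|\s+)";

  std::map<size_t, Matcher> patterns_map;  // std::map iterates keys in ascending order
  for (const auto& [pattern, func] : patterns) {
    auto pos = regex.find(pattern);
    if (pos == std::string::npos) continue;
    patterns_map[pos] = func;  // key = position in the ORIGINAL regex
  }
  for (const auto& [_, func] : patterns_map) {
    std::cout << func << '\n';  // GPT2_Pattern_1, then GPT2_Pattern_4
  }
}

Because std::map iterates its keys in ascending order, the matchers run in the same order as the alternatives in the model's regex rather than in table order, which is what the Phi-4 pattern handling relies on.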
4 changes: 2 additions & 2 deletions test/test_pp_api.py
@@ -10,7 +10,7 @@
 # os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 is_pp_api_available = False
 hf_token_id = None
-phi4_model_local_path = None
+phi4_model_local_path = "microsoft/Phi-4-multimodal-instruct"
 try:
     from transformers import AutoImageProcessor, AutoTokenizer
     from onnxruntime_extensions import pp_api
@@ -201,7 +201,7 @@ def test_Qwen_QVQ_tokenizer(self):
     def test_Phi4_tokenizer(self):
         model_id = phi4_model_local_path
         test_sentence = ['<|user|>\n' + self.tokenizer_test_sentence]
-        hf_enc = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+        hf_enc = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
         inputs = hf_enc(test_sentence)["input_ids"]
         tokenizer = pp_api.Tokenizer(model_id)
         ortx_inputs = tokenizer.tokenize(test_sentence)
