Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…sions into sayanshaw/chat-template
  • Loading branch information
Sayan Shaw committed Mar 7, 2025
2 parents 52de34f + bfeb3dd commit 3a93e4a
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 41 deletions.
46 changes: 21 additions & 25 deletions .pipelines/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -445,59 +445,54 @@ stages:

strategy:
matrix:
py312-1201:
python.version: '3.12'
torch.version: 'torch torchvision torchaudio'
ort.version: '1.20.1'
py312-1192:
python.version: '3.12'
torch.version: 'torch torchvision torchaudio'
ort.version: '1.19.2'
py312-1181:
python.version: '3.12'
py311-1181:
python.version: '3.11'
torch.version: 'torch torchvision torchaudio'
ort.version: '1.18.1'
py311-1171:
python.version: '3.11'
py310-1171:
python.version: '3.10'
torch.version: 'torch torchvision torchaudio'
ort.version: '1.17.1'
py310-1163:
python.version: '3.10'
torch.version: 'torch torchvision torchaudio'
ort.version: '1.16.3'
py39-1151:
python.version: '3.9'
torch.version: 'torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --index-url https://download.pytorch.org/whl/cpu'
ort.version: '1.15.1'

steps:
- powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
displayName: Add conda to PATH

- script: conda create --yes --quiet --name pyenv -c conda-forge python=$(python.version) numpy
displayName: Create Anaconda environment
- task: UsePythonVersion@0
inputs:
versionSpec: $(python.version)
disableDownloadFromRegistry: true
addToPath: true
architecture: 'x64'
displayName: Use ADO python task

- script: |
call activate pyenv
python -m pip install --upgrade pip
python -m pip install onnxruntime==$(ort.version)
python -m pip install -r requirements-dev.txt
displayName: Install requirements{-dev}.txt and cmake python modules
condition: ne(variables['python.version'], '3.12')
- script: |
call activate pyenv
set CMAKE_ARGS=-DOCOS_ONNXRUNTIME_VERSION=$(ort.version)
python -m pip install .
python -m pip install -v .
displayName: Build the wheel
- script: |
call activate pyenv
python -m pip install $(torch.version)
displayName: Install pytorch
condition: ne(variables['python.version'], '3.12')
- script: |
call activate pyenv
pytest test
cd test && python -m pytest .
displayName: Run python test
condition: ne(variables['python.version'], '3.12')
#################
# Windows PyDebug
Expand All @@ -509,7 +504,7 @@ stages:
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '3.x'
versionSpec: '3.12'
disableDownloadFromRegistry: true
addToPath: true
architecture: 'x64'
Expand All @@ -518,10 +513,11 @@ stages:
- script: |
python -m pip install --upgrade setuptools pip
python -m pip install "numpy < 2.0.0"
set OCOS_NO_OPENCV=1
set OCOS_SCB_DEBUG=1
python -m pip install -v -e .
displayName: Build onnxruntime-extensions in editable mode.
env:
OCOS_NO_OPENCV: 1
OCOS_SCB_DEBUG: 1
- script: |
python -m pip install -r requirements-dev.txt
Expand Down
2 changes: 1 addition & 1 deletion .pipelines/ci_optional.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ stages:
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '3.x'
versionSpec: '3.12'
disableDownloadFromRegistry: true
addToPath: true
architecture: 'x64'
Expand Down
31 changes: 18 additions & 13 deletions operators/tokenizer/bpe_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,14 @@ class PreTokenizerWithRegEx {
return {};
}

// "\s+(?!\S)|\s+)"
// "\s+(?!\S)|\s+"
std::u32string_view Match_GPT2_Pattern_4() {
if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {
size_t i = 1;
for (; i < m_text.size(); ++i) {
if (!IsZ(m_text[i])) break;
}
if ((i > 1) && (i != m_text.size())) { //\s+(?!\S)
if ((i > 1) && (i != m_text.size())) { // ?!\S
i--;
std::u32string_view res = m_text.substr(0, i);
m_text = m_text.substr(i);
Expand Down Expand Up @@ -504,28 +504,29 @@ class PreTokenizerWithRegEx {
OrtxStatus Compile(const std::string& regex) {
// NOTE: to keep a shorter pattern from shadowing a longer one, the longer pattern must be placed first
auto patterns = std::vector<std::tuple<std::string_view, RegexMatchFunc>>{
{R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
&PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
&PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
{R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
&PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
{R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
&PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
{R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
{R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
{R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
{R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
{R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
{R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
{R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
{R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
{R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
{R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
{R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
{R"([\p{L}]+|[\p{N}])", &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
{R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
{R"(?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
{R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
{R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
{R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
{R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
};

std::string regex_compound = regex;
std::map<size_t, RegexMatchFunc> patterns_map; // using map for a ordered pattern matchers
for (const auto& [pattern, func] : patterns) {
auto pos = regex_compound.find(pattern);
if (pos != std::string::npos) {
Expand All @@ -539,8 +540,9 @@ class PreTokenizerWithRegEx {
continue;
}
}

activated_matchers_.push_back(func);
auto original_pos = regex.find(pattern);
assert(original_pos != std::string::npos);
patterns_map[original_pos] = func;
std::string regex_prefix;
auto pattern_size = pattern.size();
if (pos > 0) { // remove the '|' at the end of the prefix
Expand All @@ -557,6 +559,9 @@ class PreTokenizerWithRegEx {
regex_compound = regex_prefix + regex_compound.substr(pos + pattern_size);
}
}
for (const auto& [_, func] : patterns_map) {
activated_matchers_.push_back(func);
}

if (regex_compound.size() > 0) {
try {
Expand Down
4 changes: 2 additions & 2 deletions test/test_pp_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
is_pp_api_available = False
hf_token_id = None
phi4_model_local_path = None
phi4_model_local_path = "microsoft/Phi-4-multimodal-instruct"
try:
from transformers import AutoImageProcessor, AutoTokenizer
from onnxruntime_extensions import pp_api
Expand Down Expand Up @@ -201,7 +201,7 @@ def test_Qwen_QVQ_tokenizer(self):
def test_Phi4_tokenizer(self):
model_id = phi4_model_local_path
test_sentence = ['<|user|>\n' + self.tokenizer_test_sentence]
hf_enc = AutoTokenizer.from_pretrained(model_id, use_fast=True)
hf_enc = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
inputs = hf_enc(test_sentence)["input_ids"]
tokenizer = pp_api.Tokenizer(model_id)
ortx_inputs = tokenizer.tokenize(test_sentence)
Expand Down

0 comments on commit 3a93e4a

Please sign in to comment.