Merge branch 'main' of https://github.com/microsoft/onnxruntime-extensions into sayanshaw/chat-template
microsoft · Mar 7, 2025 · 3a93e4a · 3a93e4a
2 parents 52de34f + bfeb3dd
commit 3a93e4a
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 41 deletions.
diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
@@ -445,59 +445,54 @@ stages:
 
     strategy:
       matrix:
+        py312-1201:
+          python.version: '3.12'
+          torch.version: 'torch torchvision torchaudio'
+          ort.version: '1.20.1'
         py312-1192:
           python.version: '3.12'
           torch.version: 'torch torchvision torchaudio'
           ort.version: '1.19.2'
-        py312-1181:
-          python.version: '3.12'
+        py311-1181:
+          python.version: '3.11'
           torch.version: 'torch torchvision torchaudio'
           ort.version: '1.18.1'
-        py311-1171:
-          python.version: '3.11'
+        py310-1171:
+          python.version: '3.10'
           torch.version: 'torch torchvision torchaudio'
           ort.version: '1.17.1'
         py310-1163:
           python.version: '3.10'
           torch.version: 'torch torchvision torchaudio'
           ort.version: '1.16.3'
-        py39-1151:
-          python.version: '3.9'
-          torch.version: 'torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --index-url https://download.pytorch.org/whl/cpu'
-          ort.version: '1.15.1'
 
     steps:
-    - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
-      displayName: Add conda to PATH
-
-    - script: conda create --yes --quiet --name pyenv -c conda-forge python=$(python.version) numpy
-      displayName: Create Anaconda environment
+    - task: UsePythonVersion@0
+      inputs:
+        versionSpec: $(python.version)
+        disableDownloadFromRegistry: true
+        addToPath: true
+        architecture: 'x64'
+      displayName: Use ADO python task
 
     - script: |
-        call activate pyenv
         python -m pip install --upgrade pip
         python -m pip install onnxruntime==$(ort.version)
         python -m pip install -r requirements-dev.txt
       displayName: Install requirements{-dev}.txt and cmake python modules
-      condition: ne(variables['python.version'], '3.12')
 
     - script: |
-        call activate pyenv
         set CMAKE_ARGS=-DOCOS_ONNXRUNTIME_VERSION=$(ort.version)
-        python -m pip install .
+        python -m pip install -v .
       displayName: Build the wheel
 
     - script: |
-        call activate pyenv
         python -m pip install $(torch.version)
       displayName: Install pytorch
-      condition: ne(variables['python.version'], '3.12')
 
     - script: |
-        call activate pyenv
-        pytest test
+        cd test && python -m pytest .
       displayName: Run python test
-      condition: ne(variables['python.version'], '3.12')
 
   #################
   # Windows PyDebug
@@ -509,7 +504,7 @@ stages:
     steps:
     - task: UsePythonVersion@0
       inputs:
-        versionSpec: '3.x'
+        versionSpec: '3.12'
         disableDownloadFromRegistry: true
         addToPath: true
         architecture: 'x64'
@@ -518,10 +513,11 @@ stages:
     - script: |
         python -m pip install --upgrade setuptools pip
         python -m pip install "numpy < 2.0.0"
-        set OCOS_NO_OPENCV=1
-        set OCOS_SCB_DEBUG=1
         python -m pip install -v -e .
       displayName: Build onnxruntime-extensions in editable mode.
+      env:
+        OCOS_NO_OPENCV: 1
+        OCOS_SCB_DEBUG: 1
 
     - script: |
         python -m pip install -r requirements-dev.txt

diff --git a/.pipelines/ci_optional.yml b/.pipelines/ci_optional.yml
@@ -130,7 +130,7 @@ stages:
     steps:
     - task: UsePythonVersion@0
       inputs:
-        versionSpec: '3.x'
+        versionSpec: '3.12'
         disableDownloadFromRegistry: true
         addToPath: true
         architecture: 'x64'

diff --git a/operators/tokenizer/bpe_utils.hpp b/operators/tokenizer/bpe_utils.hpp
@@ -191,14 +191,14 @@ class PreTokenizerWithRegEx {
     return {};
   }
 
-  // "\s+(?!\S)|\s+)"
+  // "\s+(?!\S)|\s+"
   std::u32string_view Match_GPT2_Pattern_4() {
     if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {
       size_t i = 1;
       for (; i < m_text.size(); ++i) {
         if (!IsZ(m_text[i])) break;
       }
-      if ((i > 1) && (i != m_text.size())) {  //\s+(?!\S)
+      if ((i > 1) && (i != m_text.size())) {  // ?!\S
         i--;
         std::u32string_view res = m_text.substr(0, i);
         m_text = m_text.substr(i);
@@ -504,28 +504,29 @@ class PreTokenizerWithRegEx {
   OrtxStatus Compile(const std::string& regex) {
     // NOTES: to avoid the short pattern shadowing the longer one, the longer pattern should be placed first
     auto patterns = std::vector<std::tuple<std::string_view, RegexMatchFunc>>{
-        {R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
-         &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
         {R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
          &PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
         {R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)",
          &PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
+        {R"((?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]))",
+         &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
         {R"((?i:'s|'t|'re|'ve|'m|'ll|'d))", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_1},
-        {R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
-        {R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
-        {R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
+        {R"( ?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
         {R"( ?[^\s\p{L}\p{N}]+[\r\n]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
-        {R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
-        {R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
+        {R"([^\r\n\p{L}\p{N}]?\p{L}+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_2},
+        {R"('s|'t|'re|'ve|'m|'ll|'d)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_1},
         {R"( ?[^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_3},
-        {R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
+        {R"( ?\p{L}+| ?\p{N}+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_2},
         {R"([\p{L}]+|[\p{N}])", &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
         {R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
-        {R"(?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
+        {R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
+        {R"(\p{N}{1,3})", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_3},
+        {R"(\s*[\r\n]+)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_5},
         {R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
     };
 
     std::string regex_compound = regex;
+    std::map<size_t, RegexMatchFunc> patterns_map;  // using map for a ordered pattern matchers
     for (const auto& [pattern, func] : patterns) {
       auto pos = regex_compound.find(pattern);
       if (pos != std::string::npos) {
@@ -539,8 +540,9 @@ class PreTokenizerWithRegEx {
             continue;
           }
         }
-
-        activated_matchers_.push_back(func);
+        auto original_pos = regex.find(pattern);
+        assert(original_pos != std::string::npos);
+        patterns_map[original_pos] = func;
         std::string regex_prefix;
         auto pattern_size = pattern.size();
         if (pos > 0) {  // remove the '|' at the end of the prefix
@@ -557,6 +559,9 @@ class PreTokenizerWithRegEx {
         regex_compound = regex_prefix + regex_compound.substr(pos + pattern_size);
       }
     }
+    for (const auto& [_, func] : patterns_map) {
+      activated_matchers_.push_back(func);
+    }
 
     if (regex_compound.size() > 0) {
       try {

diff --git a/test/test_pp_api.py b/test/test_pp_api.py
@@ -10,7 +10,7 @@
 # os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 is_pp_api_available = False
 hf_token_id = None
-phi4_model_local_path = None
+phi4_model_local_path = "microsoft/Phi-4-multimodal-instruct"
 try:
     from transformers import AutoImageProcessor, AutoTokenizer
     from onnxruntime_extensions import pp_api
@@ -201,7 +201,7 @@ def test_Qwen_QVQ_tokenizer(self):
     def test_Phi4_tokenizer(self):
         model_id = phi4_model_local_path
         test_sentence = ['<|user|>\n' + self.tokenizer_test_sentence]
-        hf_enc = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+        hf_enc = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
         inputs = hf_enc(test_sentence)["input_ids"]
         tokenizer = pp_api.Tokenizer(model_id)
         ortx_inputs = tokenizer.tokenize(test_sentence)