Commit

Merge branch 'main' into yguo/fix-one-branch-yml
YUNQIUGUO authored Feb 5, 2024
2 parents 859f408 + d47a3dd commit 3e14cdf
Showing 16 changed files with 174 additions and 58 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
@@ -2,7 +2,7 @@
# turn off readability-braces-around-statements to allow single line statement like 'if (x == y) doSomething();'
Checks: '-*,cppcoreguidelines-*,google-*,readability-*,modernize-*,-readability-braces-around-statements,-google-runtime-references,-cppcoreguidelines-pro-type-reinterpret-cast'
WarningsAsErrors: ''
HeaderFilterRegex: '.*onnxruntime\/core\/.*'
HeaderFilterRegex: 'includes\/.*'
AnalyzeTemporaryDtors: false
FormatStyle: none
CheckOptions:
10 changes: 5 additions & 5 deletions .pipelines/ci.yml
@@ -558,7 +558,7 @@ stages:
name: 'onnxruntime-extensions-Linux-GPU-A10'
timeoutInMinutes: 120
variables:
ORT_VERSION: '1.16.2'
ORT_VERSION: '1.16.3'
TORCH_VERSION: 'torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118'
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
@@ -584,7 +584,7 @@ stages:
userRepository: 'microsoft/onnxruntime'
defaultVersionType: 'specificTag'
version: 'v$(ORT_VERSION)'
itemPattern: '*-linux-x64-$(ORT_VERSION)*'
itemPattern: '*-linux-x64-gpu-$(ORT_VERSION)*'
downloadPath: '$(Build.SourcesDirectory)'
displayName: Download the ONNXRuntime prebuilt package.

@@ -609,7 +609,7 @@ stages:
script: |
docker run --gpus all --rm \
--volume $(Build.SourcesDirectory):/onnxruntime-extensions \
--volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
--volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
-e CUDA_PATH=/usr/local/cuda-11.8 \
onnxruntime-extensionscuda11build \
/bin/bash -c "
@@ -626,7 +626,7 @@ stages:
script: |
docker run --gpus all --rm \
--volume $(Build.SourcesDirectory):/onnxruntime-extensions \
--volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
--volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
-e CUDA_PATH=/usr/local/cuda-11.8 \
onnxruntime-extensionscuda11build \
/bin/bash -c "
@@ -644,7 +644,7 @@ stages:
script: |
docker run --gpus all --rm \
--volume $(Build.SourcesDirectory):/onnxruntime-extensions \
--volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
--volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
-e CUDA_PATH=/usr/local/cuda-11.8 \
onnxruntime-extensionscuda11build \
/bin/bash -c "
1 change: 1 addition & 0 deletions onnxruntime_extensions/_extensions_pydll.pyi
@@ -23,6 +23,7 @@ class PyCustomOpDef:
dt_complex64: int = ...
dt_complex128: int = ...
dt_bfloat16: int = ...

def install_hooker(self, invocation_handler: Callable) -> None:
...
...
55 changes: 41 additions & 14 deletions onnxruntime_extensions/_hf_cvt.py
@@ -9,8 +9,6 @@

import json
import onnx
import uuid
import numpy as np
from numpy import array as nparray
from functools import partial
from collections import namedtuple, OrderedDict
@@ -31,7 +29,8 @@ def convert_bpe_vocab(hf_tokenizer):
# ids = sorted(hf_tokenizer.added_tokens_encoder.values())
# if not ids == list(range(min(ids), max(ids) + 1)):
# raise RuntimeError(f"{hf_tokenizer.__name__}: the ids in added_tokens_encoder are not consecutive")
token_map = [f"{_k}={_v}" for _k, _v in hf_tokenizer.added_tokens_encoder.items()]
token_map = [f"{_k}={_v}" for _k,
_v in hf_tokenizer.added_tokens_encoder.items()]
attrs.update({"added_token": "\n".join(token_map)})

sorted_merges = {v_: k_ for k_, v_ in hf_tokenizer.bpe_ranks.items()}
@@ -42,7 +41,8 @@ def convert_bpe_vocab(hf_tokenizer):
def bpe_tokenizer(self, **kwargs):
hf_gpt2_tokenizer = self.tokenizer
if type(self.tokenizer).__name__.endswith('Fast'):
raise ValueError('Please use the slow version of the tokenizer (ex: GPT2Tokenizer).')
raise ValueError(
'Please use the slow version of the tokenizer (ex: GPT2Tokenizer).')

attrs = self.convert_bpe_vocab(hf_gpt2_tokenizer)
attrs.update(**kwargs)
@@ -51,12 +51,15 @@ def bpe_tokenizer(self, **kwargs):
def bert_tokenizer(self, **kwargs):
hf_bert_tokenizer = self.tokenizer
# has to be sorted since the id of token was generated automatically.
ordered_vocab = OrderedDict(sorted(hf_bert_tokenizer.vocab.items(), key=lambda item: int(item[1])))
ordered_vocab = OrderedDict(
sorted(hf_bert_tokenizer.vocab.items(), key=lambda item: int(item[1])))
vocab = '\n'.join(ordered_vocab.keys())
attrs = dict(vocab=vocab)
init_kwargs = hf_bert_tokenizer.init_kwargs
attrs['do_lower_case'] = 1 if 'do_lower_case' in init_kwargs and init_kwargs.get('do_lower_case') else 0
attrs['strip_accents'] = 1 if 'strip_accents' in init_kwargs and init_kwargs.get('strip_accents') else 0
attrs['do_lower_case'] = 1 if 'do_lower_case' in init_kwargs and init_kwargs.get(
'do_lower_case') else 0
attrs['strip_accents'] = 1 if 'strip_accents' in init_kwargs and init_kwargs.get(
'strip_accents') else 0
attrs.update(**kwargs)
return attrs

@@ -91,7 +94,8 @@ def clip_tokenizer(self, **kwargs):
hf_clip_tokenizer = self.tokenizer

if type(self.tokenizer).__name__.endswith('Fast'):
raise ValueError('Please use the slow version of the tokenizer (ex: CLIPTokenizer).')
raise ValueError(
'Please use the slow version of the tokenizer (ex: CLIPTokenizer).')

attrs = self.convert_bpe_vocab(hf_clip_tokenizer)
attrs.update(**kwargs)
@@ -101,7 +105,8 @@ def roberta_tokenizer(self, **kwargs):
hf_roberta_tokenizer = self.tokenizer

if type(self.tokenizer).__name__.endswith('Fast'):
raise ValueError('Please use the slow version of the tokenizer (ex: RobertaTokenizer).')
raise ValueError(
'Please use the slow version of the tokenizer (ex: RobertaTokenizer).')

attrs = self.convert_bpe_vocab(hf_roberta_tokenizer)
attrs.update(**kwargs)
@@ -133,7 +138,7 @@ def spm_decoder(self, **kwargs):
"DistilBertTokenizer": TokenOpParam('BertTokenizer', HFTokenizerConverter.bert_tokenizer,
'BertDecoder', HFTokenizerConverter.bpe_decoder, None),
"GPT2Tokenizer": TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
"CodeGenTokenizer": TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
"CLIPTokenizer": TokenOpParam('CLIPTokenizer', HFTokenizerConverter.clip_tokenizer,
@@ -167,7 +172,8 @@ class HFTokenizerOnnxGraph:

@staticmethod
def extract_cls_name(processor):
cls_name = processor if isinstance(processor, str) else type(processor).__name__
cls_name = processor if isinstance(
processor, str) else type(processor).__name__
if cls_name.endswith("TokenizerFast"):
cls_name = cls_name[:-len("Fast")]
return cls_name
@@ -184,6 +190,8 @@ def __init__(self, processor, **kwargs):

def pre_processing(self, **kwargs):
with_default_inputs = kwargs.pop("WITH_DEFAULT_INPUTS", True)
cast_token_id = kwargs.pop("CAST_TOKEN_ID", False)

_cvt_op = self.cvt_quadruple.pre_op
_cvt_func = self.cvt_quadruple.pre_attribute_cvt
cvt = partial(_cvt_func, self.cvt_obj)
@@ -200,22 +208,41 @@ def pre_processing(self, **kwargs):
if self.cvt_quadruple.default_inputs is not None:
default_inputs.update(self.cvt_quadruple.default_inputs)
if len(default_inputs) != n_inputs:
raise ValueError("Op: {} does have the inputs from its TokenOpParam.".format(_cvt_op))
raise ValueError(
"Op: {} does not have the inputs from its TokenOpParam.".format(_cvt_op))

new_initializers = []

for k, v in default_inputs.items():
input_value_info = next((i for i in g.input if i.name == k), None)
if input_value_info is None:
raise ValueError("The input {} is not found in the graph".format(k))
raise ValueError(
"The input {} is not found in the graph".format(k))

np_dtype = onnx.helper.tensor_dtype_to_np_dtype(input_value_info.type.tensor_type.elem_type)
np_dtype = onnx.helper.tensor_dtype_to_np_dtype(
input_value_info.type.tensor_type.elem_type)
value = nparray(v, np_dtype)
new_initializers.append(onnx.numpy_helper.from_array(value, k))
g.initializer.extend(new_initializers)
new_inputs = [i for i in g.input if i.name not in default_inputs]
g.ClearField("input")
g.input.extend(new_inputs)

if cast_token_id:
# assume the first output is always the token ID.
if g.output[0].type.tensor_type.elem_type != onnx.onnx_pb.TensorProto.INT64:
new_output_name = g.output[0].name + '_cast'
shape = g.output[0].type.tensor_type.shape
cast_node = onnx.helper.make_node('Cast', [g.output[0].name], [new_output_name],
to=onnx.onnx_pb.TensorProto.INT64)
new_output = [onnx.helper.make_tensor_value_info(
new_output_name, onnx.onnx_pb.TensorProto.INT64, None)] + list(g.output)[1:]
if shape is not None:
new_output[0].type.tensor_type.shape.CopyFrom(shape)
g.node.append(cast_node)
g.ClearField('output')
g.output.extend(new_output)

return g

def post_processing(self, **kwargs):
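The new CAST_TOKEN_ID branch above routes the graph's first output through a Cast node so downstream consumers always receive int64 token IDs. As a standalone illustration of the same ONNX graph-surgery pattern (a minimal sketch with hypothetical names, not code from this commit):

    import onnx
    from onnx import TensorProto, helper

    def cast_first_output_to_int64(graph: onnx.GraphProto) -> None:
        # Route the first graph output through a Cast node so consumers see int64.
        old = graph.output[0]
        if old.type.tensor_type.elem_type == TensorProto.INT64:
            return
        cast_name = old.name + "_cast"
        graph.node.append(
            helper.make_node("Cast", [old.name], [cast_name], to=TensorProto.INT64))
        new_out = helper.make_tensor_value_info(cast_name, TensorProto.INT64, None)
        new_out.type.tensor_type.shape.CopyFrom(old.type.tensor_type.shape)
        remaining = list(graph.output)[1:]
        graph.ClearField("output")
        graph.output.extend([new_out] + remaining)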
23 changes: 16 additions & 7 deletions onnxruntime_extensions/cvt.py
@@ -41,6 +41,8 @@ def gen_processing_models(processor: Union[str, object],
has to be provided in the kwargs
pre_kwargs: dict
Keyword arguments for generating the pre-processing model
WITH_DEFAULT_INPUTS: bool, add default inputs to the graph, default is True
CAST_TOKEN_ID: bool, add a cast op to output token IDs to be int64 if needed, default is False
post_kwargs: dict
Keyword arguments for generating the post-processing model
opset: int
@@ -54,7 +56,8 @@
The pre- and post-processing ONNX models
"""
if pre_kwargs is None and post_kwargs is None:
raise ValueError("Either pre_kwargs or post_kwargs should be provided. None means no processing")
raise ValueError(
"Either pre_kwargs or post_kwargs should be provided. None means no processing graph output.")
if isinstance(processor, str):
g_pre, g_post = (None, None)
if pre_kwargs:
@@ -64,23 +67,29 @@
cls_name = processor
else:
if processor not in _PRE_POST_PAIR:
raise RuntimeError(f"Cannot locate the post processing operator name from {processor}")
raise RuntimeError(
f"Cannot locate the post processing operator name from {processor}")
cls_name = _PRE_POST_PAIR[processor]
g_post = SingleOpGraph.build_graph(cls_name, **post_kwargs)
return make_onnx_model(g_pre) if g_pre else None, make_onnx_model(g_post) if g_post else None

cls_name = type(processor).__name__
if cls_name == "WhisperProcessor":
if WhisperDataProcGraph is None:
raise ValueError("The Whisper processor needs torch.onnx support, please install pytorch 2.0 and above")
raise ValueError(
"The Whisper processor needs torch.onnx support, please install pytorch 2.0 and above")
_converter = WhisperDataProcGraph(processor, opset=opset, **kwargs)
pre_m = _converter.pre_processing(**pre_kwargs) if pre_kwargs is not None else None
post_m = _converter.post_processing(**post_kwargs) if post_kwargs is not None else None
pre_m = _converter.pre_processing(
**pre_kwargs) if pre_kwargs is not None else None
post_m = _converter.post_processing(
**post_kwargs) if post_kwargs is not None else None
return pre_m, post_m
elif HFTokenizerOnnxGraph.is_supported(processor):
_converter = HFTokenizerOnnxGraph(processor)
pre_g = _converter.pre_processing(**pre_kwargs) if pre_kwargs is not None else None
post_g = _converter.post_processing(**post_kwargs) if post_kwargs is not None else None
pre_g = _converter.pre_processing(
**pre_kwargs) if pre_kwargs is not None else None
post_g = _converter.post_processing(
**post_kwargs) if post_kwargs is not None else None
return make_onnx_model(pre_g) if pre_g else None, \
make_onnx_model(post_g) if post_g else None
else:
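The two pre-processing kwargs documented above can be exercised through gen_processing_models; a minimal usage sketch, assuming a slow Hugging Face tokenizer is available locally (the model name and output path are illustrative, not part of this commit):

    import onnx
    from transformers import GPT2Tokenizer
    from onnxruntime_extensions import gen_processing_models

    # A slow tokenizer, as required by the ValueError checks in _hf_cvt.py.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    pre_model, _ = gen_processing_models(
        tokenizer,
        pre_kwargs={"WITH_DEFAULT_INPUTS": True, "CAST_TOKEN_ID": True})
    onnx.save(pre_model, "gpt2_tokenizer.onnx")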
18 changes: 12 additions & 6 deletions onnxruntime_extensions/util.py
@@ -112,11 +112,13 @@ def remove_unused_initializers(subgraph, top_level_initializers=None):
all_initializers = initializers + top_level_initializers

# Filter the initializers by checking if their names are in the list of used input tensors
used_initializers = [init for init in all_initializers if init.name in input_tensors]
used_initializers = [
init for init in all_initializers if init.name in input_tensors]

# Update the subgraph's initializers
del subgraph.initializer[:]
subgraph.initializer.extend([init for init in used_initializers if init in initializers])
subgraph.initializer.extend(
[init for init in used_initializers if init in initializers])

# Recursively process subgraphs within this subgraph
for node in nodes:
@@ -125,7 +127,8 @@ def remove_unused_initializers(subgraph, top_level_initializers=None):
remove_unused_initializers(attr.g, top_level_initializers)
elif attr.type == onnx.AttributeProto.GRAPHS:
for subgraph in attr.graphs:
remove_unused_initializers(subgraph, top_level_initializers)
remove_unused_initializers(
subgraph, top_level_initializers)


def quick_merge(*models, connection_indices=None):
@@ -150,12 +153,14 @@ def quick_merge(*models, connection_indices=None):
merged_graph = models[0].graph

# Dictionary to store unique opsets
opset_imports = {opset.domain if opset.domain else "ai.onnx": opset for opset in models[0].opset_import}
opset_imports = {
opset.domain if opset.domain else "ai.onnx": opset for opset in models[0].opset_import}

# Iterate over all other models and merge
for model_idx, model in enumerate(models[1:], start=1):
if connection_indices is None:
io_map = [(out.name, in_.name) for out, in_ in zip(models[model_idx - 1].graph.output, model.graph.input)]
io_map = [(out.name, in_.name) for out, in_ in zip(
models[model_idx - 1].graph.output, model.graph.input)]
else:
io_map = [(models[model_idx - 1].graph.output[out_idx].name, model.graph.input[in_idx].name)
for out_idx, in_idx in connection_indices[model_idx - 1]]
@@ -174,7 +179,8 @@

default_opset = opset_imports.pop("ai.onnx", None)
merged_model = onnx.helper.make_model_gen_version(merged_graph,
opset_imports=[default_opset],
opset_imports=[
default_opset],
producer_name='ONNX Model Merger')
merged_model.opset_import.extend(opset_imports.values())
return merged_model
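A hedged usage sketch for quick_merge above, chaining a tokenizer model into a core model by positional output-to-input matching (the file names are illustrative, not part of this commit):

    import onnx
    from onnxruntime_extensions.util import quick_merge

    pre = onnx.load("gpt2_tokenizer.onnx")   # e.g. the pre-processing model sketched earlier
    core = onnx.load("gpt2_core.onnx")
    # With connection_indices=None, the outputs of each model are zipped to the
    # inputs of the next model by position.
    merged = quick_merge(pre, core)
    onnx.save(merged, "gpt2_end_to_end.onnx")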
File renamed without changes.
45 changes: 23 additions & 22 deletions operators/tokenizer/sentencepiece_tokenizer.cc
@@ -83,32 +83,33 @@ OrtStatusPtr KernelSentencepieceTokenizer::Compute(const ortc::Tensor<std::strin
content.push_back(tokenizer_.eos_id());
token_indices.push_back(ort_extensions::narrow<int32_t>(str_input[i].length()));
}

if (fairseq.has_value() && (*fairseq)) {
// HF Fairseq Example (XLMRobertaTokenizer) : https://huggingface.co/transformers/v4.6.0/_modules/transformers/models/xlm_roberta/tokenization_xlm_roberta.html#XLMRobertaTokenizer
//
// Original fairseq vocab and spm vocab must be "aligned":
// Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
// -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
// fairseq | '<s>' | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's' | '▁de' | '-'
// spm | '<unk>' | '<s>' | '</s>' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a'
//
// As per HF, the first "real" token "," has position 4 in the XLMRobertaTokenizer vocab and position
// 3 in the SPM vocab, so we add a padding value of 1 to IDs, and fix exceptions for '<unk>' and '<s>'.
std::for_each(content.begin(), content.end(), [](int& n) {
if (n == 0) { // '<unk>': 0 -> 3
n = 3;
} else if (n == 1) { // '<s>': 1 -> 0
n = 0;
} else if (n != 2) { // '</s>': 2 -> 2, '<*>': x -> x + 1
n++;
}
});
}
}
}
instance_indices.push_back(content.size());

// Patch fairseq indices
if (fairseq.has_value() && (*fairseq) && !add_rev) {
// HF Fairseq Example (XLMRobertaTokenizer) : https://huggingface.co/transformers/v4.6.0/_modules/transformers/models/xlm_roberta/tokenization_xlm_roberta.html#XLMRobertaTokenizer
//
// Original fairseq vocab and spm vocab must be "aligned":
// Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
// -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
// fairseq | '<s>' | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's' | '▁de' | '-'
// spm | '<unk>' | '<s>' | '</s>' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a'
//
// As per HF, the first "real" token "," has position 4 in the XLMRobertaTokenizer vocab and position
// 3 in the SPM vocab, so we add a padding value of 1 to IDs, and fix exceptions for '<unk>' and '<s>'.
std::for_each(content.begin(), content.end(), [](int& n) {
if (n == 0) { // '<unk>': 0 -> 3
n = 3;
} else if (n == 1) { // '<s>': 1 -> 0
n = 0;
} else if (n != 2) { // '</s>': 2 -> 2, '<*>': x -> x + 1
n++;
}
});
}

// Setup output
std::vector<int64_t> size_content(1);
size_content[0] = content.size();
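The relocated fairseq patch applies a fixed remapping from SPM IDs to XLMRobertaTokenizer (fairseq-style) IDs; an equivalent Python sketch of that mapping, for illustration only:

    def spm_id_to_fairseq_id(n: int) -> int:
        # '<unk>' 0 -> 3, '<s>' 1 -> 0, '</s>' 2 -> 2, every other ID shifts by +1.
        if n == 0:
            return 3
        if n == 1:
            return 0
        if n == 2:
            return 2
        return n + 1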
1 change: 0 additions & 1 deletion setup.cfg
@@ -1,3 +1,2 @@
[build]
build_base = .scb
# debug = 1
Binary file added test/data/cuda/test_fastgelu.onnx
Binary file added test/data/cuda/test_fastgelu_f16.onnx
Binary file added test/data/cuda/test_negpos.onnx
3 changes: 2 additions & 1 deletion test/shared_test/test_kernel.hpp
@@ -52,6 +52,7 @@ void RunSession(Ort::Session& session_object,
void TestInference(Ort::Env& env, const ORTCHAR_T* model_uri,
const std::vector<TestValue>& inputs,
const std::vector<TestValue>& outputs,
OutputValidator output_validator = nullptr);
OutputValidator output_validator = nullptr,
void* cuda_compute_stream = nullptr);

void GetTensorMutableDataString(const OrtApi& api, const OrtValue* value, std::vector<std::string>& output);