From 43994eb34a1b0cd0df6c7626e4ae771a0f618c00 Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Mon, 8 May 2023 11:37:54 -0700 Subject: [PATCH 01/17] Fix the unit test failure with ONNX 1.14 package. (#428) * Fix the unit test failure with ONNX 1.14 package. * more tests * Update whisper_e2e.py --- onnxruntime_extensions/cmd.py | 21 +++++++- onnxruntime_extensions/pnp/_base.py | 5 ++ onnxruntime_extensions/pnp/_onnx_ops.py | 2 + onnxruntime_extensions/pnp/_utils.py | 28 +++++++---- onnxruntime_extensions/util.py | 64 +++++++++++++++++++++++++ tutorials/whisper_e2e.py | 6 +-- 6 files changed, 113 insertions(+), 13 deletions(-) diff --git a/onnxruntime_extensions/cmd.py b/onnxruntime_extensions/cmd.py index 68c0d4ba3..d7f92f642 100644 --- a/onnxruntime_extensions/cmd.py +++ b/onnxruntime_extensions/cmd.py @@ -1,5 +1,5 @@ import os -import fire +import argparse import onnx import numpy @@ -36,5 +36,22 @@ def selfcheck(self, *args): print("The extensions loaded, status: OK.") +def main(): + parser = argparse.ArgumentParser(description="ORT Extension commands") + parser.add_argument("command", choices=["run", "selfcheck"]) + parser.add_argument("--model", default="model.onnx", help="Path to the ONNX model file") + parser.add_argument("--testdata-dir", help="Path to the test data directory") + parser.add_argument("args", nargs=argparse.REMAINDER, help="Additional arguments") + + args = parser.parse_args() + + ort_commands = ORTExtCommands(model=args.model, testdata_dir=args.testdata_dir) + + if args.command == "run": + ort_commands.run(*args.args) + elif args.command == "selfcheck": + ort_commands.selfcheck(*args.args) + + if __name__ == '__main__': - fire.Fire(ORTExtCommands) + main() diff --git a/onnxruntime_extensions/pnp/_base.py b/onnxruntime_extensions/pnp/_base.py index 00f986c8f..b70db9078 100644 --- a/onnxruntime_extensions/pnp/_base.py +++ b/onnxruntime_extensions/pnp/_base.py @@ -5,6 +5,8 @@ from onnx.onnx_pb import TensorProto from torch.onnx import TrainingMode, export as _export +from ._onnx_ops import OPSET_TO_IR_VERSION + def _export_f(model, *args, opset_version=None, @@ -32,6 +34,9 @@ def _export_f(model, *args, custom_opsets=custom_opsets) mdl = onnx.load_model(io.BytesIO(f.getvalue())) + for ops in mdl.opset_import: + if ops.domain in ('', 'ai.onnx'): + mdl.ir_version = OPSET_TO_IR_VERSION[ops.version] if output_path is not None: if output_seq > 0: output_path.replace('.onnx', '.{}.onnx'.format(output_seq)) diff --git a/onnxruntime_extensions/pnp/_onnx_ops.py b/onnxruntime_extensions/pnp/_onnx_ops.py index a10a54b71..5dfbaa59e 100644 --- a/onnxruntime_extensions/pnp/_onnx_ops.py +++ b/onnxruntime_extensions/pnp/_onnx_ops.py @@ -15,6 +15,8 @@ 7: 3, 8: 3, 9: 4, 10: 5, 11: 6, 12: 7, 13: 7, 14: 7, 15: 8, 16: 8, 17: 8 } +if hasattr(helper, 'VERSION_TABLE'): + OPSET_TO_IR_VERSION = {row[2]: row[1] for row in helper.VERSION_TABLE} def _get_main_opset_version(model): diff --git a/onnxruntime_extensions/pnp/_utils.py b/onnxruntime_extensions/pnp/_utils.py index 31e2f1400..6be09b643 100644 --- a/onnxruntime_extensions/pnp/_utils.py +++ b/onnxruntime_extensions/pnp/_utils.py @@ -1,6 +1,6 @@ import copy import onnx -from onnx import numpy_helper +from onnx import helper, numpy_helper from collections import namedtuple @@ -271,20 +271,32 @@ def join_models(cls, *models, io_mapping=None): del _n.input[:] _n.input.extend([port_mapping[_i] if _i in port_mapping else _i for _i in new_input]) - name = '' + name = "_".join([_mdl.graph.name for _mdl in 
models]) domains = set() _opset = [] for _mdl in models: for _ops in _mdl.opset_import: - if _ops.domain not in domains: - domains.update([_ops.domain]) - _opset.append(_ops) - name = name + '_' + _mdl.graph.name if name else _mdl.graph.name + domain = _ops.domain if _ops.domain else "ai.onnx" + if domain in domains: + if domain == "ai.onnx": + assert _ops.version == _opset[0].version, \ + f"ai.onnx domain version doesn't match {_ops.version} != {_opset[0].version}" + else: + domains.add(domain) + if domain == "ai.onnx": + _opset.insert(0, _ops) + else: + _opset.append(_ops) inits = cls._remove_unused_initializers(nodes, container.initializer) - helper = onnx.helper g = helper.make_graph(nodes, name, inputs, outputs, initializer=inits, value_info=container.value_info) - m = helper.make_model(g, opset_imports=_opset) + + if hasattr(helper, 'make_model_gen_version'): + # make_model_gen_version doesn't accept the custom domain. + m = helper.make_model_gen_version(g, opset_imports=_opset[:1]) + m.opset_import.extend(_opset[1:]) + else: + m = helper.make_model(g, opset_imports=_opset) return m diff --git a/onnxruntime_extensions/util.py b/onnxruntime_extensions/util.py index 00fb49c21..b62bb320f 100644 --- a/onnxruntime_extensions/util.py +++ b/onnxruntime_extensions/util.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +import onnx import pathlib import inspect @@ -57,3 +58,66 @@ def mel_filterbank( energy_norm = 2.0 / (mel_bins[2 : n_mels + 2] - mel_bins[:n_mels]) fbank *= energy_norm[:, np.newaxis] return fbank + + +def remove_unused_constants(subgraph): + nodes = [_n for _n in subgraph.node] + + # Find the names of all input tensors for all nodes in the subgraph + input_tensors = set() + for node in nodes: + for input_name in node.input: + input_tensors.add(input_name) + + # Remove Constant nodes whose output is not used by any other nodes + nodes_to_remove = [] + for node in nodes: + if node.op_type == 'Constant': + output_name = node.output[0] + if output_name not in input_tensors: + nodes_to_remove.append(node) + + for node in nodes_to_remove: + subgraph.node.remove(node) + + # Recursively process subgraphs within this subgraph + for node in nodes: + for attr in node.attribute: + if attr.type == onnx.AttributeProto.GRAPH: + remove_unused_constants(attr.g) + elif attr.type == onnx.AttributeProto.GRAPHS: + for subgraph in attr.graphs: + remove_unused_constants(subgraph) + + +def remove_unused_initializers(subgraph, top_level_initializers=None): + if top_level_initializers is None: + top_level_initializers = [] + remove_unused_constants(subgraph) + initializers = [_i for _i in subgraph.initializer] + nodes = subgraph.node + + # Find the names of all input tensors for all nodes in the subgraph + input_tensors = set() + for node in nodes: + for input_name in node.input: + input_tensors.add(input_name) + + # Combine top-level and current subgraph initializers + all_initializers = initializers + top_level_initializers + + # Filter the initializers by checking if their names are in the list of used input tensors + used_initializers = [init for init in all_initializers if init.name in input_tensors] + + # Update the subgraph's initializers + del subgraph.initializer[:] + subgraph.initializer.extend([init for init in used_initializers if init in initializers]) + + # Recursively process subgraphs within this subgraph + for node in nodes: + for attr in node.attribute: + if attr.type == onnx.AttributeProto.GRAPH: + remove_unused_initializers(attr.g, 
top_level_initializers) + elif attr.type == onnx.AttributeProto.GRAPHS: + for subgraph in attr.graphs: + remove_unused_initializers(subgraph, top_level_initializers) diff --git a/tutorials/whisper_e2e.py b/tutorials/whisper_e2e.py index 5852d00de..1faa53016 100644 --- a/tutorials/whisper_e2e.py +++ b/tutorials/whisper_e2e.py @@ -166,6 +166,7 @@ def preprocessing(audio_data): onnx.save_model(pre_model, os.path.join(root_dir, prep_model_name)) if USE_ONNX_STFT: pre_model = _to_onnx_stft(pre_model) + util.remove_unused_initializers(pre_model.graph) pre_f = PyOrtFunction.from_model(pre_model, cpu_only=True) if not USE_AUDIO_DECODER: @@ -255,9 +256,8 @@ def postprocessing(token_ids, hf_processor): # model = WhisperForConditionalGeneration.from_pretrained(model_name) # The onnx model can be generated by the following command: - # python \onnxruntime\python\tools\transformers\models\whisper\convert_to_onnx.py - # -m "openai/whisper-base.en" -e - # !only be valid after onnxruntime 1.15 or main branch of 04/04/2023 + # python -m onnxruntime.transformers.models.whisper.convert_to_onnx -m "openai/whisper-base.en" -e + # !!! only be valid after onnxruntime 1.15 or nightly build after 05/05/2023 model = PyOrtFunction.from_model(args.model, cpu_only=True) test_file = util.get_test_data_file(args.audio) From edac207dc365292de9de06dd35ccd47cfbddeed2 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 8 May 2023 13:44:19 -0700 Subject: [PATCH 02/17] Add nuget.org publish version option (#426) * Add nuget.org publish version option * typo * small fix * typo --------- Co-authored-by: Sayan Shaw --- .pipelines/nuget.yml | 6 ++++++ .pipelines/templates/build-package-for-nuget.yml | 8 +++++++- tools/ci_build/update_nuspec_for_native_nuget.py | 7 +++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.pipelines/nuget.yml b/.pipelines/nuget.yml index 73781147b..e0b8edd98 100644 --- a/.pipelines/nuget.yml +++ b/.pipelines/nuget.yml @@ -14,6 +14,11 @@ parameters: type: boolean default: false +- name: IsForNugetPublish + displayName: Is this for publishing to nuget.org? If so, set to true, and update version info. + type: boolean + default: false + trigger: branches: exclude: @@ -30,4 +35,5 @@ stages: DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + IsForNugetPublish: ${{ parameters.IsForNugetPublish }} OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Extensions' diff --git a/.pipelines/templates/build-package-for-nuget.yml b/.pipelines/templates/build-package-for-nuget.yml index 769117b1a..dfd792484 100644 --- a/.pipelines/templates/build-package-for-nuget.yml +++ b/.pipelines/templates/build-package-for-nuget.yml @@ -14,6 +14,11 @@ parameters: type: boolean default: false +- name: IsForNugetPublish + displayName: Is for publish to nuget.org? 
+ type: boolean + default: false + - name: OrtNugetPackageId displayName: Package name for nuget type: string @@ -219,7 +224,8 @@ stages: python $(Build.SourcesDirectory)\tools\ci_build\update_nuspec_for_native_nuget.py ` --package_version $OrtExtVersion ` --commit_id $(Build.SourceVersion) ` - --is_release_build ${{ parameters.IsReleaseBuild }} + --is_release_build ${{ parameters.IsReleaseBuild }} ` + --is_for_nuget_publish ${{ parameters.IsForNugetPublish }} cat $(Build.SourcesDirectory)\nuget\NativeNuget.nuspec workingDirectory: '$(Build.SourcesDirectory)' diff --git a/tools/ci_build/update_nuspec_for_native_nuget.py b/tools/ci_build/update_nuspec_for_native_nuget.py index 04b666946..59547b189 100644 --- a/tools/ci_build/update_nuspec_for_native_nuget.py +++ b/tools/ci_build/update_nuspec_for_native_nuget.py @@ -21,6 +21,11 @@ def update_nuspec(args): if package_item.tag == "version" and args.package_version: if args.is_release_build: package_item.text = args.package_version + elif args.is_for_nuget_publish: + # Update prefix and postfix below as per NuGet prelease guidelines and team discussions + prefix = "alpha" + postfix = "1" + package_item.text = f"{args.package_version}-{prefix}.{postfix}" else: import datetime now = datetime.datetime.now().strftime('%Y%m%d-%H%M') @@ -51,10 +56,12 @@ def parse_arguments(): help="Path to nuspec file to update.") parser.add_argument("--commit_id", required=True, help="The last commit id included in this package.") parser.add_argument("--is_release_build", default="False", type=str, help="If it's a release build.") + parser.add_argument("--is_for_nuget_publish", default="False", type=str, help="If it's for publishing to nuget.org.") args = parser.parse_args() args.nuspec_path = args.nuspec_path.resolve(strict=True) args.is_release_build = args.is_release_build.lower() == "true" + args.is_for_nuget_publish = args.is_for_nuget_publish.lower() == "true" print("used args:", args) return args From 03b96c822c9db2a2dddee92c5c4f30fb5c8d1143 Mon Sep 17 00:00:00 2001 From: Vishal Jain <36761320+VishalX@users.noreply.github.com> Date: Thu, 11 May 2023 14:06:14 +0530 Subject: [PATCH 03/17] Fix ReadMe : Example usage of the PrePostProcessor.md (#436) - Small typo fix in "Add post-processing steps" --- .../tools/Example usage of the PrePostProcessor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime_extensions/tools/Example usage of the PrePostProcessor.md b/onnxruntime_extensions/tools/Example usage of the PrePostProcessor.md index c885dff25..d6d461aa5 100644 --- a/onnxruntime_extensions/tools/Example usage of the PrePostProcessor.md +++ b/onnxruntime_extensions/tools/Example usage of the PrePostProcessor.md @@ -79,7 +79,7 @@ Similarly the post-processing is assembled the same way. Let's say it's simply a first model output: ``` py -pipeline.add_pre_processing( +pipeline.add_post_processing( [ Softmax() ] From 64f20828ce0291394886e277c23529cd1d11320d Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 12 May 2023 07:13:37 +1000 Subject: [PATCH 04/17] Handle ONNX 1.14 in test scripts (#435) * Calculate and specify ir_version so we use the oldest possible for maximum compatibility * Don't use `ignore_unknown` in call to `find_min_ir_version_for` as it's only supported in the most recent ONNX release. 
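
For reference, a minimal sketch of the pattern applied in this change (illustrative
only — it assumes `graph` and its `opset_imports` have already been built):

    import onnx

    # Only pass the ONNX domains to find_min_ir_version_for; custom domains would
    # need `ignore_unknown`, which older ONNX releases don't support.
    onnx_domains = [entry for entry in opset_imports
                    if entry.domain in ("", "ai.onnx")]
    ir_version = onnx.helper.find_min_ir_version_for(onnx_domains)
    model = onnx.helper.make_model(graph, opset_imports=opset_imports,
                                   ir_version=ir_version)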
--- .../pre_post_processing/pre_post_processor.py | 14 +++++++++----- test/data/ppp_vision/create_boxdrawing_model.py | 6 ++++-- .../ppp_vision/create_decode_encode_test_model.py | 4 +++- .../test_tools_add_pre_post_processing_to_model.py | 10 ++++++---- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py b/onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py index d4960422a..d175edaf8 100644 --- a/onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py +++ b/onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py @@ -57,10 +57,10 @@ def __init__(self, inputs: List[onnx.ValueInfoProto] = None, onnx_opset: int = 1 self._post_processing_joins = None # type: Union[None,List[Tuple[Union[Step, str], int, str]]] self._inputs = inputs if inputs else [] - + # preserve outputs from IOMapEntry, avoid it's consumed by the Follow-up steps. # we now can support a output value has more than one consumers with IOEntryValuePreserver. - # IOEntryValuePreserver will preserve the output value and add it to the graph output + # IOEntryValuePreserver will preserve the output value and add it to the graph output # until consumer step is done. self.outputs_preserver = [] # type: List[IOEntryValuePreserver] @@ -206,7 +206,7 @@ def connect_and_run(graph: onnx.GraphProto, processor: Step, connections: List[I io_map.append((step.output_names[step_idx], graph_input)) step_graph_outputs.remove((step.output_names[step_idx])) - # add outputs from previous IoMapEntry producers to maintain them as graph outputs + # add outputs from previous IoMapEntry producers to maintain them as graph outputs # until consumed by the final Step that requires them. step_graph_outputs += [ o.name for o in graph.output if o.name not in step_graph_outputs] @@ -253,7 +253,11 @@ def connect_and_run(graph: onnx.GraphProto, processor: Step, connections: List[I opset_imports = [onnx.helper.make_operatorsetid(domain, opset) for domain, opset in self._custom_op_checker_context.opset_imports.items()] - new_model = onnx.helper.make_model(graph, opset_imports=opset_imports) + # find_min_ir_version_for doesn't support custom domains until ONNX 1.14 so extract the ONNX opset from the + # imports and only pass that in. + ir_version = onnx.helper.find_min_ir_version_for([entry for entry in opset_imports + if entry.domain == "" or entry.domain == "ai.onnx"]) + new_model = onnx.helper.make_model(graph, opset_imports=opset_imports, ir_version=ir_version) onnx.checker.check_model(new_model) @@ -275,7 +279,7 @@ def __add_processing( Can be: A Step instance. This will be implicitly joined to the immediately previous Step if one exists. A tuple of (Step instance, list of IoMapEntry) - The IoMapEntry values are used to manually join an output from a producer Step to an input + The IoMapEntry values are used to manually join an output from a producer Step to an input of the current Step. In each IoMapEntry, if a step name is provided the producer Step will be searched for in all predecessor steps. It is valid for a post-processor step to consume output from a diff --git a/test/data/ppp_vision/create_boxdrawing_model.py b/test/data/ppp_vision/create_boxdrawing_model.py index 17f9a8ba7..c0d4f4a4b 100644 --- a/test/data/ppp_vision/create_boxdrawing_model.py +++ b/test/data/ppp_vision/create_boxdrawing_model.py @@ -14,7 +14,7 @@ def create_model(output_file: Path, **kwargs): """ Create unit test model. 
If input is bytes from a jpg we do the following - DecodeImage: jpg to BGR - - Resize: for simulate fixed input size, + - Resize: for simulate fixed input size, - LetterBox: for simulate fixed input size, copy border to fill the rest - DrawBoundingBoxes: draw bounding boxes on the image - EncodeImage: BGR to png (output format is set in the node) @@ -61,7 +61,9 @@ def create_model(output_file: Path, **kwargs): ) onnx_import = onnx.helper.make_operatorsetid('', onnx_opset) - model = onnx.helper.make_model(g, opset_imports=[onnx_import]) + ir_version = onnx.helper.find_min_ir_version_for([onnx_import]) + model = onnx.helper.make_model_gen_version(g, opset_imports=[onnx_import], ir_version=ir_version) + new_model = pipeline.run(model) new_model.doc_string = "Model for testing drawing box." new_model.graph.doc_string = "" # clear out all the messages from graph merges diff --git a/test/data/ppp_vision/create_decode_encode_test_model.py b/test/data/ppp_vision/create_decode_encode_test_model.py index 06a9375f6..5f43b2fdb 100644 --- a/test/data/ppp_vision/create_decode_encode_test_model.py +++ b/test/data/ppp_vision/create_decode_encode_test_model.py @@ -48,7 +48,9 @@ def create_model(output_file: Path): ) onnx_import = onnx.helper.make_operatorsetid('', onnx_opset) - model = onnx.helper.make_model(g, opset_imports=[onnx_import]) + ir_version = onnx.helper.find_min_ir_version_for([onnx_import]) + model = onnx.helper.make_model_gen_version(g, opset_imports=[onnx_import], ir_version=ir_version) + new_model = pipeline.run(model) new_model.doc_string = "Model for testing DecodeImage and EncodeImage." new_model.graph.doc_string = "" # clear out all the messages from graph merges diff --git a/test/test_tools_add_pre_post_processing_to_model.py b/test/test_tools_add_pre_post_processing_to_model.py index 6561ab040..bdbcbd033 100644 --- a/test/test_tools_add_pre_post_processing_to_model.py +++ b/test/test_tools_add_pre_post_processing_to_model.py @@ -496,7 +496,8 @@ def create_pipeline_and_run_for_nms(self, output_model: Path, length: int, inputs = [create_named_value("box_and_score", onnx.TensorProto.FLOAT, ["num_boxes", length])] - pipeline = pre_post_processing.PrePostProcessor(inputs) + onnx_opset = 16 + pipeline = pre_post_processing.PrePostProcessor(inputs, onnx_opset) pipeline.add_post_processing([ SplitOutBoxAndScore(num_classes=1), @@ -512,9 +513,10 @@ def create_pipeline_and_run_for_nms(self, output_model: Path, length: int, _output = Identity(_input) }} """) - input_model = onnx.helper.make_model(graph_def, producer_name="onnx-1") - input_model.opset_import.pop() - input_model.opset_import.extend([onnx.helper.make_operatorsetid("", 16)]) + + onnx_import = onnx.helper.make_operatorsetid('', onnx_opset) + ir_version = onnx.helper.find_min_ir_version_for([onnx_import]) + input_model = onnx.helper.make_model_gen_version(graph_def, opset_imports=[onnx_import], ir_version=ir_version) new_model = pipeline.run(input_model) onnx.save_model(new_model, output_model) From 598dfcbfc7ea75b5ade6ddffc8e7bd85eba50eb9 Mon Sep 17 00:00:00 2001 From: JiCheng <247153481@qq.com> Date: Mon, 15 May 2023 10:47:15 +0800 Subject: [PATCH 05/17] overflow (#439) --- operators/vision/draw_bounding_box.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operators/vision/draw_bounding_box.cc b/operators/vision/draw_bounding_box.cc index b917806b1..c6d28f044 100644 --- a/operators/vision/draw_bounding_box.cc +++ b/operators/vision/draw_bounding_box.cc @@ -205,7 +205,7 @@ void DrawBoxesForNumClasses(ImageView& 
image, const BoxArray& boxes, int64_t thi [](const std::pair& first_, const std::pair& second_) { return first_.second < second_.second; }); - for (int64_t i = static_cast(box_reverse.size() - 1); i >= 0; --i) { + for (int64_t i = static_cast(box_reverse.size()) - 1; i >= 0; --i) { auto [box_index, color_index] = box_reverse[i]; const auto box = boxes.GetBox(box_index); const auto color = KBGRColorMap[color_index]; From 56b978233d18abb57aa43657619064ad372c01f3 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 15 May 2023 10:34:07 -0700 Subject: [PATCH 06/17] Fix OneBranch Official pipeline CodeQL issue (#437) Co-authored-by: Sayan Shaw --- .pipelines/OneBranch.Official.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pipelines/OneBranch.Official.yml b/.pipelines/OneBranch.Official.yml index ef5ac0271..4fb3ff3b5 100644 --- a/.pipelines/OneBranch.Official.yml +++ b/.pipelines/OneBranch.Official.yml @@ -47,8 +47,9 @@ extends: break: true # always break the build on binskim issues in addition to TSA upload analyzeTargetGlob: '**\RelWithDebInfo\ortextensions.dll' # avoid scanning the 3rd party DLLs. codeql: - python: + compiled: enabled: true + cadence: 10 policheck: break: true # always break the build on policheck issues. You can disable it by setting to 'false' exclusionsFile: '$(REPOROOT)\.config\policheck_exclusions.xml' From 239febe4c36fb472f42da86d5d689e1454f8c1ff Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 15 May 2023 13:11:05 -0700 Subject: [PATCH 07/17] Update cgmanifest.json and ThirdPartyNotices.txt (#438) * Update cgmanifest.json and ThirdPartyNotices.txt * add gsl and dr_libs --------- Co-authored-by: Sayan Shaw --- ThirdPartyNotices.txt | 55 ++++++++++++++++++++++++++++++++++++++++--- cgmanifest.json | 44 ++++++++++++++++++++++++---------- 2 files changed, 83 insertions(+), 16 deletions(-) diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 14bda2dfb..70dbb325d 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -45,7 +45,7 @@ blingfire 0831265c1aca95ca02eca5bf1155e4251e545328 _____ -dlib v19.22 +dlib a12824d42584e292ecb3bad05c4b32c2015a7b89 Boost Software License - Version 1.0 - August 17th, 2003 @@ -72,7 +72,7 @@ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. _____ -google/re2 2020-11-01 +google/re2 2021-06-01 Copyright (c) 2009 The RE2 Authors. All rights reserved. @@ -122,7 +122,7 @@ Viatcheslav Ostapenko _____ -nlohmann/json +nlohmann/json v3.10.5 MIT License @@ -571,3 +571,52 @@ sentencepiece 0.1.96 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +_____ + +dr_libs dd762b861ecadf5ddd5fb03e9ca1db6707b54fbb + + MIT No Attribution + + Copyright 2020 David Reid + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +_____ + +gsl 3.24.0 + + Copyright (c) 2015 Microsoft Corporation. All rights reserved. + + This code is licensed under the MIT License (MIT). + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. \ No newline at end of file diff --git a/cgmanifest.json b/cgmanifest.json index 28722a186..5990790b7 100644 --- a/cgmanifest.json +++ b/cgmanifest.json @@ -5,7 +5,7 @@ "component": { "type": "git", "git": { - "commitHash": "v1.6.0", + "commitHash": "v1.14.1", "repositoryUrl": "https://github.com/microsoft/onnxruntime.git" } } @@ -14,7 +14,7 @@ "component": { "type": "git", "git": { - "commitHash": "0dab03ba7bc438d7ba3eac2b2c1eb39ed520f928", + "commitHash": "6e511679de8ab0feefc1cdac1505b2fac5548e42", "repositoryUrl": "https://github.com/protocolbuffers/protobuf.git" } } @@ -32,7 +32,7 @@ "component": { "type": "git", "git": { - "commitHash": "v19.22", + "commitHash": "a12824d42584e292ecb3bad05c4b32c2015a7b89", "repositoryUrl": "https://github.com/davisking/dlib.git" } } @@ -41,7 +41,16 @@ "component": { "type": "git", "git": { - "commitHash": "2020-11-01", + "commitHash": "dd762b861ecadf5ddd5fb03e9ca1db6707b54fbb", + "repositoryUrl": "https://github.com/mackron/dr_libs.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "2021-06-01", "repositoryUrl": "https://github.com/google/re2.git" } } @@ -59,7 +68,16 @@ "component": { "type": "git", "git": { - "commitHash": "v3.7.3", + "commitHash": "3.24.0", + "repositoryUrl": "https://github.com/microsoft/GSL.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "v3.10.5", "repositoryUrl": "https://github.com/nlohmann/json.git" } } @@ -77,8 +95,8 @@ "component": { "type": "git", "git": { - "commitHash": "9d 12-Jan-2020", - "repositoryUrl": "https://github.com/opencv/3rdparty/libjpeg" + "commitHash": "364702b1c98943e4e306e745389d3f464010f069", + "repositoryUrl": "https://github.com/opencv/opencv/tree/4.x/3rdparty/libjpeg" }, "comments": "Used by OpenCV" } @@ -87,8 +105,8 @@ "component": { "type": "git", "git": { - "commitHash": "2.4.0", - 
"repositoryUrl": "https://github.com/opencv/3rdparty/openjpeg" + "commitHash": "a2fc479c0b36d1786a9570ddb76f2ab72626994b", + "repositoryUrl": "https://github.com/opencv/opencv/tree/4.x/3rdparty/openjpeg" }, "comments": "Used by OpenCV" } @@ -97,8 +115,8 @@ "component": { "type": "git", "git": { - "commitHash": "1.6.37", - "repositoryUrl": "https://github.com/opencv/3rdparty/libpng" + "commitHash": "d9bf522b271ed026813cbe35399b5aead3c9b670", + "repositoryUrl": "https://github.com/opencv/opencv/tree/4.x/3rdparty/libpng" }, "comments": "Used by OpenCV" } @@ -107,7 +125,7 @@ "component": { "type": "git", "git": { - "commitHash": "v2.6.0", + "commitHash": "v2.6.2", "repositoryUrl": "https://github.com/pybind/pybind11.git" } } @@ -125,7 +143,7 @@ "component": { "type": "git", "git": { - "commitHash": "v1.2.11", + "commitHash": "v1.2.13", "repositoryUrl": "https://github.com/madler/zlib.git" }, "comments": "Used by OpenCV" From 2cedfa9fdf405e35fac289ed9d8108259a240b43 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 15 May 2023 14:53:49 -0700 Subject: [PATCH 08/17] Update nuget version to beta (#441) * Update nuget version to beta * small change --------- Co-authored-by: Sayan Shaw --- tools/ci_build/update_nuspec_for_native_nuget.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/update_nuspec_for_native_nuget.py b/tools/ci_build/update_nuspec_for_native_nuget.py index 59547b189..4a7c65830 100644 --- a/tools/ci_build/update_nuspec_for_native_nuget.py +++ b/tools/ci_build/update_nuspec_for_native_nuget.py @@ -22,10 +22,9 @@ def update_nuspec(args): if args.is_release_build: package_item.text = args.package_version elif args.is_for_nuget_publish: - # Update prefix and postfix below as per NuGet prelease guidelines and team discussions - prefix = "alpha" - postfix = "1" - package_item.text = f"{args.package_version}-{prefix}.{postfix}" + # Update version_suffix below if publishing to NuGet + version_suffix = "beta" # alpha/beta/rc + package_item.text = f"{args.package_version}-{version_suffix}" else: import datetime now = datetime.datetime.now().strftime('%Y%m%d-%H%M') From 4d652011a8d474f836b3bb000f75eaff9dff402b Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 17 May 2023 08:02:42 +1000 Subject: [PATCH 09/17] Minor cmake updates (#432) * Update minimum cmake version to 3.25 * Resolve issue with CMAKE_FIND_FRAMEWORK * Change to use pool with VS2022 for win32 wheel build so it has cmake 3.25 * Update ext_ortlib.cmake so it doesn't break when cross-compiling for Android on Windows by defaulting to a build even though it can't be used with Android. Need to address the unit testing gap for Android/iOS separately. 
--- .pipelines/wheels_win32.yml | 2 +- CMakeLists.txt | 11 ++++++++--- cmake/ext_ortlib.cmake | 30 ++++++++++++++++-------------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/.pipelines/wheels_win32.yml b/.pipelines/wheels_win32.yml index cc65b4666..a486ce8fb 100644 --- a/.pipelines/wheels_win32.yml +++ b/.pipelines/wheels_win32.yml @@ -1,7 +1,7 @@ jobs: - job: windows timeoutInMinutes: 120 - pool: {vmImage: 'windows-latest', name: 'Win-CPU-2021'} + pool: {name: 'onnxruntime-Win-CPU-2022'} variables: CIBW_BUILD: "cp3{7,8,9,10}-*amd64" diff --git a/CMakeLists.txt b/CMakeLists.txt index 987aeefc5..96a3ea43e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.25) project(onnxruntime_extensions LANGUAGES C CXX) # set(CMAKE_VERBOSE_MAKEFILE ON) @@ -145,10 +145,15 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) set_property(GLOBAL PROPERTY USE_FOLDERS ON) -set(CMAKE_FIND_FRAMEWORK NEVER CACHE STRING "...") +# set both regular and cache variables to NEVER. the regular variable has a default of FIRST defined by cmake, +# but due to CMP0126 that will exist in parallel to the cached variable if the CMake minimum version is >= 3.25. +# if we don't set this to NEVER (or possibly LAST) the builds of the wheel for different python versions will fail +# as it will find the system python version first and not the correct python version for the wheel. +set(CMAKE_FIND_FRAMEWORK "NEVER") +set(CMAKE_FIND_FRAMEWORK "NEVER" CACHE STRING "...") if(NOT "${CMAKE_FIND_FRAMEWORK}" STREQUAL "NEVER") - message(FATAL_ERROR "CMAKE_FIND_FRAMEWORK is not NEVER") + message(STATUS "CMAKE_FIND_FRAMEWORK is ${CMAKE_FIND_FRAMEWORK} not NEVER.") endif() # External dependencies diff --git a/cmake/ext_ortlib.cmake b/cmake/ext_ortlib.cmake index bab2c3ee4..edd0d3059 100644 --- a/cmake/ext_ortlib.cmake +++ b/cmake/ext_ortlib.cmake @@ -11,24 +11,26 @@ else() # default to 1.11.1 if not specified set(ONNXRUNTIME_VER "1.11.1" CACHE STRING "ONNX Runtime version") - if(CMAKE_HOST_APPLE) + if(APPLE) set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-osx-universal2-${ONNXRUNTIME_VER}.tgz") - elseif(CMAKE_HOST_WIN32) - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") + elseif(WIN32) + set(ONNXRUNTIME_BINARY_PLATFORM "x64") + + # override if generator platform is set + if (CMAKE_GENERATOR_PLATFORM) if (CMAKE_GENERATOR_PLATFORM STREQUAL "Win32") - set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-x86-${ONNXRUNTIME_VER}.zip") - else() - set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-x64-${ONNXRUNTIME_VER}.zip") - endif() - elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") - if (CMAKE_GENERATOR_PLATFORM STREQUAL "ARM") - set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-arm-${ONNXRUNTIME_VER}.zip") - else() - set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-arm64-${ONNXRUNTIME_VER}.zip") + set(ONNXRUNTIME_BINARY_PLATFORM "x86") + elseif (CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64EC") + set(ONNXRUNTIME_BINARY_PLATFORM "arm64") + elseif (CMAKE_GENERATOR_PLATFORM STREQUAL "ARM") + set(ONNXRUNTIME_BINARY_PLATFORM "arm") endif() - else() - message(FATAL_ERROR "Unexpected CMAKE_SYSTEM_PROCESSOR of ${CMAKE_SYSTEM_PROCESSOR}.") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + # or if building on arm64 machine + set(ONNXRUNTIME_BINARY_PLATFORM "arm64") endif() + + set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-${ONNXRUNTIME_BINARY_PLATFORM}-${ONNXRUNTIME_VER}.zip") else() # 
Linux or other, using Linux package to retrieve the headers set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-linux-x64-${ONNXRUNTIME_VER}.tgz") From 15dfd7033817c0bf7db020959d20d024837d8eb3 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Thu, 18 May 2023 17:46:36 -0700 Subject: [PATCH 10/17] Add explicit 'set +x' before printing a vso[] command to avoid output getting parsed again with a trailing quote. (#443) --- .pipelines/android_packaging.yml | 2 ++ .pipelines/ci.yml | 2 ++ .pipelines/templates/build-package-for-android-aar.yml | 2 ++ .pipelines/templates/run-with-android-emulator-steps.yml | 2 ++ .pipelines/templates/run-with-ios-simulator-steps.yml | 2 ++ .pipelines/templates/set-package-version-variable-step.yml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/.pipelines/android_packaging.yml b/.pipelines/android_packaging.yml index d30ee929c..bc7b2fe2c 100644 --- a/.pipelines/android_packaging.yml +++ b/.pipelines/android_packaging.yml @@ -44,6 +44,8 @@ jobs: VERSION=$(cat ./version.txt) AAR_PATH="$(Build.BinariesDirectory)/android_aar/aar_out/$(buildConfig)/com/microsoft/onnxruntime/onnxruntime-extensions-android/${VERSION}/onnxruntime-extensions-android-${VERSION}.aar" + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_AAR_PATH]${AAR_PATH}" displayName: Build onnxruntime-extensions AAR package diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml index 912fb3390..b3f435ec1 100644 --- a/.pipelines/ci.yml +++ b/.pipelines/ci.yml @@ -385,6 +385,8 @@ jobs: VERSION=$(cat ./version.txt) AAR_PATH="$(Build.BinariesDirectory)/android_aar/aar_out/com/microsoft/onnxruntime/onnxruntime-extensions-android/${VERSION}/onnxruntime-extensions-android-${VERSION}.aar" + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_AAR_PATH]${AAR_PATH}" displayName: Build onnxruntime-extensions AAR package diff --git a/.pipelines/templates/build-package-for-android-aar.yml b/.pipelines/templates/build-package-for-android-aar.yml index d026911a8..87aeaf254 100644 --- a/.pipelines/templates/build-package-for-android-aar.yml +++ b/.pipelines/templates/build-package-for-android-aar.yml @@ -87,6 +87,8 @@ stages: VERSION=$(cat ./version.txt) AAR_PATH="$(Build.BinariesDirectory)/android_aar/aar_out/$(buildConfig)/com/microsoft/onnxruntime/onnxruntime-extensions-android/${VERSION}/onnxruntime-extensions-android-${VERSION}.aar" + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_AAR_PATH]${AAR_PATH}" workingDirectory: '$(Build.SourcesDirectory)' - bash: | diff --git a/.pipelines/templates/run-with-android-emulator-steps.yml b/.pipelines/templates/run-with-android-emulator-steps.yml index c955e22ac..c58737b74 100644 --- a/.pipelines/templates/run-with-android-emulator-steps.yml +++ b/.pipelines/templates/run-with-android-emulator-steps.yml @@ -14,6 +14,8 @@ steps: --start --emulator-extra-args="-partition-size 4096" \ --emulator-pid-file "${ORT_EXTENSIONS_BUILD_ANDROID_EMULATOR_PID_FILE}" + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
+ set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_BUILD_ANDROID_EMULATOR_PID_FILE]${ORT_EXTENSIONS_BUILD_ANDROID_EMULATOR_PID_FILE}" displayName: "Create and start Android emulator" diff --git a/.pipelines/templates/run-with-ios-simulator-steps.yml b/.pipelines/templates/run-with-ios-simulator-steps.yml index 7cade706d..905b0d162 100644 --- a/.pipelines/templates/run-with-ios-simulator-steps.yml +++ b/.pipelines/templates/run-with-ios-simulator-steps.yml @@ -8,6 +8,8 @@ steps: ORT_EXTENSIONS_BUILD_SIMULATOR_ID=$(xcrun simctl create iPhoneSimulatorForPipeline com.apple.CoreSimulator.SimDeviceType.iPhone-8) + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_BUILD_SIMULATOR_ID]${ORT_EXTENSIONS_BUILD_SIMULATOR_ID}" displayName: "Create iPhone simulator" diff --git a/.pipelines/templates/set-package-version-variable-step.yml b/.pipelines/templates/set-package-version-variable-step.yml index d1d65f7fa..11c10e40a 100644 --- a/.pipelines/templates/set-package-version-variable-step.yml +++ b/.pipelines/templates/set-package-version-variable-step.yml @@ -25,5 +25,7 @@ steps: VERSION="${BASE_VERSION}-dev+$(Build.BuildId).${SHORT_COMMIT_HASH}" fi + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=${{ parameters.PackageVersionVariableName }}]${VERSION}" displayName: "Set \"${{ parameters.PackageVersionVariableName }}\" variable to package version" From b603c0283aa82e3e6afc10d42e36b4a0f73640d3 Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Sun, 21 May 2023 21:40:11 -0700 Subject: [PATCH 11/17] fixing the universal2 python package for macOS (#448) --- pyproject.toml | 2 +- setup.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b0d581da3..cdc0c1bbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] # Minimum requirements for the build system to execute. -requires = ["setuptools", "wheel", "numpy>=1.18.5", "cmake"] # PEP 508 specifications. +requires = ["setuptools", "wheel", "numpy>=1.18.5", "ninja", "cmake"] # PEP 508 specifications. [tool.black] line-length = 120 diff --git a/setup.py b/setup.py index 47e3c8ccf..d0a4716c6 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ from setuptools.command.build import build as _build from setuptools.command.build_ext import build_ext as _build_ext +import re import os import sys import setuptools @@ -82,6 +83,7 @@ def build_cmake(self, extension): '-DOCOS_EXTENTION_NAME=' + ext_fullpath.name, '-DCMAKE_BUILD_TYPE=' + config ] + if os.environ.get('OCOS_NO_OPENCV') == '1': # Disabling openCV can drastically reduce the build time. cmake_args += [ @@ -90,6 +92,38 @@ def build_cmake(self, extension): '-DOCOS_ENABLE_CV2=OFF', '-DOCOS_ENABLE_VISION=OFF'] + # CMake lets you override the generator - we need to check this. + # Can be set with Conda-Build, for example. + cmake_generator = os.environ.get("CMAKE_GENERATOR", "") + # Adding CMake arguments set as environment variable + # (needed e.g. to build for ARM OSx on conda-forge) + if "CMAKE_ARGS" in os.environ: + cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + + if sys.platform != "win32": + # Using Ninja-build since it a) is available as a wheel and b) + # multithreads automatically. 
MSVC would require all variables be + # exported for Ninja to pick it up, which is a little tricky to do. + # Users can override the generator with CMAKE_GENERATOR in CMake + # 3.15+. + if not cmake_generator or cmake_generator == "Ninja": + try: + import ninja # noqa: F401 + + ninja_executable_path = os.path.join(ninja.BIN_DIR, "ninja") + cmake_args += [ + "-GNinja", + f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", + ] + except ImportError: + pass + + if sys.platform.startswith("darwin"): + # Cross-compile support for macOS - respect ARCHFLAGS if set + archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", "")) + if archs: + cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] + # overwrite the Python module info if the auto-detection doesn't work. # export Python3_INCLUDE_DIRS=/opt/python/cp38-cp38 # export Python3_LIBRARIES=/opt/python/cp38-cp38 From 77cf3e6d2cb40c3ef7c8e083a97a0a9cdf19cfa8 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Sun, 21 May 2023 23:34:09 -0700 Subject: [PATCH 12/17] Remove onnx<1.14 from requirements.txt (#447) * remove onnx<1.14 from requirements.txt * downgrade protobuf * move protobuf req to requirements-dev.txt --------- Co-authored-by: Sayan Shaw Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com> --- requirements-dev.txt | 1 + requirements.txt | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5ac0ca151..2f6d3b102 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,4 @@ pytest onnxruntime >=1.10.0 transformers >= 4.9.2,<=4.24.0 tensorflow_text >=2.5.0 +protobuf==3.20.* diff --git a/requirements.txt b/requirements.txt index e6e7e984f..ccbcb38bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ -# 1.14 hasn't be supported yet. -onnx>=1.9.0,<1.14 +onnx>=1.9.0 From 32e76e17b33469edf4ec66d4b8b005151bc0316c Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 22 May 2023 23:30:41 -0700 Subject: [PATCH 13/17] Upgrade CMake for Linux NuGet packaging pipeline (#454) * update nuget linux packaging pool to fix cmake version issue on nuget packaging pipeline * switch nuget linux pool to ubuntu-latest * upgrade cmake * more fixes * install cmake binary * try to use pip installed cmake * more fixes * add source bash profile reset * typo * try ~/.local/bin again * add comment --------- Co-authored-by: Sayan Shaw --- .pipelines/templates/build-package-for-linux.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.pipelines/templates/build-package-for-linux.yml b/.pipelines/templates/build-package-for-linux.yml index ed8e1607a..a0d91665a 100644 --- a/.pipelines/templates/build-package-for-linux.yml +++ b/.pipelines/templates/build-package-for-linux.yml @@ -36,6 +36,7 @@ jobs: # Currently we can only run tests on x64 as the arm64 tests have a failure # https://github.com/microsoft/onnxruntime-extensions/issues/417 + # NOTE: on arm64 machine, CMake version needs to be updated since we now require CMake 3.25 or newer. 
- ${{ if eq(parameters.OrtExtensionsArch, 'x64') }}: - bash: | export CFLAGS="${{parameters.OrtExtensionsCFlags}}" @@ -45,6 +46,10 @@ jobs: displayName: 'build onnxruntime-extensions and run tests' - ${{ else }}: - bash: | + sudo apt remove cmake + pip install cmake --upgrade + export PATH=~/.local/bin:$PATH + cmake --version export CFLAGS="${{parameters.OrtExtensionsCFlags}}" export CXXFLAGS="${{parameters.OrtExtensionsCXXFlags}}" ./build_lib.sh --build_dir $(Build.BinariesDirectory)/out/ --config RelWithDebInfo --parallel ${{parameters.AdditionalBuildFlags}} From 30aa8f1315bdc1d35cbe4b917ae4ca87fd4ec5b0 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Thu, 25 May 2023 13:35:53 -0700 Subject: [PATCH 14/17] Add ADO parameter for nuget version suffix (#455) * update nuget version to rc * add ADO parameter for nuget version suffix * remove is_for_nuget_publish --------- Co-authored-by: Sayan Shaw --- .pipelines/nuget.yml | 10 +++++----- .pipelines/templates/build-package-for-nuget.yml | 9 ++++----- tools/ci_build/update_nuspec_for_native_nuget.py | 9 +++------ 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/.pipelines/nuget.yml b/.pipelines/nuget.yml index e0b8edd98..0a588ccfa 100644 --- a/.pipelines/nuget.yml +++ b/.pipelines/nuget.yml @@ -14,10 +14,10 @@ parameters: type: boolean default: false -- name: IsForNugetPublish - displayName: Is this for publishing to nuget.org? If so, set to true, and update version info. - type: boolean - default: false +- name: NugetVersionSuffix + displayName: Update nuget version suffix (e.g. alpha/beta/rc, only if publishing to nuget.org, otherwise leave as "none"). + type: string + default: none trigger: branches: @@ -35,5 +35,5 @@ stages: DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - IsForNugetPublish: ${{ parameters.IsForNugetPublish }} + NugetVersionSuffix: ${{ parameters.NugetVersionSuffix }} OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Extensions' diff --git a/.pipelines/templates/build-package-for-nuget.yml b/.pipelines/templates/build-package-for-nuget.yml index dfd792484..a7981a8c0 100644 --- a/.pipelines/templates/build-package-for-nuget.yml +++ b/.pipelines/templates/build-package-for-nuget.yml @@ -14,10 +14,9 @@ parameters: type: boolean default: false -- name: IsForNugetPublish - displayName: Is for publish to nuget.org? 
- type: boolean - default: false +- name: NugetVersionSuffix + displayName: Nuget version suffix + type: string - name: OrtNugetPackageId displayName: Package name for nuget @@ -225,7 +224,7 @@ stages: --package_version $OrtExtVersion ` --commit_id $(Build.SourceVersion) ` --is_release_build ${{ parameters.IsReleaseBuild }} ` - --is_for_nuget_publish ${{ parameters.IsForNugetPublish }} + --nuget_version_suffix ${{ parameters.NugetVersionSuffix }} cat $(Build.SourcesDirectory)\nuget\NativeNuget.nuspec workingDirectory: '$(Build.SourcesDirectory)' diff --git a/tools/ci_build/update_nuspec_for_native_nuget.py b/tools/ci_build/update_nuspec_for_native_nuget.py index 4a7c65830..4d2149080 100644 --- a/tools/ci_build/update_nuspec_for_native_nuget.py +++ b/tools/ci_build/update_nuspec_for_native_nuget.py @@ -21,10 +21,8 @@ def update_nuspec(args): if package_item.tag == "version" and args.package_version: if args.is_release_build: package_item.text = args.package_version - elif args.is_for_nuget_publish: - # Update version_suffix below if publishing to NuGet - version_suffix = "beta" # alpha/beta/rc - package_item.text = f"{args.package_version}-{version_suffix}" + elif args.nuget_version_suffix != "none": + package_item.text = f"{args.package_version}-{args.nuget_version_suffix}" else: import datetime now = datetime.datetime.now().strftime('%Y%m%d-%H%M') @@ -55,12 +53,11 @@ def parse_arguments(): help="Path to nuspec file to update.") parser.add_argument("--commit_id", required=True, help="The last commit id included in this package.") parser.add_argument("--is_release_build", default="False", type=str, help="If it's a release build.") - parser.add_argument("--is_for_nuget_publish", default="False", type=str, help="If it's for publishing to nuget.org.") + parser.add_argument("--nuget_version_suffix", type=str, help="Nuget version suffix (needed if publishing to nuget.org and not release build)") args = parser.parse_args() args.nuspec_path = args.nuspec_path.resolve(strict=True) args.is_release_build = args.is_release_build.lower() == "true" - args.is_for_nuget_publish = args.is_for_nuget_publish.lower() == "true" print("used args:", args) return args From 70411fdd9652bbc2d19df7c7da560228b2353bcb Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Fri, 26 May 2023 10:30:16 -0700 Subject: [PATCH 15/17] Update release notes for nuget (#456) * Update release notes for nuget * indentation fix --------- Co-authored-by: Sayan Shaw --- nuget/NativeNuget.nuspec | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/nuget/NativeNuget.nuspec b/nuget/NativeNuget.nuspec index be0910241..e79c95b5e 100644 --- a/nuget/NativeNuget.nuspec +++ b/nuget/NativeNuget.nuspec @@ -2,23 +2,19 @@ Microsoft.ML.OnnxRuntime.Extensions - 0.8.0-alpha + 0.8.0 Microsoft Microsoft ONNX Runtime Extensions NuGet Package - General - 1. New custom operators: Whisper, DrawBoundingBoxes, RobertaTokenizer, ClipTokenizer, EncodeImage, DecodeImage - 2. Optional input/output support - 3. ORT custom operator C++ stub generation tool - 4. Operator implementation and documentation improved. - - Mobile - 1. Android package: Maven - 2. iOS package: CocoaPods - 3. PrePostProcessor tool for mobile model - 4. Super-resolution model pre- / post- processing end-to-end examples - + 1. NuGet package for the .NET platform. This package offers comprehensive platform support, including Windows, Linux, MacOS, Android, and iOS. 
Both x64 and arm64 architectures are supported, where applicable. + 2. Support for pre-processing and post-processing of the Whisper model, inclusive of Audio and Tokenizer decoding operators. + 3. Extends support for pre-processing and post-processing of object-detection models, including a new DrawBoundingBoxes operator. Pre/post processing tools can add non-max-suppression to the model to select the best bounding boxes, and scale those to the original image. See the end-to-end example in yolo_e2e.py. + 4. Introduces the Audio Domain, complemented with AudioCodec and optimized STFT Operators, enhancing audio processing capabilities. + 5. Enabled optional input/output support for some operators such as GPT2Tokenizer, ClipTokenizer, and RobertaTokenizer. + 6. Refined the implementation of offset mapping for BBPE-style tokenizers for more operators and efficiency improvement. + 7. Other bug and security fixes. + © Microsoft Corporation. All rights reserved. ONNX ONNXRuntime AI Machine Learning ORT_icon_for_light_bg.png From 93f239c1434521b7a4b78f322dca1aafcda9a1be Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Tue, 30 May 2023 11:01:30 -0700 Subject: [PATCH 16/17] Unit test being compatible with ONNXRuntime-GPU package, and some clean-ups. (#457) --- onnxruntime_extensions/cmake_helper.py | 35 -------------- test/test_cliptok.py | 32 +++++++----- test/test_cmake_helper.py | 29 ----------- test/test_cv2.py | 8 +-- test/test_gpt2tok.py | 11 +++-- test/test_math_ops.py | 4 +- test/test_pyops.py | 12 ++--- test/test_robertatok.py | 12 ++--- test/test_string_concat.py | 2 +- test/test_string_ecma_regex.py | 10 ++-- test/test_string_length.py | 2 +- test/test_string_ops.py | 67 +++++++++++++------------- test/test_tools_customop_template.py | 28 +++++++++-- 13 files changed, 107 insertions(+), 145 deletions(-) delete mode 100644 onnxruntime_extensions/cmake_helper.py delete mode 100644 test/test_cmake_helper.py diff --git a/onnxruntime_extensions/cmake_helper.py b/onnxruntime_extensions/cmake_helper.py deleted file mode 100644 index 7955b683c..000000000 --- a/onnxruntime_extensions/cmake_helper.py +++ /dev/null @@ -1,35 +0,0 @@ -import inspect -from ._ocos import default_opset_domain -from . 
import _cuops - - -ALL_CUSTOM_OPS = {_name: _obj for _name, _obj in inspect.getmembers(_cuops) - if (inspect.isclass(_obj) and issubclass(_obj, _cuops.CustomOp))} - - -OPMAP_TO_CMAKE_FLAGS = {'GPT2Tokenizer': 'OCOS_ENABLE_GPT2_TOKENIZER', - 'BlingFireSentenceBreaker': 'OCOS_ENABLE_BLINGFIRE' - } - - -def gen_cmake_oplist(opconfig_file, oplist_cmake_file = '_selectedoplist.cmake'): - - ext_domain = default_opset_domain() - with open(oplist_cmake_file, 'w') as f: - print("# Auto-Generated File, not edited!!!", file=f) - with open(opconfig_file, 'r') as opfile: - for _ln in opfile: - if _ln.startswith(ext_domain): - items = _ln.strip().split(';') - if len(items) < 3: - raise RuntimeError("The malformated operator config file.") - for _op in items[2].split(','): - if not _op: - continue # is None or "" - if _op not in OPMAP_TO_CMAKE_FLAGS: - raise RuntimeError("Cannot find the custom operator({})\'s build flags, " - + "Please update the OPMAP_TO_CMAKE_FLAGS dictionary.".format(_op)) - print("set({} ON CACHE INTERNAL \"\")".format(OPMAP_TO_CMAKE_FLAGS[_op]), file=f) - print("# End of Building the Operator CMake variables", file=f) - - print('The cmake tool file has been generated successfully.') diff --git a/test/test_cliptok.py b/test/test_cliptok.py index 750f21ff5..5f772a682 100644 --- a/test/test_cliptok.py +++ b/test/test_cliptok.py @@ -12,6 +12,7 @@ PyOrtFunction) from onnxruntime_extensions.cvt import HFTokenizerConverter + def _get_file_content(path): with open(path, "rb") as file: return file.read() @@ -34,7 +35,8 @@ def _create_test_model(**kwargs): if kwargs["attention_mask"]: if kwargs["offset_map"]: node = [helper.make_node( - 'CLIPTokenizer', ['string_input'], ['input_ids', 'attention_mask', 'offset_mapping'], vocab=_get_file_content(vocab_file), + 'CLIPTokenizer', ['string_input'], + ['input_ids', 'attention_mask', 'offset_mapping'], vocab=_get_file_content(vocab_file), merges=_get_file_content(merges_file), name='bpetok', padding_length=max_length, domain='ai.onnx.contrib')] @@ -73,10 +75,11 @@ def setUpClass(cls): cls.tokenizer_cvt = HFTokenizerConverter(cls.slow_tokenizer) def _run_tokenizer(self, test_sentence, padding_length=-1): - model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=padding_length, attention_mask=True, offset_map=True) + model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, + max_length=padding_length, attention_mask=True, offset_map=True) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=["CPUExecutionProvider"]) input_text = np.array(test_sentence) input_ids, attention_mask, offset_mapping = sess.run(None, {'string_input': input_text}) print("\nTest Sentence: " + str(test_sentence)) @@ -111,7 +114,9 @@ def test_tokenizer(self): self._run_tokenizer(["One Microsoft Way, Redmond, WA"]) def test_converter(self): - fn_tokenizer = PyOrtFunction.from_customop("CLIPTokenizer", cvt=(self.tokenizer_cvt).clip_tokenizer) + fn_tokenizer = PyOrtFunction.from_customop("CLIPTokenizer", + cvt=(self.tokenizer_cvt).clip_tokenizer, + cpu_only=True) test_str = "I can feel the magic, can you?" 
fn_out = fn_tokenizer([test_str]) clip_out = self.tokenizer(test_str, return_offsets_mapping=True) @@ -120,16 +125,20 @@ def test_converter(self): expect_offset_mapping = clip_out['offset_mapping'] np.testing.assert_array_equal(fn_out[0].reshape((fn_out[0].size,)), expect_input_ids) np.testing.assert_array_equal(fn_out[1].reshape((fn_out[1].size,)), expect_attention_mask) - np.testing.assert_array_equal(fn_out[2].reshape((fn_out[2].shape[1], fn_out[2].shape[2])), expect_offset_mapping) + np.testing.assert_array_equal(fn_out[2].reshape((fn_out[2].shape[1], fn_out[2].shape[2])), + expect_offset_mapping) def test_optional_outputs(self): - # Test for models without offset mapping and without both attention mask and offset mapping (input id output is always required) - model1 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=-1, attention_mask=True, offset_map=False) - model2 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=-1, attention_mask=False, offset_map=False) + # Test for models without offset mapping and without both attention mask and offset mapping + # (input id output is always required) + model1 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, + max_length=-1, attention_mask=True, offset_map=False) + model2 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, + max_length=-1, attention_mask=False, offset_map=False) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess1 = _ort.InferenceSession(model1.SerializeToString(), so) - sess2 = _ort.InferenceSession(model2.SerializeToString(), so) + sess1 = _ort.InferenceSession(model1.SerializeToString(), so, providers=["CPUExecutionProvider"]) + sess2 = _ort.InferenceSession(model2.SerializeToString(), so, providers=["CPUExecutionProvider"]) input_text = np.array(["Hello World"]) outputs1 = sess1.run(None, {'string_input': input_text}) outputs2 = sess2.run(None, {'string_input': input_text}) @@ -142,10 +151,9 @@ def test_optional_outputs(self): clip_out = self.tokenizer(["Hello World"], return_offsets_mapping=True) expect_input_ids = clip_out['input_ids'] expect_attention_mask = clip_out['attention_mask'] - expect_offset_mapping = clip_out['offset_mapping'] np.testing.assert_array_equal(expect_input_ids, outputs1[0]) np.testing.assert_array_equal(expect_attention_mask, outputs1[1]) - np.testing.assert_array_equal(expect_input_ids, outputs2[0]) + if __name__ == "__main__": unittest.main() diff --git a/test/test_cmake_helper.py b/test/test_cmake_helper.py deleted file mode 100644 index 5a4b11a73..000000000 --- a/test/test_cmake_helper.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import unittest -from pathlib import Path -from onnxruntime_extensions import cmake_helper - - -def _get_test_data_file(*sub_dirs): - test_dir = Path(__file__).parent - return str(test_dir.joinpath(*sub_dirs)) - - -class TestCMakeHelper(unittest.TestCase): - def test_cmake_file_gen(self): - cfgfile = _get_test_data_file('data', 'test.op.config') - cfile = '_selectedoplist.cmake' - cmake_helper.gen_cmake_oplist(cfgfile, cfile) - found = False - with open(cfile, 'r') as f: - for _ln in f: - if _ln.strip() == "set(OCOS_ENABLE_GPT2_TOKENIZER ON CACHE INTERNAL \"\")": - found = True - break - - os.remove(cfile) - self.assertTrue(found) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cv2.py b/test/test_cv2.py index 87e59d03a..b2b89b204 100644 --- a/test/test_cv2.py +++ b/test/test_cv2.py @@ -18,7 +18,7 @@ def 
test_image_reader(self): try: rdr = OrtPyFunction.from_customop("ImageReader") img_nhwc = rdr([img_file]) - except ONNXRuntimeError as e: + except ONNXRuntimeError: pass if img_nhwc is not None: @@ -59,9 +59,9 @@ def test_image_decoder(self): expected = np.asarray(expected, dtype=np.uint8).copy() # Convert the image to BGR format since cv2 is default BGR format. - red = expected[:,:,0].copy() - expected[:,:,0] = expected[:,:,2].copy() - expected[:,:,2] = red + red = expected[:, :, 0].copy() + expected[:, :, 0] = expected[:, :, 2].copy() + expected[:, :, 2] = red self.assertEqual(actual.shape[0], expected.shape[0]) self.assertEqual(actual.shape[1], expected.shape[1]) diff --git a/test/test_gpt2tok.py b/test/test_gpt2tok.py index ad32762c1..6850b5b3c 100644 --- a/test/test_gpt2tok.py +++ b/test/test_gpt2tok.py @@ -90,10 +90,11 @@ def tearDown(self) -> None: return super().tearDown() def _run_tokenizer(self, test_sentence, padding_length=-1): - model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=padding_length, attention_mask=True) + model = _create_test_model(vocab_file=self.tokjson, + merges_file=self.merges, max_length=padding_length, attention_mask=True) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_text = np.array(test_sentence) input_ids, attention_mask = sess.run(None, {'string_input': input_text}) expect_input_ids, expect_attention_mask = self.tokenizer.tokenizer_sentence(test_sentence, padding_length) @@ -118,10 +119,11 @@ def test_optional_outputs(self): enable_py_op(False) # Test for model without attention mask (input id output is always required) - model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=-1, attention_mask=False) + model = _create_test_model(vocab_file=self.tokjson, + merges_file=self.merges, max_length=-1, attention_mask=False) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_text = np.array(["Hello World"]) outputs = sess.run(None, {'string_input': input_text}) @@ -133,7 +135,6 @@ def test_optional_outputs(self): expect_input_ids = gpt2_out[0] np.testing.assert_array_equal(expect_input_ids, outputs[0]) - def test_tokenizer_pyop(self): self._run_tokenizer(["I can feel the magic, can you?"]) self._run_tokenizer(["Hey Cortana"]) diff --git a/test/test_math_ops.py b/test/test_math_ops.py index 55329b996..f5132e07d 100644 --- a/test/test_math_ops.py +++ b/test/test_math_ops.py @@ -52,7 +52,7 @@ def test_segment_sum_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_segment_sum("") self.assertIn('op_type: "SegmentSum"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) data = np.array([[1, 2, 3, 4], [4, 3, 2, 1], [5, 6, 7, 8]], dtype=np.float32) segment_ids = np.array([0, 0, 1], dtype=np.int64) exp = np.array([[5, 5, 5, 5], [5, 6, 7, 8]], dtype=np.float32) @@ -65,7 +65,7 @@ def test_segment_sum_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_segment_sum("Py") self.assertIn('op_type: 
"PySegmentSum"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) data = np.array([[1, 2, 3, 4], [4, 3, 2, 1], [5, 6, 7, 8]], dtype=np.float32) segment_ids = np.array([0, 0, 1], dtype=np.int64) exp = np.array([[5, 5, 5, 5], [5, 6, 7, 8]], dtype=np.float32) diff --git a/test/test_pyops.py b/test/test_pyops.py index 00dd2d73c..16f463ce7 100644 --- a/test/test_pyops.py +++ b/test/test_pyops.py @@ -154,7 +154,7 @@ def test_python_operator(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model() self.assertIn('op_type: "PyReverseMatrix"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array( [1, 2, 3, 4, 5, 6]).astype(np.float32).reshape([3, 2]) txout = sess.run(None, {'input_1': input_1}) @@ -165,7 +165,7 @@ def test_add_epsilon_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_double('Py') self.assertIn('op_type: "PyAddEpsilon"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([[0., 1., 1.5], [7., 8., -5.5]]) txout = sess.run(None, {'input_1': input_1}) diff = txout[0] - input_1 - 1e-3 @@ -176,7 +176,7 @@ def test_python_negpos(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_2outputs('Py') self.assertIn('op_type: "PyNegPos"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) x = np.array([[0., 1., 1.5], [7., 8., -5.5]]).astype(np.float32) neg, pos = sess.run(None, {'x': x}) diff = x - (neg + pos) @@ -187,7 +187,7 @@ def test_cc_negpos(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_2outputs("") self.assertIn('op_type: "NegPos"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) x = np.array([[0., 1., 1.5], [7., 8., -5.5]]).astype(np.float32) neg, pos = sess.run(None, {'x': x}) diff = x - (neg + pos) @@ -210,7 +210,7 @@ def test_cc_operator(self): onnx_content = _create_test_model_test() self.assertIn('op_type: "CustomOpOne"', str(onnx_content)) ser = onnx_content.SerializeToString() - sess0 = _ort.InferenceSession(ser, so) + sess0 = _ort.InferenceSession(ser, so, providers=['CPUExecutionProvider']) res = sess0.run(None, { 'input_1': np.random.rand(3, 5).astype(np.float32), 'input_2': np.random.rand(3, 5).astype(np.float32)}) @@ -221,7 +221,7 @@ def test_python_join(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_join() self.assertIn('op_type: "PyOpJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) arr = np.array([["a", "b"]], dtype=object) txout = sess.run(None, {'input_1': arr}) exp = np.array(["a;b"], dtype=object) diff --git a/test/test_robertatok.py b/test/test_robertatok.py index 49b320aaf..89a0bb206 100644 --- 
a/test/test_robertatok.py +++ b/test/test_robertatok.py @@ -73,10 +73,11 @@ def setUpClass(cls): cls.tokenizer_cvt = HFTokenizerConverter(cls.slow_tokenizer) def _run_tokenizer(self, test_sentence, padding_length=-1): - model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=padding_length, attention_mask=True, offset_map=True) + model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, + max_length=padding_length, attention_mask=True, offset_map=True) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_text = np.array(test_sentence) input_ids, attention_mask, offset_mapping = sess.run(None, {'string_input': input_text}) print("\nTest Sentence: " + str(test_sentence)) @@ -128,8 +129,8 @@ def test_optional_outputs(self): model2 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=-1, attention_mask=False, offset_map=False) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess1 = _ort.InferenceSession(model1.SerializeToString(), so) - sess2 = _ort.InferenceSession(model2.SerializeToString(), so) + sess1 = _ort.InferenceSession(model1.SerializeToString(), so, providers=['CPUExecutionProvider']) + sess2 = _ort.InferenceSession(model2.SerializeToString(), so, providers=['CPUExecutionProvider']) input_text = np.array(["Hello World"]) outputs1 = sess1.run(None, {'string_input': input_text}) outputs2 = sess2.run(None, {'string_input': input_text}) @@ -142,10 +143,9 @@ def test_optional_outputs(self): roberta_out = self.tokenizer(["Hello World"], return_offsets_mapping=True) expect_input_ids = roberta_out['input_ids'] expect_attention_mask = roberta_out['attention_mask'] - expect_offset_mapping = roberta_out['offset_mapping'] np.testing.assert_array_equal(expect_input_ids, outputs1[0]) np.testing.assert_array_equal(expect_attention_mask, outputs1[1]) - np.testing.assert_array_equal(expect_input_ids, outputs2[0]) + if __name__ == "__main__": unittest.main() diff --git a/test/test_string_concat.py b/test/test_string_concat.py index 7db62f21f..71d29b387 100644 --- a/test/test_string_concat.py +++ b/test/test_string_concat.py @@ -29,7 +29,7 @@ def _run_string_concat(input1, input2): so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) result = sess.run(None, {'input_1': input1, 'input_2': input2}) # verify diff --git a/test/test_string_ecma_regex.py b/test/test_string_ecma_regex.py index 506e28d85..03621665e 100644 --- a/test/test_string_ecma_regex.py +++ b/test/test_string_ecma_regex.py @@ -83,7 +83,7 @@ def test_string_replace_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace("") self.assertIn('op_type: "StringECMARegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):"]) rewrite = np.array([r"static PyObject* py_$1(void) {"]) text = np.array([["def myfunc():"], ["def dummy():"]]) @@ -99,7 +99,7 @@ def test_string_replace_cc_first(self): 
so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace("", global_replace=False) self.assertIn('op_type: "StringECMARegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):"]) rewrite = np.array([r"static PyObject* py_$1(void) {"]) text = np.array([["def myfunc():def myfunc():"], ["def dummy():def dummy():"]]) @@ -115,7 +115,7 @@ def test_string_replace_cc_x2(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace("") self.assertIn('op_type: "StringECMARegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):"]) rewrite = np.array([r"static PyObject* py_$1(void) {"]) text = np.array([["def myfunc():"], ["def dummy():" * 2]]) @@ -132,7 +132,7 @@ def test_string_replace_uncased(self): onnx_model = _create_test_model_string_replace( "", "ai.onnx.contrib", True, True ) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array( [ @@ -157,7 +157,7 @@ def test_string_regex_split_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_regex_split("") self.assertIn('op_type: "StringECMARegexSplitWithOffsets"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["hello there", "hello there"]) pattern = np.array(["(\\s)"]) diff --git a/test/test_string_length.py b/test/test_string_length.py index 7af5aa12b..26b2b4b89 100644 --- a/test/test_string_length.py +++ b/test/test_string_length.py @@ -26,7 +26,7 @@ def _run_string_length(input): so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) result = sess.run(None, {'input_1': input}) # verify diff --git a/test/test_string_ops.py b/test/test_string_ops.py index c6de5ddd2..3064a99c5 100644 --- a/test/test_string_ops.py +++ b/test/test_string_ops.py @@ -441,7 +441,7 @@ def test_string_upper_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_upper('') self.assertIn('op_type: "StringUpper"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abc"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), np.array([["ABC"]]).tolist()) @@ -451,7 +451,7 @@ def test_string_lower_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_lower('') self.assertIn('op_type: "StringLower"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = 
np.array([["Abc"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), np.array([["abc"]]).tolist()) @@ -461,7 +461,7 @@ def test_string_upper_cc_accent(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_upper('') self.assertIn('op_type: "StringUpper"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["R"], ["Abcé"], ["ABC"], ["A"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual( @@ -473,7 +473,7 @@ def test_string_lower_cc_accent(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_lower('') self.assertIn('op_type: "StringLower"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["R"], ["Abce"], ["ABC"], ["A"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual( @@ -497,7 +497,7 @@ def test_string_upper_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_upper('Py') self.assertIn('op_type: "PyStringUpper"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abc"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), np.array([["ABC"]]).tolist()) @@ -507,7 +507,7 @@ def test_string_lower_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_lower('Py') self.assertIn('op_type: "PyStringLower"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abc"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), np.array([["abc"]]).tolist()) @@ -517,7 +517,7 @@ def test_string_upper_python_accent(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_upper('Py') self.assertIn('op_type: "PyStringUpper"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abcé"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), @@ -528,7 +528,7 @@ def test_string_lower_python_accent(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_lower('Py') self.assertIn('op_type: "PyStringLower"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abcé"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), @@ -539,7 +539,7 @@ def test_string_join_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('Py') self.assertIn('op_type: "PyStringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = 
_ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.vstack([np.array([["a", "b", "c"]]), np.array([["aa", "bb", ""]])]) self.assertEqual(text.shape, (2, 3)) @@ -560,7 +560,7 @@ def test_string_join_python_3d(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('Py') self.assertIn('op_type: "PyStringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.vstack([np.array([["a", "b", "c"]]), np.array([["aa", "bb", ""]])]).reshape((2, 3, 1)) sep = np.array([";"]) @@ -575,7 +575,7 @@ def test_string_join_python_1d(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('Py') self.assertIn('op_type: "PyStringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array(["a", "b", "cc"]) sep = np.array([";"]) axis = np.array([0], dtype=np.int64) @@ -589,7 +589,7 @@ def test_string_join_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.vstack([np.array([["a", "b", "c"]]), np.array([["aa", "bb", ""]])]) sep = np.array([";"]) @@ -607,7 +607,7 @@ def test_string_join_cc_1d(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array(["a", "b", "cc"]) sep = np.array([";"]) axis = np.array([0], dtype=np.int64) @@ -620,7 +620,7 @@ def test_string_join_empty(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array([""]) sep = np.array([" "]) axis = np.array([0], dtype=np.int64) @@ -633,7 +633,7 @@ def test_string_join_scalar(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array("a scalar string") sep = np.array([" "]) axis = np.array([0], dtype=np.int64) @@ -646,7 +646,7 @@ def test_string_join_cc_3d(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array(["a", "b", "c", "d", "e", "f", "g", "h"]).reshape(( 2, 2, 2)) sep = 
np.array([";"]) @@ -671,7 +671,7 @@ def test_string_replace_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace('') self.assertIn('op_type: "StringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject* py_\1(void) {']) text = np.array([['def myfunc():'], ['def dummy():']]) @@ -687,7 +687,7 @@ def test_string_replace_cc_first(self): onnx_model = _create_test_model_string_replace( '', global_replace=False) self.assertIn('op_type: "StringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject* py_\1(void) {']) text = np.array([['def myfunc():def myfunc():'], @@ -703,7 +703,7 @@ def test_string_replace_cc_x2(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace('') self.assertIn('op_type: "StringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject* py_\1(void) {']) text = np.array([['def myfunc():'], ['def dummy():' * 2]]) @@ -718,7 +718,7 @@ def test_string_replace_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace('Py') self.assertIn('op_type: "PyStringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject*\npy_\1(void)\n{']) text = np.array([['def myfunc():'], ['def dummy():']]) @@ -733,7 +733,7 @@ def test_string_replace_python_x2(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace('Py') self.assertIn('op_type: "PyStringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject*\npy_\1(void)\n{']) text = np.array([['def myfunc():'], ['def dummy():' * 2]]) @@ -748,7 +748,7 @@ def test_string_to_crc32_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_to_hash('Py', kind='crc32') self.assertIn('op_type: "PyStringToCRC32"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array([["abc", "abcdé"], ["$$^l!%*ù", ""]]) num_buckets = np.array([44], dtype=np.uint32) res = self._string_to_crc32(text, num_buckets) @@ -765,7 +765,7 @@ def test_string_to_hash_bucket_cc(self): onnx_model = _create_test_model_string_to_hash( '', kind='hash_bucket') 
self.assertIn('op_type: "StringToHashBucket"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) raw = ["abc", "abcdé", "$$^l!%*ù", "", "a", "A"] text = np.array(raw).reshape((3, 2)) num_buckets = np.array([NUM_BUCKETS], dtype=np.int64) @@ -791,7 +791,7 @@ def test_string_to_hash_bucket_fast_cc(self): onnx_model = _create_test_model_string_to_hash( '', kind='hash_bucket_fast') self.assertIn('op_type: "StringToHashBucketFast"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) raw = ["abc", "abcdé", "$$^l!%*ù", "", "a", "A"] text = np.array(raw).reshape((3, 2)) num_buckets = np.array([NUM_BUCKETS], dtype=np.int64) @@ -817,7 +817,7 @@ def test_string_to_hash_bucket_python(self): onnx_model = _create_test_model_string_to_hash( 'Py', kind='hash_bucket') self.assertIn('op_type: "PyStringToHashBucket"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) raw = ["abc", "abcdé", "$$^l!%*ù", "", "a", "A"] text = np.array(raw).reshape((3, 2)) num_buckets = np.array([NUM_BUCKETS], dtype=np.int64) @@ -850,7 +850,7 @@ def test_string_equal_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_equal('Py') self.assertIn('op_type: "PyStringEqual"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) for x, y in self.enumerate_matrix_couples(): txout = sess.run(None, {'x': x, 'y': y}) @@ -863,7 +863,7 @@ def test_string_equal_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_equal('') self.assertIn('op_type: "StringEqual"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) for x, y in self.enumerate_matrix_couples(): txout = sess.run(None, {'x': x, 'y': y}) @@ -876,7 +876,7 @@ def test_string_split_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_split('Py') self.assertIn('op_type: "PyStringSplit"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["a,,b", "", "aa,b,c", "dddddd"]) delimiter = np.array([","]) @@ -908,7 +908,7 @@ def test_string_split_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_split('') self.assertIn('op_type: "StringSplit"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["a,,b", "", "aa,b,c", "dddddd"]) delimiter = np.array([","]) @@ -956,7 +956,7 @@ def test_string_split_cc_sep2(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_split('') self.assertIn('op_type: "StringSplit"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), 
so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["a*b", "a,*b", "aa,b,,c", 'z', "dddddd,", "**"]) delimiter = np.array([",*"]) @@ -1009,7 +1009,7 @@ def test_string_split_cc_sep0(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_split('') self.assertIn('op_type: "StringSplit"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["a*b", "a,*b"]) delimiter = np.array([""]) @@ -1051,7 +1051,7 @@ def test_string_regex_split_cc(self): onnx_model = _create_test_model_string_regex_split('') self.assertIn('op_type: "StringRegexSplitWithOffsets"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["hello there", "hello there"]) pattern = np.array(["(\\s)"]) @@ -1114,7 +1114,7 @@ def test_string_wordpiece_tokenizer_cc(self): so.register_custom_ops_library(_get_library_path()) cc_onnx_model = _create_test_model_wordpiece('') self.assertIn('op_type: "WordpieceTokenizer"', str(cc_onnx_model)) - cc_sess = _ort.InferenceSession(cc_onnx_model.SerializeToString(), so) + cc_sess = _ort.InferenceSession(cc_onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) inputs = dict(text=np.array(["unwanted running", "unwantedX running"], dtype=object)) @@ -1149,7 +1149,6 @@ def _CreateTable(vocab, num_oov=1): value_dtype=tf.int64) res = tf.lookup.StaticVocabularyTable(init, num_oov, lookup_key_dtype=tf.string) res.__len__ = lambda self: len(vocab) - vocab_table = _CreateTable(["want", "##want", "##ed", "wa", "un", "runn", "##ing"]) diff --git a/test/test_tools_customop_template.py b/test/test_tools_customop_template.py index a650401e5..30d54cbfe 100644 --- a/test/test_tools_customop_template.py +++ b/test/test_tools_customop_template.py @@ -15,7 +15,8 @@ test_data_dir = os.path.join(ort_ext_root, "test", "data") sys.path.append(tools_dir) -import gen_customop_template +import gen_customop_template # noqa: E402 + # create generic custom op models with some basic math ops for testing purposes def _create_test_model_1(): @@ -34,6 +35,7 @@ def _create_test_model_1(): model = make_onnx_model(graph) return model + def _create_test_model_2(prefix=""): nodes = [ helper.make_node("Identity", ["data"], ["id1"]), @@ -51,8 +53,19 @@ def _create_test_model_2(prefix=""): model = make_onnx_model(graph) return model + class TestCustomOpTemplate(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + # remove generated files + template_output_path = os.path.join(test_data_dir, "generated") + if os.path.exists(template_output_path): + for file in os.listdir(template_output_path): + os.remove(os.path.join(template_output_path, file)) + os.rmdir(template_output_path) + return super().tearDownClass() + # check input and output type count of models extracted by template generator def check_io_count(self, model_name, output_path, expected_input_count, expected_output_count): model_path = os.path.join(test_data_dir, "generated", model_name) @@ -63,14 +76,19 @@ def check_io_count(self, model_name, output_path, expected_input_count, expected def test_template(self): template_output_path = os.path.join(test_data_dir, "generated") os.mkdir(template_output_path) - + 
onnx.save(_create_test_model_1(), os.path.join(template_output_path, "test_model_1.onnx")) test1_template_output_path = os.path.join(template_output_path, "custom_op_template_test1.hpp") - self.check_io_count(model_name = "test_model_1.onnx", output_path = test1_template_output_path, expected_input_count = 1, expected_output_count = 1) - + self.check_io_count(model_name="test_model_1.onnx", + output_path=test1_template_output_path, + expected_input_count=1, expected_output_count=1) + onnx.save(_create_test_model_2(), os.path.join(template_output_path, "test_model_2.onnx")) test2_template_output_path = os.path.join(template_output_path, "custom_op_template_test2.hpp") - self.check_io_count(model_name = "test_model_2.onnx", output_path = test2_template_output_path, expected_input_count = 2, expected_output_count = 1) + self.check_io_count(model_name="test_model_2.onnx", + output_path=test2_template_output_path, + expected_input_count=2, expected_output_count=1) + if __name__ == "__main__": unittest.main() From 30eb7afcfa1202ff181f6ba2df407f2a1be12039 Mon Sep 17 00:00:00 2001 From: "Aidan Ryan (MSFT)" <109703696+aidanryan-msft@users.noreply.github.com> Date: Tue, 30 May 2023 16:52:59 -0400 Subject: [PATCH 17/17] Add string strip text operator (#460) * add string strip text operator --------- Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com> --- operators/text/string_strip.cc | 54 +++++++++++++++++++++++++++++++++ operators/text/string_strip.hpp | 20 ++++++++++++ operators/text/text.cc | 14 ++++----- test/test_string_ops.py | 37 +++++++++++++++++++++- 4 files changed, 117 insertions(+), 8 deletions(-) create mode 100644 operators/text/string_strip.cc create mode 100644 operators/text/string_strip.hpp diff --git a/operators/text/string_strip.cc b/operators/text/string_strip.cc new file mode 100644 index 000000000..a55c54d20 --- /dev/null +++ b/operators/text/string_strip.cc @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "string_strip.hpp" +#include "string_tensor.h" +#include <vector> +#include <string> +#include <algorithm> + +const char* WHITE_SPACE_CHARS = " \t\n\r\f\v"; + +KernelStringStrip::KernelStringStrip(const OrtApi& api, const OrtKernelInfo& info) : BaseKernel(api, info) { +} + +void KernelStringStrip::Compute(OrtKernelContext* context) { + // Setup inputs + const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0); + std::vector<std::string> X; + GetTensorMutableDataString(api_, ort_, context, input_X, X); + + // For each string in input, replace with whitespace-trimmed version.
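+  // find_first_not_of/find_last_not_of below bracket the span of non-whitespace characters; when a string is empty or all whitespace, find_first_not_of returns npos and the string is left as-is.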
+ for (size_t i = 0; i < X.size(); ++i) { + size_t nonWhitespaceBegin = X[i].find_first_not_of(WHITE_SPACE_CHARS); + if (nonWhitespaceBegin != std::string::npos) { + size_t nonWhitespaceEnd = X[i].find_last_not_of(WHITE_SPACE_CHARS); + size_t nonWhitespaceRange = nonWhitespaceEnd - nonWhitespaceBegin + 1; + + X[i] = X[i].substr(nonWhitespaceBegin, nonWhitespaceRange); + } + } + + // Fills the output + OrtTensorDimensions dimensions(ort_, input_X); + OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dimensions.data(), dimensions.size()); + FillTensorDataString(api_, ort_, context, X, output); +} + +const char* CustomOpStringStrip::GetName() const { return "StringStrip"; }; + +size_t CustomOpStringStrip::GetInputTypeCount() const { + return 1; +}; + +ONNXTensorElementDataType CustomOpStringStrip::GetInputType(size_t /*index*/) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING; +}; + +size_t CustomOpStringStrip::GetOutputTypeCount() const { + return 1; +}; + +ONNXTensorElementDataType CustomOpStringStrip::GetOutputType(size_t /*index*/) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING; +}; diff --git a/operators/text/string_strip.hpp b/operators/text/string_strip.hpp new file mode 100644 index 000000000..a8c181c9c --- /dev/null +++ b/operators/text/string_strip.hpp @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "ocos.h" +#include "string_utils.h" + +struct KernelStringStrip : BaseKernel { + KernelStringStrip(const OrtApi& api, const OrtKernelInfo& info); + void Compute(OrtKernelContext* context); +}; + +struct CustomOpStringStrip : OrtW::CustomOpBase<CustomOpStringStrip, KernelStringStrip> { + const char* GetName() const; + size_t GetInputTypeCount() const; + ONNXTensorElementDataType GetInputType(size_t index) const; + size_t GetOutputTypeCount() const; + ONNXTensorElementDataType GetOutputType(size_t index) const; +}; diff --git a/operators/text/text.cc b/operators/text/text.cc index d97c1fd67..5c1ceab64 100644 --- a/operators/text/text.cc +++ b/operators/text/text.cc @@ -4,6 +4,7 @@ #include "text/string_join.hpp" #include "text/string_lower.hpp" #include "text/string_split.hpp" +#include "text/string_strip.hpp" #include "text/string_to_vector.hpp" #include "text/string_upper.hpp" #include "text/vector_to_string.hpp" @@ -17,15 +18,14 @@ #if defined(ENABLE_RE2_REGEX) #include "text/re2_strings/string_regex_replace.hpp" #include "text/re2_strings/string_regex_split.hpp" -#endif // ENABLE_RE2_REGEX +#endif // ENABLE_RE2_REGEX - -FxLoadCustomOpFactory LoadCustomOpClasses_Text = - LoadCustomOpClasses; + CustomOpStringECMARegexSplitWithOffsets>; diff --git a/test/test_string_ops.py b/test/test_string_ops.py index 3064a99c5..c49d2a297 100644 --- a/test/test_string_ops.py +++ b/test/test_string_ops.py @@ -173,6 +173,22 @@ def _create_test_model_string_equal(prefix, domain='ai.onnx.contrib'): return model + +def _create_test_model_string_strip(prefix, domain='ai.onnx.contrib'): + nodes = [] + nodes[0:] = [helper.make_node('Identity', ['input_1'], ['identity1'])] + nodes[1:] = [helper.make_node('%sStringStrip' % prefix, + ['identity1'], ['customout'], + domain=domain)] + + input0 = helper.make_tensor_value_info( + 'input_1', onnx_proto.TensorProto.STRING, [None, None]) + output0 = helper.make_tensor_value_info( + 'customout', onnx_proto.TensorProto.STRING, [None, None]) + + graph = helper.make_graph(nodes, 'test0', [input0], [output0]) + model = make_onnx_model(graph) + return model + def _create_test_model_string_split(prefix, 
domain='ai.onnx.contrib'): nodes = [] nodes.append(helper.make_node('Identity', ['input'], ['id1'])) @@ -436,6 +452,26 @@ def test_check_types(self): for t in type_list: self.assertIn(t, def_list) + def test_string_strip_cc(self): + so = _ort.SessionOptions() + so.register_custom_ops_library(_get_library_path()) + onnx_model = _create_test_model_string_strip('') + self.assertIn('op_type: "StringStrip"', str(onnx_model)) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) + input_1 = np.array([[" a b c "]]) + txout = sess.run(None, {'input_1': input_1}) + self.assertEqual(txout[0].tolist(), np.array([["a b c"]]).tolist()) + + def test_string_strip_cc_empty(self): + so = _ort.SessionOptions() + so.register_custom_ops_library(_get_library_path()) + onnx_model = _create_test_model_string_strip('') + self.assertIn('op_type: "StringStrip"', str(onnx_model)) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) + input_1 = np.array([[""]]) + txout = sess.run(None, {'input_1': input_1}) + self.assertEqual(txout[0].tolist(), np.array([[""]]).tolist()) + def test_string_upper_cc(self): so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) @@ -1151,7 +1187,6 @@ def _CreateTable(vocab, num_oov=1): res.__len__ = lambda self: len(vocab) vocab_table = _CreateTable(["want", "##want", "##ed", "wa", "un", "runn", "##ing"]) - text = tf.convert_to_tensor(["unwanted running", "unwantedX running"], dtype=tf.string) try: tf_tokens, tf_rows, tf_begins, tf_ends = (