From 43994eb34a1b0cd0df6c7626e4ae771a0f618c00 Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Mon, 8 May 2023 11:37:54 -0700 Subject: [PATCH 01/17] Fix the unit test failure with ONNX 1.14 package. (#428) * Fix the unit test failure with ONNX 1.14 package. * more tests * Update whisper_e2e.py --- onnxruntime_extensions/cmd.py | 21 +++++++- onnxruntime_extensions/pnp/_base.py | 5 ++ onnxruntime_extensions/pnp/_onnx_ops.py | 2 + onnxruntime_extensions/pnp/_utils.py | 28 +++++++---- onnxruntime_extensions/util.py | 64 +++++++++++++++++++++++++ tutorials/whisper_e2e.py | 6 +-- 6 files changed, 113 insertions(+), 13 deletions(-) diff --git a/onnxruntime_extensions/cmd.py b/onnxruntime_extensions/cmd.py index 68c0d4ba3..d7f92f642 100644 --- a/onnxruntime_extensions/cmd.py +++ b/onnxruntime_extensions/cmd.py @@ -1,5 +1,5 @@ import os -import fire +import argparse import onnx import numpy @@ -36,5 +36,22 @@ def selfcheck(self, *args): print("The extensions loaded, status: OK.") +def main(): + parser = argparse.ArgumentParser(description="ORT Extension commands") + parser.add_argument("command", choices=["run", "selfcheck"]) + parser.add_argument("--model", default="model.onnx", help="Path to the ONNX model file") + parser.add_argument("--testdata-dir", help="Path to the test data directory") + parser.add_argument("args", nargs=argparse.REMAINDER, help="Additional arguments") + + args = parser.parse_args() + + ort_commands = ORTExtCommands(model=args.model, testdata_dir=args.testdata_dir) + + if args.command == "run": + ort_commands.run(*args.args) + elif args.command == "selfcheck": + ort_commands.selfcheck(*args.args) + + if __name__ == '__main__': - fire.Fire(ORTExtCommands) + main() diff --git a/onnxruntime_extensions/pnp/_base.py b/onnxruntime_extensions/pnp/_base.py index 00f986c8f..b70db9078 100644 --- a/onnxruntime_extensions/pnp/_base.py +++ b/onnxruntime_extensions/pnp/_base.py @@ -5,6 +5,8 @@ from onnx.onnx_pb import TensorProto from torch.onnx import TrainingMode, export as _export +from ._onnx_ops import OPSET_TO_IR_VERSION + def _export_f(model, *args, opset_version=None, @@ -32,6 +34,9 @@ def _export_f(model, *args, custom_opsets=custom_opsets) mdl = onnx.load_model(io.BytesIO(f.getvalue())) + for ops in mdl.opset_import: + if ops.domain in ('', 'ai.onnx'): + mdl.ir_version = OPSET_TO_IR_VERSION[ops.version] if output_path is not None: if output_seq > 0: output_path.replace('.onnx', '.{}.onnx'.format(output_seq)) diff --git a/onnxruntime_extensions/pnp/_onnx_ops.py b/onnxruntime_extensions/pnp/_onnx_ops.py index a10a54b71..5dfbaa59e 100644 --- a/onnxruntime_extensions/pnp/_onnx_ops.py +++ b/onnxruntime_extensions/pnp/_onnx_ops.py @@ -15,6 +15,8 @@ 7: 3, 8: 3, 9: 4, 10: 5, 11: 6, 12: 7, 13: 7, 14: 7, 15: 8, 16: 8, 17: 8 } +if hasattr(helper, 'VERSION_TABLE'): + OPSET_TO_IR_VERSION = {row[2]: row[1] for row in helper.VERSION_TABLE} def _get_main_opset_version(model): diff --git a/onnxruntime_extensions/pnp/_utils.py b/onnxruntime_extensions/pnp/_utils.py index 31e2f1400..6be09b643 100644 --- a/onnxruntime_extensions/pnp/_utils.py +++ b/onnxruntime_extensions/pnp/_utils.py @@ -1,6 +1,6 @@ import copy import onnx -from onnx import numpy_helper +from onnx import helper, numpy_helper from collections import namedtuple @@ -271,20 +271,32 @@ def join_models(cls, *models, io_mapping=None): del _n.input[:] _n.input.extend([port_mapping[_i] if _i in port_mapping else _i for _i in new_input]) - name = '' + name = "_".join([_mdl.graph.name for _mdl in 
models]) domains = set() _opset = [] for _mdl in models: for _ops in _mdl.opset_import: - if _ops.domain not in domains: - domains.update([_ops.domain]) - _opset.append(_ops) - name = name + '_' + _mdl.graph.name if name else _mdl.graph.name + domain = _ops.domain if _ops.domain else "ai.onnx" + if domain in domains: + if domain == "ai.onnx": + assert _ops.version == _opset[0].version, \ + f"ai.onnx domain version doesn't match {_ops.version} != {_opset[0].version}" + else: + domains.add(domain) + if domain == "ai.onnx": + _opset.insert(0, _ops) + else: + _opset.append(_ops) inits = cls._remove_unused_initializers(nodes, container.initializer) - helper = onnx.helper g = helper.make_graph(nodes, name, inputs, outputs, initializer=inits, value_info=container.value_info) - m = helper.make_model(g, opset_imports=_opset) + + if hasattr(helper, 'make_model_gen_version'): + # make_model_gen_version doesn't accept the custom domain. + m = helper.make_model_gen_version(g, opset_imports=_opset[:1]) + m.opset_import.extend(_opset[1:]) + else: + m = helper.make_model(g, opset_imports=_opset) return m diff --git a/onnxruntime_extensions/util.py b/onnxruntime_extensions/util.py index 00fb49c21..b62bb320f 100644 --- a/onnxruntime_extensions/util.py +++ b/onnxruntime_extensions/util.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +import onnx import pathlib import inspect @@ -57,3 +58,66 @@ def mel_filterbank( energy_norm = 2.0 / (mel_bins[2 : n_mels + 2] - mel_bins[:n_mels]) fbank *= energy_norm[:, np.newaxis] return fbank + + +def remove_unused_constants(subgraph): + nodes = [_n for _n in subgraph.node] + + # Find the names of all input tensors for all nodes in the subgraph + input_tensors = set() + for node in nodes: + for input_name in node.input: + input_tensors.add(input_name) + + # Remove Constant nodes whose output is not used by any other nodes + nodes_to_remove = [] + for node in nodes: + if node.op_type == 'Constant': + output_name = node.output[0] + if output_name not in input_tensors: + nodes_to_remove.append(node) + + for node in nodes_to_remove: + subgraph.node.remove(node) + + # Recursively process subgraphs within this subgraph + for node in nodes: + for attr in node.attribute: + if attr.type == onnx.AttributeProto.GRAPH: + remove_unused_constants(attr.g) + elif attr.type == onnx.AttributeProto.GRAPHS: + for subgraph in attr.graphs: + remove_unused_constants(subgraph) + + +def remove_unused_initializers(subgraph, top_level_initializers=None): + if top_level_initializers is None: + top_level_initializers = [] + remove_unused_constants(subgraph) + initializers = [_i for _i in subgraph.initializer] + nodes = subgraph.node + + # Find the names of all input tensors for all nodes in the subgraph + input_tensors = set() + for node in nodes: + for input_name in node.input: + input_tensors.add(input_name) + + # Combine top-level and current subgraph initializers + all_initializers = initializers + top_level_initializers + + # Filter the initializers by checking if their names are in the list of used input tensors + used_initializers = [init for init in all_initializers if init.name in input_tensors] + + # Update the subgraph's initializers + del subgraph.initializer[:] + subgraph.initializer.extend([init for init in used_initializers if init in initializers]) + + # Recursively process subgraphs within this subgraph + for node in nodes: + for attr in node.attribute: + if attr.type == onnx.AttributeProto.GRAPH: + remove_unused_initializers(attr.g, 
top_level_initializers) + elif attr.type == onnx.AttributeProto.GRAPHS: + for subgraph in attr.graphs: + remove_unused_initializers(subgraph, top_level_initializers) diff --git a/tutorials/whisper_e2e.py b/tutorials/whisper_e2e.py index 5852d00de..1faa53016 100644 --- a/tutorials/whisper_e2e.py +++ b/tutorials/whisper_e2e.py @@ -166,6 +166,7 @@ def preprocessing(audio_data): onnx.save_model(pre_model, os.path.join(root_dir, prep_model_name)) if USE_ONNX_STFT: pre_model = _to_onnx_stft(pre_model) + util.remove_unused_initializers(pre_model.graph) pre_f = PyOrtFunction.from_model(pre_model, cpu_only=True) if not USE_AUDIO_DECODER: @@ -255,9 +256,8 @@ def postprocessing(token_ids, hf_processor): # model = WhisperForConditionalGeneration.from_pretrained(model_name) # The onnx model can be generated by the following command: - # python \onnxruntime\python\tools\transformers\models\whisper\convert_to_onnx.py - # -m "openai/whisper-base.en" -e - # !only be valid after onnxruntime 1.15 or main branch of 04/04/2023 + # python -m onnxruntime.transformers.models.whisper.convert_to_onnx -m "openai/whisper-base.en" -e + # !!! only be valid after onnxruntime 1.15 or nightly build after 05/05/2023 model = PyOrtFunction.from_model(args.model, cpu_only=True) test_file = util.get_test_data_file(args.audio) From edac207dc365292de9de06dd35ccd47cfbddeed2 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 8 May 2023 13:44:19 -0700 Subject: [PATCH 02/17] Add nuget.org publish version option (#426) * Add nuget.org publish version option * typo * small fix * typo --------- Co-authored-by: Sayan Shaw --- .pipelines/nuget.yml | 6 ++++++ .pipelines/templates/build-package-for-nuget.yml | 8 +++++++- tools/ci_build/update_nuspec_for_native_nuget.py | 7 +++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.pipelines/nuget.yml b/.pipelines/nuget.yml index 73781147b..e0b8edd98 100644 --- a/.pipelines/nuget.yml +++ b/.pipelines/nuget.yml @@ -14,6 +14,11 @@ parameters: type: boolean default: false +- name: IsForNugetPublish + displayName: Is this for publishing to nuget.org? If so, set to true, and update version info. + type: boolean + default: false + trigger: branches: exclude: @@ -30,4 +35,5 @@ stages: DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + IsForNugetPublish: ${{ parameters.IsForNugetPublish }} OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Extensions' diff --git a/.pipelines/templates/build-package-for-nuget.yml b/.pipelines/templates/build-package-for-nuget.yml index 769117b1a..dfd792484 100644 --- a/.pipelines/templates/build-package-for-nuget.yml +++ b/.pipelines/templates/build-package-for-nuget.yml @@ -14,6 +14,11 @@ parameters: type: boolean default: false +- name: IsForNugetPublish + displayName: Is for publish to nuget.org? 
+ type: boolean + default: false + - name: OrtNugetPackageId displayName: Package name for nuget type: string @@ -219,7 +224,8 @@ stages: python $(Build.SourcesDirectory)\tools\ci_build\update_nuspec_for_native_nuget.py ` --package_version $OrtExtVersion ` --commit_id $(Build.SourceVersion) ` - --is_release_build ${{ parameters.IsReleaseBuild }} + --is_release_build ${{ parameters.IsReleaseBuild }} ` + --is_for_nuget_publish ${{ parameters.IsForNugetPublish }} cat $(Build.SourcesDirectory)\nuget\NativeNuget.nuspec workingDirectory: '$(Build.SourcesDirectory)' diff --git a/tools/ci_build/update_nuspec_for_native_nuget.py b/tools/ci_build/update_nuspec_for_native_nuget.py index 04b666946..59547b189 100644 --- a/tools/ci_build/update_nuspec_for_native_nuget.py +++ b/tools/ci_build/update_nuspec_for_native_nuget.py @@ -21,6 +21,11 @@ def update_nuspec(args): if package_item.tag == "version" and args.package_version: if args.is_release_build: package_item.text = args.package_version + elif args.is_for_nuget_publish: + # Update prefix and postfix below as per NuGet prelease guidelines and team discussions + prefix = "alpha" + postfix = "1" + package_item.text = f"{args.package_version}-{prefix}.{postfix}" else: import datetime now = datetime.datetime.now().strftime('%Y%m%d-%H%M') @@ -51,10 +56,12 @@ def parse_arguments(): help="Path to nuspec file to update.") parser.add_argument("--commit_id", required=True, help="The last commit id included in this package.") parser.add_argument("--is_release_build", default="False", type=str, help="If it's a release build.") + parser.add_argument("--is_for_nuget_publish", default="False", type=str, help="If it's for publishing to nuget.org.") args = parser.parse_args() args.nuspec_path = args.nuspec_path.resolve(strict=True) args.is_release_build = args.is_release_build.lower() == "true" + args.is_for_nuget_publish = args.is_for_nuget_publish.lower() == "true" print("used args:", args) return args From 03b96c822c9db2a2dddee92c5c4f30fb5c8d1143 Mon Sep 17 00:00:00 2001 From: Vishal Jain <36761320+VishalX@users.noreply.github.com> Date: Thu, 11 May 2023 14:06:14 +0530 Subject: [PATCH 03/17] Fix ReadMe : Example usage of the PrePostProcessor.md (#436) - Small typo fix in "Add post-processing steps" --- .../tools/Example usage of the PrePostProcessor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime_extensions/tools/Example usage of the PrePostProcessor.md b/onnxruntime_extensions/tools/Example usage of the PrePostProcessor.md index c885dff25..d6d461aa5 100644 --- a/onnxruntime_extensions/tools/Example usage of the PrePostProcessor.md +++ b/onnxruntime_extensions/tools/Example usage of the PrePostProcessor.md @@ -79,7 +79,7 @@ Similarly the post-processing is assembled the same way. Let's say it's simply a first model output: ``` py -pipeline.add_pre_processing( +pipeline.add_post_processing( [ Softmax() ] From 64f20828ce0291394886e277c23529cd1d11320d Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 12 May 2023 07:13:37 +1000 Subject: [PATCH 04/17] Handle ONNX 1.14 in test scripts (#435) * Calculate and specify ir_version so we use the oldest possible for maximum compatibility * Don't use `ignore_unknown` in call to `find_min_ir_version_for` as it's only supported in the most recent ONNX release. 
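
For reference, a minimal sketch of the pattern applied in this change (illustrative
only — it assumes `graph` and its `opset_imports` have already been built):

    import onnx

    # Only pass the ONNX domains to find_min_ir_version_for; custom domains would
    # need `ignore_unknown`, which older ONNX releases don't support.
    onnx_domains = [entry for entry in opset_imports
                    if entry.domain in ("", "ai.onnx")]
    ir_version = onnx.helper.find_min_ir_version_for(onnx_domains)
    model = onnx.helper.make_model(graph, opset_imports=opset_imports,
                                   ir_version=ir_version)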
--- .../pre_post_processing/pre_post_processor.py | 14 +++++++++----- test/data/ppp_vision/create_boxdrawing_model.py | 6 ++++-- .../ppp_vision/create_decode_encode_test_model.py | 4 +++- .../test_tools_add_pre_post_processing_to_model.py | 10 ++++++---- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py b/onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py index d4960422a..d175edaf8 100644 --- a/onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py +++ b/onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py @@ -57,10 +57,10 @@ def __init__(self, inputs: List[onnx.ValueInfoProto] = None, onnx_opset: int = 1 self._post_processing_joins = None # type: Union[None,List[Tuple[Union[Step, str], int, str]]] self._inputs = inputs if inputs else [] - + # preserve outputs from IOMapEntry, avoid it's consumed by the Follow-up steps. # we now can support a output value has more than one consumers with IOEntryValuePreserver. - # IOEntryValuePreserver will preserve the output value and add it to the graph output + # IOEntryValuePreserver will preserve the output value and add it to the graph output # until consumer step is done. self.outputs_preserver = [] # type: List[IOEntryValuePreserver] @@ -206,7 +206,7 @@ def connect_and_run(graph: onnx.GraphProto, processor: Step, connections: List[I io_map.append((step.output_names[step_idx], graph_input)) step_graph_outputs.remove((step.output_names[step_idx])) - # add outputs from previous IoMapEntry producers to maintain them as graph outputs + # add outputs from previous IoMapEntry producers to maintain them as graph outputs # until consumed by the final Step that requires them. step_graph_outputs += [ o.name for o in graph.output if o.name not in step_graph_outputs] @@ -253,7 +253,11 @@ def connect_and_run(graph: onnx.GraphProto, processor: Step, connections: List[I opset_imports = [onnx.helper.make_operatorsetid(domain, opset) for domain, opset in self._custom_op_checker_context.opset_imports.items()] - new_model = onnx.helper.make_model(graph, opset_imports=opset_imports) + # find_min_ir_version_for doesn't support custom domains until ONNX 1.14 so extract the ONNX opset from the + # imports and only pass that in. + ir_version = onnx.helper.find_min_ir_version_for([entry for entry in opset_imports + if entry.domain == "" or entry.domain == "ai.onnx"]) + new_model = onnx.helper.make_model(graph, opset_imports=opset_imports, ir_version=ir_version) onnx.checker.check_model(new_model) @@ -275,7 +279,7 @@ def __add_processing( Can be: A Step instance. This will be implicitly joined to the immediately previous Step if one exists. A tuple of (Step instance, list of IoMapEntry) - The IoMapEntry values are used to manually join an output from a producer Step to an input + The IoMapEntry values are used to manually join an output from a producer Step to an input of the current Step. In each IoMapEntry, if a step name is provided the producer Step will be searched for in all predecessor steps. It is valid for a post-processor step to consume output from a diff --git a/test/data/ppp_vision/create_boxdrawing_model.py b/test/data/ppp_vision/create_boxdrawing_model.py index 17f9a8ba7..c0d4f4a4b 100644 --- a/test/data/ppp_vision/create_boxdrawing_model.py +++ b/test/data/ppp_vision/create_boxdrawing_model.py @@ -14,7 +14,7 @@ def create_model(output_file: Path, **kwargs): """ Create unit test model. 
If input is bytes from a jpg we do the following - DecodeImage: jpg to BGR - - Resize: for simulate fixed input size, + - Resize: for simulate fixed input size, - LetterBox: for simulate fixed input size, copy border to fill the rest - DrawBoundingBoxes: draw bounding boxes on the image - EncodeImage: BGR to png (output format is set in the node) @@ -61,7 +61,9 @@ def create_model(output_file: Path, **kwargs): ) onnx_import = onnx.helper.make_operatorsetid('', onnx_opset) - model = onnx.helper.make_model(g, opset_imports=[onnx_import]) + ir_version = onnx.helper.find_min_ir_version_for([onnx_import]) + model = onnx.helper.make_model_gen_version(g, opset_imports=[onnx_import], ir_version=ir_version) + new_model = pipeline.run(model) new_model.doc_string = "Model for testing drawing box." new_model.graph.doc_string = "" # clear out all the messages from graph merges diff --git a/test/data/ppp_vision/create_decode_encode_test_model.py b/test/data/ppp_vision/create_decode_encode_test_model.py index 06a9375f6..5f43b2fdb 100644 --- a/test/data/ppp_vision/create_decode_encode_test_model.py +++ b/test/data/ppp_vision/create_decode_encode_test_model.py @@ -48,7 +48,9 @@ def create_model(output_file: Path): ) onnx_import = onnx.helper.make_operatorsetid('', onnx_opset) - model = onnx.helper.make_model(g, opset_imports=[onnx_import]) + ir_version = onnx.helper.find_min_ir_version_for([onnx_import]) + model = onnx.helper.make_model_gen_version(g, opset_imports=[onnx_import], ir_version=ir_version) + new_model = pipeline.run(model) new_model.doc_string = "Model for testing DecodeImage and EncodeImage." new_model.graph.doc_string = "" # clear out all the messages from graph merges diff --git a/test/test_tools_add_pre_post_processing_to_model.py b/test/test_tools_add_pre_post_processing_to_model.py index 6561ab040..bdbcbd033 100644 --- a/test/test_tools_add_pre_post_processing_to_model.py +++ b/test/test_tools_add_pre_post_processing_to_model.py @@ -496,7 +496,8 @@ def create_pipeline_and_run_for_nms(self, output_model: Path, length: int, inputs = [create_named_value("box_and_score", onnx.TensorProto.FLOAT, ["num_boxes", length])] - pipeline = pre_post_processing.PrePostProcessor(inputs) + onnx_opset = 16 + pipeline = pre_post_processing.PrePostProcessor(inputs, onnx_opset) pipeline.add_post_processing([ SplitOutBoxAndScore(num_classes=1), @@ -512,9 +513,10 @@ def create_pipeline_and_run_for_nms(self, output_model: Path, length: int, _output = Identity(_input) }} """) - input_model = onnx.helper.make_model(graph_def, producer_name="onnx-1") - input_model.opset_import.pop() - input_model.opset_import.extend([onnx.helper.make_operatorsetid("", 16)]) + + onnx_import = onnx.helper.make_operatorsetid('', onnx_opset) + ir_version = onnx.helper.find_min_ir_version_for([onnx_import]) + input_model = onnx.helper.make_model_gen_version(graph_def, opset_imports=[onnx_import], ir_version=ir_version) new_model = pipeline.run(input_model) onnx.save_model(new_model, output_model) From 598dfcbfc7ea75b5ade6ddffc8e7bd85eba50eb9 Mon Sep 17 00:00:00 2001 From: JiCheng <247153481@qq.com> Date: Mon, 15 May 2023 10:47:15 +0800 Subject: [PATCH 05/17] overflow (#439) --- operators/vision/draw_bounding_box.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operators/vision/draw_bounding_box.cc b/operators/vision/draw_bounding_box.cc index b917806b1..c6d28f044 100644 --- a/operators/vision/draw_bounding_box.cc +++ b/operators/vision/draw_bounding_box.cc @@ -205,7 +205,7 @@ void DrawBoxesForNumClasses(ImageView& 
image, const BoxArray& boxes, int64_t thi [](const std::pair& first_, const std::pair& second_) { return first_.second < second_.second; }); - for (int64_t i = static_cast(box_reverse.size() - 1); i >= 0; --i) { + for (int64_t i = static_cast(box_reverse.size()) - 1; i >= 0; --i) { auto [box_index, color_index] = box_reverse[i]; const auto box = boxes.GetBox(box_index); const auto color = KBGRColorMap[color_index]; From 56b978233d18abb57aa43657619064ad372c01f3 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 15 May 2023 10:34:07 -0700 Subject: [PATCH 06/17] Fix OneBranch Official pipeline CodeQL issue (#437) Co-authored-by: Sayan Shaw --- .pipelines/OneBranch.Official.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pipelines/OneBranch.Official.yml b/.pipelines/OneBranch.Official.yml index ef5ac0271..4fb3ff3b5 100644 --- a/.pipelines/OneBranch.Official.yml +++ b/.pipelines/OneBranch.Official.yml @@ -47,8 +47,9 @@ extends: break: true # always break the build on binskim issues in addition to TSA upload analyzeTargetGlob: '**\RelWithDebInfo\ortextensions.dll' # avoid scanning the 3rd party DLLs. codeql: - python: + compiled: enabled: true + cadence: 10 policheck: break: true # always break the build on policheck issues. You can disable it by setting to 'false' exclusionsFile: '$(REPOROOT)\.config\policheck_exclusions.xml' From 239febe4c36fb472f42da86d5d689e1454f8c1ff Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 15 May 2023 13:11:05 -0700 Subject: [PATCH 07/17] Update cgmanifest.json and ThirdPartyNotices.txt (#438) * Update cgmanifest.json and ThirdPartyNotices.txt * add gsl and dr_libs --------- Co-authored-by: Sayan Shaw --- ThirdPartyNotices.txt | 55 ++++++++++++++++++++++++++++++++++++++++--- cgmanifest.json | 44 ++++++++++++++++++++++++---------- 2 files changed, 83 insertions(+), 16 deletions(-) diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 14bda2dfb..70dbb325d 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -45,7 +45,7 @@ blingfire 0831265c1aca95ca02eca5bf1155e4251e545328 _____ -dlib v19.22 +dlib a12824d42584e292ecb3bad05c4b32c2015a7b89 Boost Software License - Version 1.0 - August 17th, 2003 @@ -72,7 +72,7 @@ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. _____ -google/re2 2020-11-01 +google/re2 2021-06-01 Copyright (c) 2009 The RE2 Authors. All rights reserved. @@ -122,7 +122,7 @@ Viatcheslav Ostapenko _____ -nlohmann/json +nlohmann/json v3.10.5 MIT License @@ -571,3 +571,52 @@ sentencepiece 0.1.96 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +_____ + +dr_libs dd762b861ecadf5ddd5fb03e9ca1db6707b54fbb + + MIT No Attribution + + Copyright 2020 David Reid + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +_____ + +gsl 3.24.0 + + Copyright (c) 2015 Microsoft Corporation. All rights reserved. + + This code is licensed under the MIT License (MIT). + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. \ No newline at end of file diff --git a/cgmanifest.json b/cgmanifest.json index 28722a186..5990790b7 100644 --- a/cgmanifest.json +++ b/cgmanifest.json @@ -5,7 +5,7 @@ "component": { "type": "git", "git": { - "commitHash": "v1.6.0", + "commitHash": "v1.14.1", "repositoryUrl": "https://github.com/microsoft/onnxruntime.git" } } @@ -14,7 +14,7 @@ "component": { "type": "git", "git": { - "commitHash": "0dab03ba7bc438d7ba3eac2b2c1eb39ed520f928", + "commitHash": "6e511679de8ab0feefc1cdac1505b2fac5548e42", "repositoryUrl": "https://github.com/protocolbuffers/protobuf.git" } } @@ -32,7 +32,7 @@ "component": { "type": "git", "git": { - "commitHash": "v19.22", + "commitHash": "a12824d42584e292ecb3bad05c4b32c2015a7b89", "repositoryUrl": "https://github.com/davisking/dlib.git" } } @@ -41,7 +41,16 @@ "component": { "type": "git", "git": { - "commitHash": "2020-11-01", + "commitHash": "dd762b861ecadf5ddd5fb03e9ca1db6707b54fbb", + "repositoryUrl": "https://github.com/mackron/dr_libs.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "2021-06-01", "repositoryUrl": "https://github.com/google/re2.git" } } @@ -59,7 +68,16 @@ "component": { "type": "git", "git": { - "commitHash": "v3.7.3", + "commitHash": "3.24.0", + "repositoryUrl": "https://github.com/microsoft/GSL.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "v3.10.5", "repositoryUrl": "https://github.com/nlohmann/json.git" } } @@ -77,8 +95,8 @@ "component": { "type": "git", "git": { - "commitHash": "9d 12-Jan-2020", - "repositoryUrl": "https://github.com/opencv/3rdparty/libjpeg" + "commitHash": "364702b1c98943e4e306e745389d3f464010f069", + "repositoryUrl": "https://github.com/opencv/opencv/tree/4.x/3rdparty/libjpeg" }, "comments": "Used by OpenCV" } @@ -87,8 +105,8 @@ "component": { "type": "git", "git": { - "commitHash": "2.4.0", - 
"repositoryUrl": "https://github.com/opencv/3rdparty/openjpeg" + "commitHash": "a2fc479c0b36d1786a9570ddb76f2ab72626994b", + "repositoryUrl": "https://github.com/opencv/opencv/tree/4.x/3rdparty/openjpeg" }, "comments": "Used by OpenCV" } @@ -97,8 +115,8 @@ "component": { "type": "git", "git": { - "commitHash": "1.6.37", - "repositoryUrl": "https://github.com/opencv/3rdparty/libpng" + "commitHash": "d9bf522b271ed026813cbe35399b5aead3c9b670", + "repositoryUrl": "https://github.com/opencv/opencv/tree/4.x/3rdparty/libpng" }, "comments": "Used by OpenCV" } @@ -107,7 +125,7 @@ "component": { "type": "git", "git": { - "commitHash": "v2.6.0", + "commitHash": "v2.6.2", "repositoryUrl": "https://github.com/pybind/pybind11.git" } } @@ -125,7 +143,7 @@ "component": { "type": "git", "git": { - "commitHash": "v1.2.11", + "commitHash": "v1.2.13", "repositoryUrl": "https://github.com/madler/zlib.git" }, "comments": "Used by OpenCV" From 2cedfa9fdf405e35fac289ed9d8108259a240b43 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 15 May 2023 14:53:49 -0700 Subject: [PATCH 08/17] Update nuget version to beta (#441) * Update nuget version to beta * small change --------- Co-authored-by: Sayan Shaw --- tools/ci_build/update_nuspec_for_native_nuget.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/update_nuspec_for_native_nuget.py b/tools/ci_build/update_nuspec_for_native_nuget.py index 59547b189..4a7c65830 100644 --- a/tools/ci_build/update_nuspec_for_native_nuget.py +++ b/tools/ci_build/update_nuspec_for_native_nuget.py @@ -22,10 +22,9 @@ def update_nuspec(args): if args.is_release_build: package_item.text = args.package_version elif args.is_for_nuget_publish: - # Update prefix and postfix below as per NuGet prelease guidelines and team discussions - prefix = "alpha" - postfix = "1" - package_item.text = f"{args.package_version}-{prefix}.{postfix}" + # Update version_suffix below if publishing to NuGet + version_suffix = "beta" # alpha/beta/rc + package_item.text = f"{args.package_version}-{version_suffix}" else: import datetime now = datetime.datetime.now().strftime('%Y%m%d-%H%M') From 4d652011a8d474f836b3bb000f75eaff9dff402b Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 17 May 2023 08:02:42 +1000 Subject: [PATCH 09/17] Minor cmake updates (#432) * Update minimum cmake version to 3.25 * Resolve issue with CMAKE_FIND_FRAMEWORK * Change to use pool with VS2022 for win32 wheel build so it has cmake 3.25 * Update ext_ortlib.cmake so it doesn't break when cross-compiling for Android on Windows by defaulting to a build even though it can't be used with Android. Need to address the unit testing gap for Android/iOS separately. 
--- .pipelines/wheels_win32.yml | 2 +- CMakeLists.txt | 11 ++++++++--- cmake/ext_ortlib.cmake | 30 ++++++++++++++++-------------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/.pipelines/wheels_win32.yml b/.pipelines/wheels_win32.yml index cc65b4666..a486ce8fb 100644 --- a/.pipelines/wheels_win32.yml +++ b/.pipelines/wheels_win32.yml @@ -1,7 +1,7 @@ jobs: - job: windows timeoutInMinutes: 120 - pool: {vmImage: 'windows-latest', name: 'Win-CPU-2021'} + pool: {name: 'onnxruntime-Win-CPU-2022'} variables: CIBW_BUILD: "cp3{7,8,9,10}-*amd64" diff --git a/CMakeLists.txt b/CMakeLists.txt index 987aeefc5..96a3ea43e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.25) project(onnxruntime_extensions LANGUAGES C CXX) # set(CMAKE_VERBOSE_MAKEFILE ON) @@ -145,10 +145,15 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) set_property(GLOBAL PROPERTY USE_FOLDERS ON) -set(CMAKE_FIND_FRAMEWORK NEVER CACHE STRING "...") +# set both regular and cache variables to NEVER. the regular variable has a default of FIRST defined by cmake, +# but due to CMP0126 that will exist in parallel to the cached variable if the CMake minimum version is >= 3.25. +# if we don't set this to NEVER (or possibly LAST) the builds of the wheel for different python versions will fail +# as it will find the system python version first and not the correct python version for the wheel. +set(CMAKE_FIND_FRAMEWORK "NEVER") +set(CMAKE_FIND_FRAMEWORK "NEVER" CACHE STRING "...") if(NOT "${CMAKE_FIND_FRAMEWORK}" STREQUAL "NEVER") - message(FATAL_ERROR "CMAKE_FIND_FRAMEWORK is not NEVER") + message(STATUS "CMAKE_FIND_FRAMEWORK is ${CMAKE_FIND_FRAMEWORK} not NEVER.") endif() # External dependencies diff --git a/cmake/ext_ortlib.cmake b/cmake/ext_ortlib.cmake index bab2c3ee4..edd0d3059 100644 --- a/cmake/ext_ortlib.cmake +++ b/cmake/ext_ortlib.cmake @@ -11,24 +11,26 @@ else() # default to 1.11.1 if not specified set(ONNXRUNTIME_VER "1.11.1" CACHE STRING "ONNX Runtime version") - if(CMAKE_HOST_APPLE) + if(APPLE) set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-osx-universal2-${ONNXRUNTIME_VER}.tgz") - elseif(CMAKE_HOST_WIN32) - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") + elseif(WIN32) + set(ONNXRUNTIME_BINARY_PLATFORM "x64") + + # override if generator platform is set + if (CMAKE_GENERATOR_PLATFORM) if (CMAKE_GENERATOR_PLATFORM STREQUAL "Win32") - set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-x86-${ONNXRUNTIME_VER}.zip") - else() - set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-x64-${ONNXRUNTIME_VER}.zip") - endif() - elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") - if (CMAKE_GENERATOR_PLATFORM STREQUAL "ARM") - set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-arm-${ONNXRUNTIME_VER}.zip") - else() - set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-arm64-${ONNXRUNTIME_VER}.zip") + set(ONNXRUNTIME_BINARY_PLATFORM "x86") + elseif (CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64EC") + set(ONNXRUNTIME_BINARY_PLATFORM "arm64") + elseif (CMAKE_GENERATOR_PLATFORM STREQUAL "ARM") + set(ONNXRUNTIME_BINARY_PLATFORM "arm") endif() - else() - message(FATAL_ERROR "Unexpected CMAKE_SYSTEM_PROCESSOR of ${CMAKE_SYSTEM_PROCESSOR}.") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + # or if building on arm64 machine + set(ONNXRUNTIME_BINARY_PLATFORM "arm64") endif() + + set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-win-${ONNXRUNTIME_BINARY_PLATFORM}-${ONNXRUNTIME_VER}.zip") else() # 
Linux or other, using Linux package to retrieve the headers set(ONNXRUNTIME_URL "v${ONNXRUNTIME_VER}/onnxruntime-linux-x64-${ONNXRUNTIME_VER}.tgz") From 15dfd7033817c0bf7db020959d20d024837d8eb3 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Thu, 18 May 2023 17:46:36 -0700 Subject: [PATCH 10/17] Add explicit 'set +x' before printing a vso[] command to avoid output getting parsed again with a trailing quote. (#443) --- .pipelines/android_packaging.yml | 2 ++ .pipelines/ci.yml | 2 ++ .pipelines/templates/build-package-for-android-aar.yml | 2 ++ .pipelines/templates/run-with-android-emulator-steps.yml | 2 ++ .pipelines/templates/run-with-ios-simulator-steps.yml | 2 ++ .pipelines/templates/set-package-version-variable-step.yml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/.pipelines/android_packaging.yml b/.pipelines/android_packaging.yml index d30ee929c..bc7b2fe2c 100644 --- a/.pipelines/android_packaging.yml +++ b/.pipelines/android_packaging.yml @@ -44,6 +44,8 @@ jobs: VERSION=$(cat ./version.txt) AAR_PATH="$(Build.BinariesDirectory)/android_aar/aar_out/$(buildConfig)/com/microsoft/onnxruntime/onnxruntime-extensions-android/${VERSION}/onnxruntime-extensions-android-${VERSION}.aar" + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_AAR_PATH]${AAR_PATH}" displayName: Build onnxruntime-extensions AAR package diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml index 912fb3390..b3f435ec1 100644 --- a/.pipelines/ci.yml +++ b/.pipelines/ci.yml @@ -385,6 +385,8 @@ jobs: VERSION=$(cat ./version.txt) AAR_PATH="$(Build.BinariesDirectory)/android_aar/aar_out/com/microsoft/onnxruntime/onnxruntime-extensions-android/${VERSION}/onnxruntime-extensions-android-${VERSION}.aar" + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_AAR_PATH]${AAR_PATH}" displayName: Build onnxruntime-extensions AAR package diff --git a/.pipelines/templates/build-package-for-android-aar.yml b/.pipelines/templates/build-package-for-android-aar.yml index d026911a8..87aeaf254 100644 --- a/.pipelines/templates/build-package-for-android-aar.yml +++ b/.pipelines/templates/build-package-for-android-aar.yml @@ -87,6 +87,8 @@ stages: VERSION=$(cat ./version.txt) AAR_PATH="$(Build.BinariesDirectory)/android_aar/aar_out/$(buildConfig)/com/microsoft/onnxruntime/onnxruntime-extensions-android/${VERSION}/onnxruntime-extensions-android-${VERSION}.aar" + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_AAR_PATH]${AAR_PATH}" workingDirectory: '$(Build.SourcesDirectory)' - bash: | diff --git a/.pipelines/templates/run-with-android-emulator-steps.yml b/.pipelines/templates/run-with-android-emulator-steps.yml index c955e22ac..c58737b74 100644 --- a/.pipelines/templates/run-with-android-emulator-steps.yml +++ b/.pipelines/templates/run-with-android-emulator-steps.yml @@ -14,6 +14,8 @@ steps: --start --emulator-extra-args="-partition-size 4096" \ --emulator-pid-file "${ORT_EXTENSIONS_BUILD_ANDROID_EMULATOR_PID_FILE}" + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
+ set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_BUILD_ANDROID_EMULATOR_PID_FILE]${ORT_EXTENSIONS_BUILD_ANDROID_EMULATOR_PID_FILE}" displayName: "Create and start Android emulator" diff --git a/.pipelines/templates/run-with-ios-simulator-steps.yml b/.pipelines/templates/run-with-ios-simulator-steps.yml index 7cade706d..905b0d162 100644 --- a/.pipelines/templates/run-with-ios-simulator-steps.yml +++ b/.pipelines/templates/run-with-ios-simulator-steps.yml @@ -8,6 +8,8 @@ steps: ORT_EXTENSIONS_BUILD_SIMULATOR_ID=$(xcrun simctl create iPhoneSimulatorForPipeline com.apple.CoreSimulator.SimDeviceType.iPhone-8) + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=ORT_EXTENSIONS_BUILD_SIMULATOR_ID]${ORT_EXTENSIONS_BUILD_SIMULATOR_ID}" displayName: "Create iPhone simulator" diff --git a/.pipelines/templates/set-package-version-variable-step.yml b/.pipelines/templates/set-package-version-variable-step.yml index d1d65f7fa..11c10e40a 100644 --- a/.pipelines/templates/set-package-version-variable-step.yml +++ b/.pipelines/templates/set-package-version-variable-step.yml @@ -25,5 +25,7 @@ steps: VERSION="${BASE_VERSION}-dev+$(Build.BuildId).${SHORT_COMMIT_HASH}" fi + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x echo "##vso[task.setvariable variable=${{ parameters.PackageVersionVariableName }}]${VERSION}" displayName: "Set \"${{ parameters.PackageVersionVariableName }}\" variable to package version" From b603c0283aa82e3e6afc10d42e36b4a0f73640d3 Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Sun, 21 May 2023 21:40:11 -0700 Subject: [PATCH 11/17] fixing the universal2 python package for macOS (#448) --- pyproject.toml | 2 +- setup.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b0d581da3..cdc0c1bbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] # Minimum requirements for the build system to execute. -requires = ["setuptools", "wheel", "numpy>=1.18.5", "cmake"] # PEP 508 specifications. +requires = ["setuptools", "wheel", "numpy>=1.18.5", "ninja", "cmake"] # PEP 508 specifications. [tool.black] line-length = 120 diff --git a/setup.py b/setup.py index 47e3c8ccf..d0a4716c6 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ from setuptools.command.build import build as _build from setuptools.command.build_ext import build_ext as _build_ext +import re import os import sys import setuptools @@ -82,6 +83,7 @@ def build_cmake(self, extension): '-DOCOS_EXTENTION_NAME=' + ext_fullpath.name, '-DCMAKE_BUILD_TYPE=' + config ] + if os.environ.get('OCOS_NO_OPENCV') == '1': # Disabling openCV can drastically reduce the build time. cmake_args += [ @@ -90,6 +92,38 @@ def build_cmake(self, extension): '-DOCOS_ENABLE_CV2=OFF', '-DOCOS_ENABLE_VISION=OFF'] + # CMake lets you override the generator - we need to check this. + # Can be set with Conda-Build, for example. + cmake_generator = os.environ.get("CMAKE_GENERATOR", "") + # Adding CMake arguments set as environment variable + # (needed e.g. to build for ARM OSx on conda-forge) + if "CMAKE_ARGS" in os.environ: + cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + + if sys.platform != "win32": + # Using Ninja-build since it a) is available as a wheel and b) + # multithreads automatically. 
MSVC would require all variables be + # exported for Ninja to pick it up, which is a little tricky to do. + # Users can override the generator with CMAKE_GENERATOR in CMake + # 3.15+. + if not cmake_generator or cmake_generator == "Ninja": + try: + import ninja # noqa: F401 + + ninja_executable_path = os.path.join(ninja.BIN_DIR, "ninja") + cmake_args += [ + "-GNinja", + f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", + ] + except ImportError: + pass + + if sys.platform.startswith("darwin"): + # Cross-compile support for macOS - respect ARCHFLAGS if set + archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", "")) + if archs: + cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] + # overwrite the Python module info if the auto-detection doesn't work. # export Python3_INCLUDE_DIRS=/opt/python/cp38-cp38 # export Python3_LIBRARIES=/opt/python/cp38-cp38 From 77cf3e6d2cb40c3ef7c8e083a97a0a9cdf19cfa8 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Sun, 21 May 2023 23:34:09 -0700 Subject: [PATCH 12/17] Remove onnx<1.14 from requirements.txt (#447) * remove onnx<1.14 from requirements.txt * downgrade protobuf * move protobuf req to requirements-dev.txt --------- Co-authored-by: Sayan Shaw Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com> --- requirements-dev.txt | 1 + requirements.txt | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5ac0ca151..2f6d3b102 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,4 @@ pytest onnxruntime >=1.10.0 transformers >= 4.9.2,<=4.24.0 tensorflow_text >=2.5.0 +protobuf==3.20.* diff --git a/requirements.txt b/requirements.txt index e6e7e984f..ccbcb38bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ -# 1.14 hasn't be supported yet. -onnx>=1.9.0,<1.14 +onnx>=1.9.0 From 32e76e17b33469edf4ec66d4b8b005151bc0316c Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Mon, 22 May 2023 23:30:41 -0700 Subject: [PATCH 13/17] Upgrade CMake for Linux NuGet packaging pipeline (#454) * update nuget linux packaging pool to fix cmake version issue on nuget packaging pipeline * switch nuget linux pool to ubuntu-latest * upgrade cmake * more fixes * install cmake binary * try to use pip installed cmake * more fixes * add source bash profile reset * typo * try ~/.local/bin again * add comment --------- Co-authored-by: Sayan Shaw --- .pipelines/templates/build-package-for-linux.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.pipelines/templates/build-package-for-linux.yml b/.pipelines/templates/build-package-for-linux.yml index ed8e1607a..a0d91665a 100644 --- a/.pipelines/templates/build-package-for-linux.yml +++ b/.pipelines/templates/build-package-for-linux.yml @@ -36,6 +36,7 @@ jobs: # Currently we can only run tests on x64 as the arm64 tests have a failure # https://github.com/microsoft/onnxruntime-extensions/issues/417 + # NOTE: on arm64 machine, CMake version needs to be updated since we now require CMake 3.25 or newer. 
- ${{ if eq(parameters.OrtExtensionsArch, 'x64') }}: - bash: | export CFLAGS="${{parameters.OrtExtensionsCFlags}}" @@ -45,6 +46,10 @@ jobs: displayName: 'build onnxruntime-extensions and run tests' - ${{ else }}: - bash: | + sudo apt remove cmake + pip install cmake --upgrade + export PATH=~/.local/bin:$PATH + cmake --version export CFLAGS="${{parameters.OrtExtensionsCFlags}}" export CXXFLAGS="${{parameters.OrtExtensionsCXXFlags}}" ./build_lib.sh --build_dir $(Build.BinariesDirectory)/out/ --config RelWithDebInfo --parallel ${{parameters.AdditionalBuildFlags}} From 30aa8f1315bdc1d35cbe4b917ae4ca87fd4ec5b0 Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Thu, 25 May 2023 13:35:53 -0700 Subject: [PATCH 14/17] Add ADO parameter for nuget version suffix (#455) * update nuget version to rc * add ADO parameter for nuget version suffix * remove is_for_nuget_publish --------- Co-authored-by: Sayan Shaw --- .pipelines/nuget.yml | 10 +++++----- .pipelines/templates/build-package-for-nuget.yml | 9 ++++----- tools/ci_build/update_nuspec_for_native_nuget.py | 9 +++------ 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/.pipelines/nuget.yml b/.pipelines/nuget.yml index e0b8edd98..0a588ccfa 100644 --- a/.pipelines/nuget.yml +++ b/.pipelines/nuget.yml @@ -14,10 +14,10 @@ parameters: type: boolean default: false -- name: IsForNugetPublish - displayName: Is this for publishing to nuget.org? If so, set to true, and update version info. - type: boolean - default: false +- name: NugetVersionSuffix + displayName: Update nuget version suffix (e.g. alpha/beta/rc, only if publishing to nuget.org, otherwise leave as "none"). + type: string + default: none trigger: branches: @@ -35,5 +35,5 @@ stages: DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - IsForNugetPublish: ${{ parameters.IsForNugetPublish }} + NugetVersionSuffix: ${{ parameters.NugetVersionSuffix }} OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Extensions' diff --git a/.pipelines/templates/build-package-for-nuget.yml b/.pipelines/templates/build-package-for-nuget.yml index dfd792484..a7981a8c0 100644 --- a/.pipelines/templates/build-package-for-nuget.yml +++ b/.pipelines/templates/build-package-for-nuget.yml @@ -14,10 +14,9 @@ parameters: type: boolean default: false -- name: IsForNugetPublish - displayName: Is for publish to nuget.org? 
- type: boolean - default: false +- name: NugetVersionSuffix + displayName: Nuget version suffix + type: string - name: OrtNugetPackageId displayName: Package name for nuget @@ -225,7 +224,7 @@ stages: --package_version $OrtExtVersion ` --commit_id $(Build.SourceVersion) ` --is_release_build ${{ parameters.IsReleaseBuild }} ` - --is_for_nuget_publish ${{ parameters.IsForNugetPublish }} + --nuget_version_suffix ${{ parameters.NugetVersionSuffix }} cat $(Build.SourcesDirectory)\nuget\NativeNuget.nuspec workingDirectory: '$(Build.SourcesDirectory)' diff --git a/tools/ci_build/update_nuspec_for_native_nuget.py b/tools/ci_build/update_nuspec_for_native_nuget.py index 4a7c65830..4d2149080 100644 --- a/tools/ci_build/update_nuspec_for_native_nuget.py +++ b/tools/ci_build/update_nuspec_for_native_nuget.py @@ -21,10 +21,8 @@ def update_nuspec(args): if package_item.tag == "version" and args.package_version: if args.is_release_build: package_item.text = args.package_version - elif args.is_for_nuget_publish: - # Update version_suffix below if publishing to NuGet - version_suffix = "beta" # alpha/beta/rc - package_item.text = f"{args.package_version}-{version_suffix}" + elif args.nuget_version_suffix != "none": + package_item.text = f"{args.package_version}-{args.nuget_version_suffix}" else: import datetime now = datetime.datetime.now().strftime('%Y%m%d-%H%M') @@ -55,12 +53,11 @@ def parse_arguments(): help="Path to nuspec file to update.") parser.add_argument("--commit_id", required=True, help="The last commit id included in this package.") parser.add_argument("--is_release_build", default="False", type=str, help="If it's a release build.") - parser.add_argument("--is_for_nuget_publish", default="False", type=str, help="If it's for publishing to nuget.org.") + parser.add_argument("--nuget_version_suffix", type=str, help="Nuget version suffix (needed if publishing to nuget.org and not release build)") args = parser.parse_args() args.nuspec_path = args.nuspec_path.resolve(strict=True) args.is_release_build = args.is_release_build.lower() == "true" - args.is_for_nuget_publish = args.is_for_nuget_publish.lower() == "true" print("used args:", args) return args From 70411fdd9652bbc2d19df7c7da560228b2353bcb Mon Sep 17 00:00:00 2001 From: Sayan Shaw <52221015+sayanshaw24@users.noreply.github.com> Date: Fri, 26 May 2023 10:30:16 -0700 Subject: [PATCH 15/17] Update release notes for nuget (#456) * Update release notes for nuget * indentation fix --------- Co-authored-by: Sayan Shaw --- nuget/NativeNuget.nuspec | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/nuget/NativeNuget.nuspec b/nuget/NativeNuget.nuspec index be0910241..e79c95b5e 100644 --- a/nuget/NativeNuget.nuspec +++ b/nuget/NativeNuget.nuspec @@ -2,23 +2,19 @@ Microsoft.ML.OnnxRuntime.Extensions - 0.8.0-alpha + 0.8.0 Microsoft Microsoft ONNX Runtime Extensions NuGet Package - General - 1. New custom operators: Whisper, DrawBoundingBoxes, RobertaTokenizer, ClipTokenizer, EncodeImage, DecodeImage - 2. Optional input/output support - 3. ORT custom operator C++ stub generation tool - 4. Operator implementation and documentation improved. - - Mobile - 1. Android package: Maven - 2. iOS package: CocoaPods - 3. PrePostProcessor tool for mobile model - 4. Super-resolution model pre- / post- processing end-to-end examples - + 1. NuGet package for the .NET platform. This package offers comprehensive platform support, including Windows, Linux, MacOS, Android, and iOS. 
Both x64 and arm64 architectures are supported, where applicable. + 2. Support for pre-processing and post-processing of the Whisper model, inclusive of Audio and Tokenizer decoding operators. + 3. Extends support for pre-processing and post-processing of object-detection models, including a new DrawBoundingBoxes operator. Pre/post processing tools can add non-max-suppression to the model to select the best bounding boxes, and scale those to the original image. See the end-to-end example in yolo_e2e.py. + 4. Introduces the Audio Domain, complemented with AudioCodec and optimized STFT Operators, enhancing audio processing capabilities. + 5. Enabled optional input/output support for some operators such as GPT2Tokenizer, ClipTokenizer, and RobertaTokenizer. + 6. Refined the implementation of offset mapping for BBPE-style tokenizers for more operators and efficiency improvement. + 7. Other bug and security fixes. + © Microsoft Corporation. All rights reserved. ONNX ONNXRuntime AI Machine Learning ORT_icon_for_light_bg.png From 93f239c1434521b7a4b78f322dca1aafcda9a1be Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Tue, 30 May 2023 11:01:30 -0700 Subject: [PATCH 16/17] Unit test being compatible with ONNXRuntime-GPU package, and some clean-ups. (#457) --- onnxruntime_extensions/cmake_helper.py | 35 -------------- test/test_cliptok.py | 32 +++++++----- test/test_cmake_helper.py | 29 ----------- test/test_cv2.py | 8 +-- test/test_gpt2tok.py | 11 +++-- test/test_math_ops.py | 4 +- test/test_pyops.py | 12 ++--- test/test_robertatok.py | 12 ++--- test/test_string_concat.py | 2 +- test/test_string_ecma_regex.py | 10 ++-- test/test_string_length.py | 2 +- test/test_string_ops.py | 67 +++++++++++++------------- test/test_tools_customop_template.py | 28 +++++++++-- 13 files changed, 107 insertions(+), 145 deletions(-) delete mode 100644 onnxruntime_extensions/cmake_helper.py delete mode 100644 test/test_cmake_helper.py diff --git a/onnxruntime_extensions/cmake_helper.py b/onnxruntime_extensions/cmake_helper.py deleted file mode 100644 index 7955b683c..000000000 --- a/onnxruntime_extensions/cmake_helper.py +++ /dev/null @@ -1,35 +0,0 @@ -import inspect -from ._ocos import default_opset_domain -from . 
import _cuops - - -ALL_CUSTOM_OPS = {_name: _obj for _name, _obj in inspect.getmembers(_cuops) - if (inspect.isclass(_obj) and issubclass(_obj, _cuops.CustomOp))} - - -OPMAP_TO_CMAKE_FLAGS = {'GPT2Tokenizer': 'OCOS_ENABLE_GPT2_TOKENIZER', - 'BlingFireSentenceBreaker': 'OCOS_ENABLE_BLINGFIRE' - } - - -def gen_cmake_oplist(opconfig_file, oplist_cmake_file = '_selectedoplist.cmake'): - - ext_domain = default_opset_domain() - with open(oplist_cmake_file, 'w') as f: - print("# Auto-Generated File, not edited!!!", file=f) - with open(opconfig_file, 'r') as opfile: - for _ln in opfile: - if _ln.startswith(ext_domain): - items = _ln.strip().split(';') - if len(items) < 3: - raise RuntimeError("The malformated operator config file.") - for _op in items[2].split(','): - if not _op: - continue # is None or "" - if _op not in OPMAP_TO_CMAKE_FLAGS: - raise RuntimeError("Cannot find the custom operator({})\'s build flags, " - + "Please update the OPMAP_TO_CMAKE_FLAGS dictionary.".format(_op)) - print("set({} ON CACHE INTERNAL \"\")".format(OPMAP_TO_CMAKE_FLAGS[_op]), file=f) - print("# End of Building the Operator CMake variables", file=f) - - print('The cmake tool file has been generated successfully.') diff --git a/test/test_cliptok.py b/test/test_cliptok.py index 750f21ff5..5f772a682 100644 --- a/test/test_cliptok.py +++ b/test/test_cliptok.py @@ -12,6 +12,7 @@ PyOrtFunction) from onnxruntime_extensions.cvt import HFTokenizerConverter + def _get_file_content(path): with open(path, "rb") as file: return file.read() @@ -34,7 +35,8 @@ def _create_test_model(**kwargs): if kwargs["attention_mask"]: if kwargs["offset_map"]: node = [helper.make_node( - 'CLIPTokenizer', ['string_input'], ['input_ids', 'attention_mask', 'offset_mapping'], vocab=_get_file_content(vocab_file), + 'CLIPTokenizer', ['string_input'], + ['input_ids', 'attention_mask', 'offset_mapping'], vocab=_get_file_content(vocab_file), merges=_get_file_content(merges_file), name='bpetok', padding_length=max_length, domain='ai.onnx.contrib')] @@ -73,10 +75,11 @@ def setUpClass(cls): cls.tokenizer_cvt = HFTokenizerConverter(cls.slow_tokenizer) def _run_tokenizer(self, test_sentence, padding_length=-1): - model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=padding_length, attention_mask=True, offset_map=True) + model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, + max_length=padding_length, attention_mask=True, offset_map=True) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=["CPUExecutionProvider"]) input_text = np.array(test_sentence) input_ids, attention_mask, offset_mapping = sess.run(None, {'string_input': input_text}) print("\nTest Sentence: " + str(test_sentence)) @@ -111,7 +114,9 @@ def test_tokenizer(self): self._run_tokenizer(["One Microsoft Way, Redmond, WA"]) def test_converter(self): - fn_tokenizer = PyOrtFunction.from_customop("CLIPTokenizer", cvt=(self.tokenizer_cvt).clip_tokenizer) + fn_tokenizer = PyOrtFunction.from_customop("CLIPTokenizer", + cvt=(self.tokenizer_cvt).clip_tokenizer, + cpu_only=True) test_str = "I can feel the magic, can you?" 
fn_out = fn_tokenizer([test_str]) clip_out = self.tokenizer(test_str, return_offsets_mapping=True) @@ -120,16 +125,20 @@ def test_converter(self): expect_offset_mapping = clip_out['offset_mapping'] np.testing.assert_array_equal(fn_out[0].reshape((fn_out[0].size,)), expect_input_ids) np.testing.assert_array_equal(fn_out[1].reshape((fn_out[1].size,)), expect_attention_mask) - np.testing.assert_array_equal(fn_out[2].reshape((fn_out[2].shape[1], fn_out[2].shape[2])), expect_offset_mapping) + np.testing.assert_array_equal(fn_out[2].reshape((fn_out[2].shape[1], fn_out[2].shape[2])), + expect_offset_mapping) def test_optional_outputs(self): - # Test for models without offset mapping and without both attention mask and offset mapping (input id output is always required) - model1 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=-1, attention_mask=True, offset_map=False) - model2 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=-1, attention_mask=False, offset_map=False) + # Test for models without offset mapping and without both attention mask and offset mapping + # (input id output is always required) + model1 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, + max_length=-1, attention_mask=True, offset_map=False) + model2 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, + max_length=-1, attention_mask=False, offset_map=False) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess1 = _ort.InferenceSession(model1.SerializeToString(), so) - sess2 = _ort.InferenceSession(model2.SerializeToString(), so) + sess1 = _ort.InferenceSession(model1.SerializeToString(), so, providers=["CPUExecutionProvider"]) + sess2 = _ort.InferenceSession(model2.SerializeToString(), so, providers=["CPUExecutionProvider"]) input_text = np.array(["Hello World"]) outputs1 = sess1.run(None, {'string_input': input_text}) outputs2 = sess2.run(None, {'string_input': input_text}) @@ -142,10 +151,9 @@ def test_optional_outputs(self): clip_out = self.tokenizer(["Hello World"], return_offsets_mapping=True) expect_input_ids = clip_out['input_ids'] expect_attention_mask = clip_out['attention_mask'] - expect_offset_mapping = clip_out['offset_mapping'] np.testing.assert_array_equal(expect_input_ids, outputs1[0]) np.testing.assert_array_equal(expect_attention_mask, outputs1[1]) - np.testing.assert_array_equal(expect_input_ids, outputs2[0]) + if __name__ == "__main__": unittest.main() diff --git a/test/test_cmake_helper.py b/test/test_cmake_helper.py deleted file mode 100644 index 5a4b11a73..000000000 --- a/test/test_cmake_helper.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import unittest -from pathlib import Path -from onnxruntime_extensions import cmake_helper - - -def _get_test_data_file(*sub_dirs): - test_dir = Path(__file__).parent - return str(test_dir.joinpath(*sub_dirs)) - - -class TestCMakeHelper(unittest.TestCase): - def test_cmake_file_gen(self): - cfgfile = _get_test_data_file('data', 'test.op.config') - cfile = '_selectedoplist.cmake' - cmake_helper.gen_cmake_oplist(cfgfile, cfile) - found = False - with open(cfile, 'r') as f: - for _ln in f: - if _ln.strip() == "set(OCOS_ENABLE_GPT2_TOKENIZER ON CACHE INTERNAL \"\")": - found = True - break - - os.remove(cfile) - self.assertTrue(found) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cv2.py b/test/test_cv2.py index 87e59d03a..b2b89b204 100644 --- a/test/test_cv2.py +++ b/test/test_cv2.py @@ -18,7 +18,7 @@ def 
test_image_reader(self): try: rdr = OrtPyFunction.from_customop("ImageReader") img_nhwc = rdr([img_file]) - except ONNXRuntimeError as e: + except ONNXRuntimeError: pass if img_nhwc is not None: @@ -59,9 +59,9 @@ def test_image_decoder(self): expected = np.asarray(expected, dtype=np.uint8).copy() # Convert the image to BGR format since cv2 is default BGR format. - red = expected[:,:,0].copy() - expected[:,:,0] = expected[:,:,2].copy() - expected[:,:,2] = red + red = expected[:, :, 0].copy() + expected[:, :, 0] = expected[:, :, 2].copy() + expected[:, :, 2] = red self.assertEqual(actual.shape[0], expected.shape[0]) self.assertEqual(actual.shape[1], expected.shape[1]) diff --git a/test/test_gpt2tok.py b/test/test_gpt2tok.py index ad32762c1..6850b5b3c 100644 --- a/test/test_gpt2tok.py +++ b/test/test_gpt2tok.py @@ -90,10 +90,11 @@ def tearDown(self) -> None: return super().tearDown() def _run_tokenizer(self, test_sentence, padding_length=-1): - model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=padding_length, attention_mask=True) + model = _create_test_model(vocab_file=self.tokjson, + merges_file=self.merges, max_length=padding_length, attention_mask=True) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_text = np.array(test_sentence) input_ids, attention_mask = sess.run(None, {'string_input': input_text}) expect_input_ids, expect_attention_mask = self.tokenizer.tokenizer_sentence(test_sentence, padding_length) @@ -118,10 +119,11 @@ def test_optional_outputs(self): enable_py_op(False) # Test for model without attention mask (input id output is always required) - model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=-1, attention_mask=False) + model = _create_test_model(vocab_file=self.tokjson, + merges_file=self.merges, max_length=-1, attention_mask=False) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_text = np.array(["Hello World"]) outputs = sess.run(None, {'string_input': input_text}) @@ -133,7 +135,6 @@ def test_optional_outputs(self): expect_input_ids = gpt2_out[0] np.testing.assert_array_equal(expect_input_ids, outputs[0]) - def test_tokenizer_pyop(self): self._run_tokenizer(["I can feel the magic, can you?"]) self._run_tokenizer(["Hey Cortana"]) diff --git a/test/test_math_ops.py b/test/test_math_ops.py index 55329b996..f5132e07d 100644 --- a/test/test_math_ops.py +++ b/test/test_math_ops.py @@ -52,7 +52,7 @@ def test_segment_sum_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_segment_sum("") self.assertIn('op_type: "SegmentSum"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) data = np.array([[1, 2, 3, 4], [4, 3, 2, 1], [5, 6, 7, 8]], dtype=np.float32) segment_ids = np.array([0, 0, 1], dtype=np.int64) exp = np.array([[5, 5, 5, 5], [5, 6, 7, 8]], dtype=np.float32) @@ -65,7 +65,7 @@ def test_segment_sum_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_segment_sum("Py") self.assertIn('op_type: 
"PySegmentSum"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) data = np.array([[1, 2, 3, 4], [4, 3, 2, 1], [5, 6, 7, 8]], dtype=np.float32) segment_ids = np.array([0, 0, 1], dtype=np.int64) exp = np.array([[5, 5, 5, 5], [5, 6, 7, 8]], dtype=np.float32) diff --git a/test/test_pyops.py b/test/test_pyops.py index 00dd2d73c..16f463ce7 100644 --- a/test/test_pyops.py +++ b/test/test_pyops.py @@ -154,7 +154,7 @@ def test_python_operator(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model() self.assertIn('op_type: "PyReverseMatrix"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array( [1, 2, 3, 4, 5, 6]).astype(np.float32).reshape([3, 2]) txout = sess.run(None, {'input_1': input_1}) @@ -165,7 +165,7 @@ def test_add_epsilon_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_double('Py') self.assertIn('op_type: "PyAddEpsilon"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([[0., 1., 1.5], [7., 8., -5.5]]) txout = sess.run(None, {'input_1': input_1}) diff = txout[0] - input_1 - 1e-3 @@ -176,7 +176,7 @@ def test_python_negpos(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_2outputs('Py') self.assertIn('op_type: "PyNegPos"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) x = np.array([[0., 1., 1.5], [7., 8., -5.5]]).astype(np.float32) neg, pos = sess.run(None, {'x': x}) diff = x - (neg + pos) @@ -187,7 +187,7 @@ def test_cc_negpos(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_2outputs("") self.assertIn('op_type: "NegPos"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) x = np.array([[0., 1., 1.5], [7., 8., -5.5]]).astype(np.float32) neg, pos = sess.run(None, {'x': x}) diff = x - (neg + pos) @@ -210,7 +210,7 @@ def test_cc_operator(self): onnx_content = _create_test_model_test() self.assertIn('op_type: "CustomOpOne"', str(onnx_content)) ser = onnx_content.SerializeToString() - sess0 = _ort.InferenceSession(ser, so) + sess0 = _ort.InferenceSession(ser, so, providers=['CPUExecutionProvider']) res = sess0.run(None, { 'input_1': np.random.rand(3, 5).astype(np.float32), 'input_2': np.random.rand(3, 5).astype(np.float32)}) @@ -221,7 +221,7 @@ def test_python_join(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_join() self.assertIn('op_type: "PyOpJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) arr = np.array([["a", "b"]], dtype=object) txout = sess.run(None, {'input_1': arr}) exp = np.array(["a;b"], dtype=object) diff --git a/test/test_robertatok.py b/test/test_robertatok.py index 49b320aaf..89a0bb206 100644 --- 
a/test/test_robertatok.py +++ b/test/test_robertatok.py @@ -73,10 +73,11 @@ def setUpClass(cls): cls.tokenizer_cvt = HFTokenizerConverter(cls.slow_tokenizer) def _run_tokenizer(self, test_sentence, padding_length=-1): - model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=padding_length, attention_mask=True, offset_map=True) + model = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, + max_length=padding_length, attention_mask=True, offset_map=True) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_text = np.array(test_sentence) input_ids, attention_mask, offset_mapping = sess.run(None, {'string_input': input_text}) print("\nTest Sentence: " + str(test_sentence)) @@ -128,8 +129,8 @@ def test_optional_outputs(self): model2 = _create_test_model(vocab_file=self.tokjson, merges_file=self.merges, max_length=-1, attention_mask=False, offset_map=False) so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess1 = _ort.InferenceSession(model1.SerializeToString(), so) - sess2 = _ort.InferenceSession(model2.SerializeToString(), so) + sess1 = _ort.InferenceSession(model1.SerializeToString(), so, providers=['CPUExecutionProvider']) + sess2 = _ort.InferenceSession(model2.SerializeToString(), so, providers=['CPUExecutionProvider']) input_text = np.array(["Hello World"]) outputs1 = sess1.run(None, {'string_input': input_text}) outputs2 = sess2.run(None, {'string_input': input_text}) @@ -142,10 +143,9 @@ def test_optional_outputs(self): roberta_out = self.tokenizer(["Hello World"], return_offsets_mapping=True) expect_input_ids = roberta_out['input_ids'] expect_attention_mask = roberta_out['attention_mask'] - expect_offset_mapping = roberta_out['offset_mapping'] np.testing.assert_array_equal(expect_input_ids, outputs1[0]) np.testing.assert_array_equal(expect_attention_mask, outputs1[1]) - np.testing.assert_array_equal(expect_input_ids, outputs2[0]) + if __name__ == "__main__": unittest.main() diff --git a/test/test_string_concat.py b/test/test_string_concat.py index 7db62f21f..71d29b387 100644 --- a/test/test_string_concat.py +++ b/test/test_string_concat.py @@ -29,7 +29,7 @@ def _run_string_concat(input1, input2): so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) result = sess.run(None, {'input_1': input1, 'input_2': input2}) # verify diff --git a/test/test_string_ecma_regex.py b/test/test_string_ecma_regex.py index 506e28d85..03621665e 100644 --- a/test/test_string_ecma_regex.py +++ b/test/test_string_ecma_regex.py @@ -83,7 +83,7 @@ def test_string_replace_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace("") self.assertIn('op_type: "StringECMARegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):"]) rewrite = np.array([r"static PyObject* py_$1(void) {"]) text = np.array([["def myfunc():"], ["def dummy():"]]) @@ -99,7 +99,7 @@ def test_string_replace_cc_first(self): 
so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace("", global_replace=False) self.assertIn('op_type: "StringECMARegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):"]) rewrite = np.array([r"static PyObject* py_$1(void) {"]) text = np.array([["def myfunc():def myfunc():"], ["def dummy():def dummy():"]]) @@ -115,7 +115,7 @@ def test_string_replace_cc_x2(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace("") self.assertIn('op_type: "StringECMARegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):"]) rewrite = np.array([r"static PyObject* py_$1(void) {"]) text = np.array([["def myfunc():"], ["def dummy():" * 2]]) @@ -132,7 +132,7 @@ def test_string_replace_uncased(self): onnx_model = _create_test_model_string_replace( "", "ai.onnx.contrib", True, True ) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array( [ @@ -157,7 +157,7 @@ def test_string_regex_split_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_regex_split("") self.assertIn('op_type: "StringECMARegexSplitWithOffsets"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["hello there", "hello there"]) pattern = np.array(["(\\s)"]) diff --git a/test/test_string_length.py b/test/test_string_length.py index 7af5aa12b..26b2b4b89 100644 --- a/test/test_string_length.py +++ b/test/test_string_length.py @@ -26,7 +26,7 @@ def _run_string_length(input): so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) - sess = _ort.InferenceSession(model.SerializeToString(), so) + sess = _ort.InferenceSession(model.SerializeToString(), so, providers=['CPUExecutionProvider']) result = sess.run(None, {'input_1': input}) # verify diff --git a/test/test_string_ops.py b/test/test_string_ops.py index c6de5ddd2..3064a99c5 100644 --- a/test/test_string_ops.py +++ b/test/test_string_ops.py @@ -441,7 +441,7 @@ def test_string_upper_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_upper('') self.assertIn('op_type: "StringUpper"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abc"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), np.array([["ABC"]]).tolist()) @@ -451,7 +451,7 @@ def test_string_lower_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_lower('') self.assertIn('op_type: "StringLower"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = 
np.array([["Abc"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), np.array([["abc"]]).tolist()) @@ -461,7 +461,7 @@ def test_string_upper_cc_accent(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_upper('') self.assertIn('op_type: "StringUpper"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["R"], ["Abcé"], ["ABC"], ["A"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual( @@ -473,7 +473,7 @@ def test_string_lower_cc_accent(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_lower('') self.assertIn('op_type: "StringLower"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["R"], ["Abce"], ["ABC"], ["A"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual( @@ -497,7 +497,7 @@ def test_string_upper_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_upper('Py') self.assertIn('op_type: "PyStringUpper"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abc"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), np.array([["ABC"]]).tolist()) @@ -507,7 +507,7 @@ def test_string_lower_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_lower('Py') self.assertIn('op_type: "PyStringLower"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abc"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), np.array([["abc"]]).tolist()) @@ -517,7 +517,7 @@ def test_string_upper_python_accent(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_upper('Py') self.assertIn('op_type: "PyStringUpper"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abcé"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), @@ -528,7 +528,7 @@ def test_string_lower_python_accent(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_lower('Py') self.assertIn('op_type: "PyStringLower"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input_1 = np.array([["Abcé"]]) txout = sess.run(None, {'input_1': input_1}) self.assertEqual(txout[0].tolist(), @@ -539,7 +539,7 @@ def test_string_join_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('Py') self.assertIn('op_type: "PyStringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = 
_ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.vstack([np.array([["a", "b", "c"]]), np.array([["aa", "bb", ""]])]) self.assertEqual(text.shape, (2, 3)) @@ -560,7 +560,7 @@ def test_string_join_python_3d(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('Py') self.assertIn('op_type: "PyStringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.vstack([np.array([["a", "b", "c"]]), np.array([["aa", "bb", ""]])]).reshape((2, 3, 1)) sep = np.array([";"]) @@ -575,7 +575,7 @@ def test_string_join_python_1d(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('Py') self.assertIn('op_type: "PyStringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array(["a", "b", "cc"]) sep = np.array([";"]) axis = np.array([0], dtype=np.int64) @@ -589,7 +589,7 @@ def test_string_join_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.vstack([np.array([["a", "b", "c"]]), np.array([["aa", "bb", ""]])]) sep = np.array([";"]) @@ -607,7 +607,7 @@ def test_string_join_cc_1d(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array(["a", "b", "cc"]) sep = np.array([";"]) axis = np.array([0], dtype=np.int64) @@ -620,7 +620,7 @@ def test_string_join_empty(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array([""]) sep = np.array([" "]) axis = np.array([0], dtype=np.int64) @@ -633,7 +633,7 @@ def test_string_join_scalar(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array("a scalar string") sep = np.array([" "]) axis = np.array([0], dtype=np.int64) @@ -646,7 +646,7 @@ def test_string_join_cc_3d(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_join('') self.assertIn('op_type: "StringJoin"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array(["a", "b", "c", "d", "e", "f", "g", "h"]).reshape(( 2, 2, 2)) sep = 
np.array([";"]) @@ -671,7 +671,7 @@ def test_string_replace_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace('') self.assertIn('op_type: "StringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject* py_\1(void) {']) text = np.array([['def myfunc():'], ['def dummy():']]) @@ -687,7 +687,7 @@ def test_string_replace_cc_first(self): onnx_model = _create_test_model_string_replace( '', global_replace=False) self.assertIn('op_type: "StringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject* py_\1(void) {']) text = np.array([['def myfunc():def myfunc():'], @@ -703,7 +703,7 @@ def test_string_replace_cc_x2(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace('') self.assertIn('op_type: "StringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject* py_\1(void) {']) text = np.array([['def myfunc():'], ['def dummy():' * 2]]) @@ -718,7 +718,7 @@ def test_string_replace_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace('Py') self.assertIn('op_type: "PyStringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject*\npy_\1(void)\n{']) text = np.array([['def myfunc():'], ['def dummy():']]) @@ -733,7 +733,7 @@ def test_string_replace_python_x2(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_replace('Py') self.assertIn('op_type: "PyStringRegexReplace"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) pattern = np.array([r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):']) rewrite = np.array([r'static PyObject*\npy_\1(void)\n{']) text = np.array([['def myfunc():'], ['def dummy():' * 2]]) @@ -748,7 +748,7 @@ def test_string_to_crc32_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_to_hash('Py', kind='crc32') self.assertIn('op_type: "PyStringToCRC32"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) text = np.array([["abc", "abcdé"], ["$$^l!%*ù", ""]]) num_buckets = np.array([44], dtype=np.uint32) res = self._string_to_crc32(text, num_buckets) @@ -765,7 +765,7 @@ def test_string_to_hash_bucket_cc(self): onnx_model = _create_test_model_string_to_hash( '', kind='hash_bucket') 
self.assertIn('op_type: "StringToHashBucket"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) raw = ["abc", "abcdé", "$$^l!%*ù", "", "a", "A"] text = np.array(raw).reshape((3, 2)) num_buckets = np.array([NUM_BUCKETS], dtype=np.int64) @@ -791,7 +791,7 @@ def test_string_to_hash_bucket_fast_cc(self): onnx_model = _create_test_model_string_to_hash( '', kind='hash_bucket_fast') self.assertIn('op_type: "StringToHashBucketFast"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) raw = ["abc", "abcdé", "$$^l!%*ù", "", "a", "A"] text = np.array(raw).reshape((3, 2)) num_buckets = np.array([NUM_BUCKETS], dtype=np.int64) @@ -817,7 +817,7 @@ def test_string_to_hash_bucket_python(self): onnx_model = _create_test_model_string_to_hash( 'Py', kind='hash_bucket') self.assertIn('op_type: "PyStringToHashBucket"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) raw = ["abc", "abcdé", "$$^l!%*ù", "", "a", "A"] text = np.array(raw).reshape((3, 2)) num_buckets = np.array([NUM_BUCKETS], dtype=np.int64) @@ -850,7 +850,7 @@ def test_string_equal_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_equal('Py') self.assertIn('op_type: "PyStringEqual"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) for x, y in self.enumerate_matrix_couples(): txout = sess.run(None, {'x': x, 'y': y}) @@ -863,7 +863,7 @@ def test_string_equal_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_equal('') self.assertIn('op_type: "StringEqual"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) for x, y in self.enumerate_matrix_couples(): txout = sess.run(None, {'x': x, 'y': y}) @@ -876,7 +876,7 @@ def test_string_split_python(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_split('Py') self.assertIn('op_type: "PyStringSplit"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["a,,b", "", "aa,b,c", "dddddd"]) delimiter = np.array([","]) @@ -908,7 +908,7 @@ def test_string_split_cc(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_split('') self.assertIn('op_type: "StringSplit"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["a,,b", "", "aa,b,c", "dddddd"]) delimiter = np.array([","]) @@ -956,7 +956,7 @@ def test_string_split_cc_sep2(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_split('') self.assertIn('op_type: "StringSplit"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), 
so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["a*b", "a,*b", "aa,b,,c", 'z', "dddddd,", "**"]) delimiter = np.array([",*"]) @@ -1009,7 +1009,7 @@ def test_string_split_cc_sep0(self): so.register_custom_ops_library(_get_library_path()) onnx_model = _create_test_model_string_split('') self.assertIn('op_type: "StringSplit"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["a*b", "a,*b"]) delimiter = np.array([""]) @@ -1051,7 +1051,7 @@ def test_string_regex_split_cc(self): onnx_model = _create_test_model_string_regex_split('') self.assertIn('op_type: "StringRegexSplitWithOffsets"', str(onnx_model)) - sess = _ort.InferenceSession(onnx_model.SerializeToString(), so) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) input = np.array(["hello there", "hello there"]) pattern = np.array(["(\\s)"]) @@ -1114,7 +1114,7 @@ def test_string_wordpiece_tokenizer_cc(self): so.register_custom_ops_library(_get_library_path()) cc_onnx_model = _create_test_model_wordpiece('') self.assertIn('op_type: "WordpieceTokenizer"', str(cc_onnx_model)) - cc_sess = _ort.InferenceSession(cc_onnx_model.SerializeToString(), so) + cc_sess = _ort.InferenceSession(cc_onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) inputs = dict(text=np.array(["unwanted running", "unwantedX running"], dtype=object)) @@ -1149,7 +1149,6 @@ def _CreateTable(vocab, num_oov=1): value_dtype=tf.int64) res = tf.lookup.StaticVocabularyTable(init, num_oov, lookup_key_dtype=tf.string) res.__len__ = lambda self: len(vocab) - vocab_table = _CreateTable(["want", "##want", "##ed", "wa", "un", "runn", "##ing"]) diff --git a/test/test_tools_customop_template.py b/test/test_tools_customop_template.py index a650401e5..30d54cbfe 100644 --- a/test/test_tools_customop_template.py +++ b/test/test_tools_customop_template.py @@ -15,7 +15,8 @@ test_data_dir = os.path.join(ort_ext_root, "test", "data") sys.path.append(tools_dir) -import gen_customop_template +import gen_customop_template # noqa: E402 + # create generic custom op models with some basic math ops for testing purposes def _create_test_model_1(): @@ -34,6 +35,7 @@ def _create_test_model_1(): model = make_onnx_model(graph) return model + def _create_test_model_2(prefix=""): nodes = [ helper.make_node("Identity", ["data"], ["id1"]), @@ -51,8 +53,19 @@ def _create_test_model_2(prefix=""): model = make_onnx_model(graph) return model + class TestCustomOpTemplate(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + # remove generated files + template_output_path = os.path.join(test_data_dir, "generated") + if os.path.exists(template_output_path): + for file in os.listdir(template_output_path): + os.remove(os.path.join(template_output_path, file)) + os.rmdir(template_output_path) + return super().tearDownClass() + # check input and output type count of models extracted by template generator def check_io_count(self, model_name, output_path, expected_input_count, expected_output_count): model_path = os.path.join(test_data_dir, "generated", model_name) @@ -63,14 +76,19 @@ def check_io_count(self, model_name, output_path, expected_input_count, expected def test_template(self): template_output_path = os.path.join(test_data_dir, "generated") os.mkdir(template_output_path) - + 
onnx.save(_create_test_model_1(), os.path.join(template_output_path, "test_model_1.onnx")) test1_template_output_path = os.path.join(template_output_path, "custom_op_template_test1.hpp") - self.check_io_count(model_name = "test_model_1.onnx", output_path = test1_template_output_path, expected_input_count = 1, expected_output_count = 1) - + self.check_io_count(model_name="test_model_1.onnx", + output_path=test1_template_output_path, + expected_input_count=1, expected_output_count=1) + onnx.save(_create_test_model_2(), os.path.join(template_output_path, "test_model_2.onnx")) test2_template_output_path = os.path.join(template_output_path, "custom_op_template_test2.hpp") - self.check_io_count(model_name = "test_model_2.onnx", output_path = test2_template_output_path, expected_input_count = 2, expected_output_count = 1) + self.check_io_count(model_name="test_model_2.onnx", + output_path=test2_template_output_path, + expected_input_count=2, expected_output_count=1) + if __name__ == "__main__": unittest.main() From 30eb7afcfa1202ff181f6ba2df407f2a1be12039 Mon Sep 17 00:00:00 2001 From: "Aidan Ryan (MSFT)" <109703696+aidanryan-msft@users.noreply.github.com> Date: Tue, 30 May 2023 16:52:59 -0400 Subject: [PATCH 17/17] Add string strip text operator (#460) * add string strip text operator --------- Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com> --- operators/text/string_strip.cc | 54 +++++++++++++++++++++++++++++++++ operators/text/string_strip.hpp | 20 ++++++++++++ operators/text/text.cc | 14 ++++----- test/test_string_ops.py | 37 +++++++++++++++++++++- 4 files changed, 117 insertions(+), 8 deletions(-) create mode 100644 operators/text/string_strip.cc create mode 100644 operators/text/string_strip.hpp diff --git a/operators/text/string_strip.cc b/operators/text/string_strip.cc new file mode 100644 index 000000000..a55c54d20 --- /dev/null +++ b/operators/text/string_strip.cc @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "string_strip.hpp" +#include "string_tensor.h" +#include <vector> +#include <string> +#include <algorithm> + +const char* WHITE_SPACE_CHARS = " \t\n\r\f\v"; + +KernelStringStrip::KernelStringStrip(const OrtApi& api, const OrtKernelInfo& info) : BaseKernel(api, info) { +} + +void KernelStringStrip::Compute(OrtKernelContext* context) { + // Setup inputs + const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0); + std::vector<std::string> X; + GetTensorMutableDataString(api_, ort_, context, input_X, X); + + // For each string in input, replace with whitespace-trimmed version.
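+  // find_first_not_of/find_last_not_of below bracket the span of non-whitespace characters; when a string is empty or all whitespace, find_first_not_of returns npos and the string is left as-is.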
+ for (size_t i = 0; i < X.size(); ++i) { + size_t nonWhitespaceBegin = X[i].find_first_not_of(WHITE_SPACE_CHARS); + if (nonWhitespaceBegin != std::string::npos) { + size_t nonWhitespaceEnd = X[i].find_last_not_of(WHITE_SPACE_CHARS); + size_t nonWhitespaceRange = nonWhitespaceEnd - nonWhitespaceBegin + 1; + + X[i] = X[i].substr(nonWhitespaceBegin, nonWhitespaceRange); + } + } + + // Fills the output + OrtTensorDimensions dimensions(ort_, input_X); + OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dimensions.data(), dimensions.size()); + FillTensorDataString(api_, ort_, context, X, output); +} + +const char* CustomOpStringStrip::GetName() const { return "StringStrip"; }; + +size_t CustomOpStringStrip::GetInputTypeCount() const { + return 1; +}; + +ONNXTensorElementDataType CustomOpStringStrip::GetInputType(size_t /*index*/) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING; +}; + +size_t CustomOpStringStrip::GetOutputTypeCount() const { + return 1; +}; + +ONNXTensorElementDataType CustomOpStringStrip::GetOutputType(size_t /*index*/) const { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING; +}; diff --git a/operators/text/string_strip.hpp b/operators/text/string_strip.hpp new file mode 100644 index 000000000..a8c181c9c --- /dev/null +++ b/operators/text/string_strip.hpp @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "ocos.h" +#include "string_utils.h" + +struct KernelStringStrip : BaseKernel { + KernelStringStrip(const OrtApi& api, const OrtKernelInfo& info); + void Compute(OrtKernelContext* context); +}; + +struct CustomOpStringStrip : OrtW::CustomOpBase<CustomOpStringStrip, KernelStringStrip> { + const char* GetName() const; + size_t GetInputTypeCount() const; + ONNXTensorElementDataType GetInputType(size_t index) const; + size_t GetOutputTypeCount() const; + ONNXTensorElementDataType GetOutputType(size_t index) const; +}; diff --git a/operators/text/text.cc b/operators/text/text.cc index d97c1fd67..5c1ceab64 100644 --- a/operators/text/text.cc +++ b/operators/text/text.cc @@ -4,6 +4,7 @@ #include "text/string_join.hpp" #include "text/string_lower.hpp" #include "text/string_split.hpp" +#include "text/string_strip.hpp" #include "text/string_to_vector.hpp" #include "text/string_upper.hpp" #include "text/vector_to_string.hpp" @@ -17,15 +18,14 @@ #if defined(ENABLE_RE2_REGEX) #include "text/re2_strings/string_regex_replace.hpp" #include "text/re2_strings/string_regex_split.hpp" -#endif // ENABLE_RE2_REGEX +#endif // ENABLE_RE2_REGEX - -FxLoadCustomOpFactory LoadCustomOpClasses_Text = - LoadCustomOpClasses; + CustomOpStringECMARegexSplitWithOffsets>; diff --git a/test/test_string_ops.py b/test/test_string_ops.py index 3064a99c5..c49d2a297 100644 --- a/test/test_string_ops.py +++ b/test/test_string_ops.py @@ -173,6 +173,22 @@ def _create_test_model_string_equal(prefix, domain='ai.onnx.contrib'): return model + +def _create_test_model_string_strip(prefix, domain='ai.onnx.contrib'): + nodes = [] + nodes[0:] = [helper.make_node('Identity', ['input_1'], ['identity1'])] + nodes[1:] = [helper.make_node('%sStringStrip' % prefix, + ['identity1'], ['customout'], + domain=domain)] + + input0 = helper.make_tensor_value_info( + 'input_1', onnx_proto.TensorProto.STRING, [None, None]) + output0 = helper.make_tensor_value_info( + 'customout', onnx_proto.TensorProto.STRING, [None, None]) + + graph = helper.make_graph(nodes, 'test0', [input0], [output0]) + model = make_onnx_model(graph) + return model + def _create_test_model_string_split(prefix, 
domain='ai.onnx.contrib'): nodes = [] nodes.append(helper.make_node('Identity', ['input'], ['id1'])) @@ -436,6 +452,26 @@ def test_check_types(self): for t in type_list: self.assertIn(t, def_list) + def test_string_strip_cc(self): + so = _ort.SessionOptions() + so.register_custom_ops_library(_get_library_path()) + onnx_model = _create_test_model_string_strip('') + self.assertIn('op_type: "StringStrip"', str(onnx_model)) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) + input_1 = np.array([[" a b c "]]) + txout = sess.run(None, {'input_1': input_1}) + self.assertEqual(txout[0].tolist(), np.array([["a b c"]]).tolist()) + + def test_string_strip_cc_empty(self): + so = _ort.SessionOptions() + so.register_custom_ops_library(_get_library_path()) + onnx_model = _create_test_model_string_strip('') + self.assertIn('op_type: "StringStrip"', str(onnx_model)) + sess = _ort.InferenceSession(onnx_model.SerializeToString(), so, providers=['CPUExecutionProvider']) + input_1 = np.array([[""]]) + txout = sess.run(None, {'input_1': input_1}) + self.assertEqual(txout[0].tolist(), np.array([[""]]).tolist()) + def test_string_upper_cc(self): so = _ort.SessionOptions() so.register_custom_ops_library(_get_library_path()) @@ -1151,7 +1187,6 @@ def _CreateTable(vocab, num_oov=1): res.__len__ = lambda self: len(vocab) vocab_table = _CreateTable(["want", "##want", "##ed", "wa", "un", "runn", "##ing"]) - text = tf.convert_to_tensor(["unwanted running", "unwantedX running"], dtype=tf.string) try: tf_tokens, tf_rows, tf_begins, tf_ends = (