From adf8753a15357859dbcaded2e60c7bd1778635c3 Mon Sep 17 00:00:00 2001
From: Ming Du
Date: Fri, 26 Apr 2024 16:17:25 -0500
Subject: [PATCH] Allow using ONNX for testing (WIP)

---
 generic_trainer/inference_util.py | 69 +++++++++++++++++++++++++++++++
 generic_trainer/tester.py         | 57 ++++++++++++++++++++++---
 2 files changed, 121 insertions(+), 5 deletions(-)
 create mode 100644 generic_trainer/inference_util.py

diff --git a/generic_trainer/inference_util.py b/generic_trainer/inference_util.py
new file mode 100644
index 0000000..67433f8
--- /dev/null
+++ b/generic_trainer/inference_util.py
@@ -0,0 +1,69 @@
+import logging
+
+try:
+    import pycuda.driver as cuda
+    import tensorrt as trt
+except ImportError:
+    print('Unable to import pycuda and tensorrt. If you do not intend to use the ONNX inferencer, ignore '
+          'this message. ')
+
+
+def engine_build_from_onnx(onnx_mdl):
+    EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
+    builder = trt.Builder(TRT_LOGGER)
+    config = builder.create_builder_config()
+    # config.set_flag(trt.BuilderFlag.FP16)
+    config.set_flag(trt.BuilderFlag.TF32)
+    # config.max_workspace_size = 1 * (1 << 30)  # the maximum size that any layer in the network can use
+
+    network = builder.create_network(EXPLICIT_BATCH)
+    parser = trt.OnnxParser(network, TRT_LOGGER)
+    # Load the Onnx model and parse it in order to populate the TensorRT network.
+    success = parser.parse_from_file(onnx_mdl)
+    for idx in range(parser.num_errors):
+        print(parser.get_error(idx))
+
+    if not success:
+        return None
+
+    return builder.build_engine(network, config)
+
+
+def mem_allocation(engine):
+    """
+    Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host
+    inputs/outputs.
+    """
+    logging.info('Expected input node shape is {}'.format(engine.get_binding_shape(0)))
+    in_sz = trt.volume(engine.get_binding_shape(0)) * engine.max_batch_size
+    logging.info('Input size: {}'.format(in_sz))
+    h_input = cuda.pagelocked_empty(in_sz, dtype='float32')
+
+    out_sz = trt.volume(engine.get_binding_shape(1)) * engine.max_batch_size
+    h_output = cuda.pagelocked_empty(out_sz, dtype='float32')
+
+    # Allocate device memory for inputs and outputs.
+    d_input = cuda.mem_alloc(h_input.nbytes)
+    d_output = cuda.mem_alloc(h_output.nbytes)
+
+    # Create a stream in which to copy inputs/outputs and run inference.
+    stream = cuda.Stream()
+
+    return h_input, h_output, d_input, d_output, stream
+
+
+def inference(context, h_input, h_output, d_input, d_output, stream):
+    # Transfer input data to the GPU.
+    cuda.memcpy_htod_async(d_input, h_input, stream)
+
+    # Run inference.
+    context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
+
+    # Transfer predictions back from the GPU.
+    cuda.memcpy_dtoh_async(h_output, d_output, stream)
+
+    # Synchronize the stream
+    stream.synchronize()
+    # Return the host output
+    return h_output
diff --git a/generic_trainer/tester.py b/generic_trainer/tester.py
index 448ca5a..526e89f 100644
--- a/generic_trainer/tester.py
+++ b/generic_trainer/tester.py
@@ -1,10 +1,13 @@
+import logging
 import os
 
+import numpy as np
 import torch
 from torch.utils.data import Dataset, DataLoader
 
 import generic_trainer.trainer as trainer
 from generic_trainer.configs import *
+from generic_trainer.inference_util import *
 
 
 class Tester(trainer.Trainer):
@@ -18,6 +21,20 @@ def __init__(self, configs: InferenceConfig):
         self.sampler = None
         self.dataloader = None
         self.parallelization_type = self.configs.parallelization_params.parallelization_type
+        self.mode = 'state_dict'
+
+        # Attributes below are used for ONNX
+        self.onnx_mdl = None
+
+        self.trt_hin = None
+        self.trt_din = None
+        self.trt_hout = None
+        self.trt_dout = None
+
+        self.trt_engine = None
+        self.trt_stream = None
+        self.trt_context = None
+        self.context = None
 
     def build(self):
         self.build_ranks()
@@ -28,6 +45,22 @@ def build(self):
         self.build_model()
         self.build_dir()
 
+    def build_model(self):
+        if self.configs.pretrained_model_path.endswith('onnx'):
+            logging.info('An ONNX model is given. This model will be loaded and run with TensorRT.')
+            self.build_onnx_model()
+            self.mode = 'onnx'
+        else:
+            super().build_model()
+
+    def build_onnx_model(self):
+        import pycuda.autoinit
+        self.context = pycuda.autoinit.context
+        self.onnx_mdl = self.configs.pretrained_model_path
+        self.trt_engine = engine_build_from_onnx(self.onnx_mdl)
+        self.trt_hin, self.trt_hout, self.trt_din, self.trt_dout, self.trt_stream = mem_allocation(self.trt_engine)
+        self.trt_context = self.trt_engine.create_execution_context()
+
     def build_scalable_parameters(self):
         self.all_proc_batch_size = self.configs.batch_size_per_process * self.num_processes
 
@@ -59,11 +92,25 @@ def build_dir(self):
         self.barrier()
 
     def run(self):
-        self.model.eval()
+        if self.mode == 'state_dict':
+            self.model.eval()
         for j, data_and_labels in enumerate(self.dataloader):
-            data, _ = self.process_data_loader_yield(data_and_labels)
-            preds = self.model(*data)
-            self.save_predictions(preds)
+            data, labels = self.process_data_loader_yield(data_and_labels)
+            if self.mode == 'state_dict':
+                preds = self.model(*data)
+            else:
+                preds = self.run_onnx_inference(*data)
+            self.update_result_holders(preds, labels)
+
+    def run_onnx_inference(self, data):
+        data = data.cpu().numpy()
+        orig_shape = data.shape
+        np.copyto(self.trt_hin, data.astype(np.float32).ravel())
+        pred = np.array(inference(self.trt_context, self.trt_hin, self.trt_hout,
+                                  self.trt_din, self.trt_dout, self.trt_stream))
+
+        pred = pred.reshape(orig_shape)
+        return pred
 
-    def save_predictions(self, preds):
+    def update_result_holders(self, preds, *args, **kwargs):
         pass
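
The .onnx file consumed by the new code path has to be produced from the trained PyTorch model beforehand. Below is a minimal export sketch using a stand-in model, input shape, and output path; substitute the actual trained model and the input shape the Tester will feed it.

import torch

# Stand-in model and input shape for illustration only; use the trained model
# and its real input shape in practice.
model = torch.nn.Conv2d(1, 1, kernel_size=3, padding=1).eval()
dummy_input = torch.randn(1, 1, 64, 64)

# Export to the path that will later be passed as configs.pretrained_model_path.
torch.onnx.export(
    model, dummy_input, 'model.onnx',
    input_names=['input'], output_names=['output'],
    opset_version=17,
)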
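
For reference, the helpers added in inference_util.py compose end to end as shown below, mirroring build_onnx_model() and run_onnx_inference() in tester.py. This is a minimal sketch that assumes the engine reports static binding shapes (no -1 dimensions) and uses a placeholder model path and a random input.

import numpy as np
import pycuda.autoinit  # noqa: F401 -- importing this creates and activates a CUDA context
from generic_trainer.inference_util import engine_build_from_onnx, mem_allocation, inference

engine = engine_build_from_onnx('model.onnx')  # placeholder path
h_in, h_out, d_in, d_out, stream = mem_allocation(engine)
context = engine.create_execution_context()

# Placeholder input matching the engine's first binding shape.
in_shape = tuple(engine.get_binding_shape(0))
x = np.random.rand(*in_shape).astype(np.float32)

# Copy into the page-locked host buffer, run inference, and reshape the flat
# output to the engine's output binding shape.
np.copyto(h_in, x.ravel())
pred = np.array(inference(context, h_in, h_out, d_in, d_out, stream))
pred = pred.reshape(tuple(engine.get_binding_shape(1)))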