From 64f12e3a066841675269ac831a6dba37cb961e69 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 19 Oct 2023 17:29:48 +0000 Subject: [PATCH 01/30] init --- optimum/exporters/neuron/config.py | 72 +++++++- optimum/exporters/neuron/model_configs.py | 30 ++-- optimum/exporters/neuron/model_wrappers.py | 32 ++++ optimum/neuron/modeling_seq2seq.py | 199 +++++++++++++++++++++ optimum/neuron/utils/__init__.py | 2 + optimum/neuron/utils/constant.py | 2 + 6 files changed, 318 insertions(+), 19 deletions(-) create mode 100644 optimum/exporters/neuron/model_wrappers.py create mode 100644 optimum/neuron/modeling_seq2seq.py diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 0e3d61bc8..1564de82d 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -16,14 +16,17 @@ Common Neuron configuration classes that handle most of the features for building model specific configurations. """ - +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union from ...utils import ( + DummyInputGenerator, DummyBboxInputGenerator, DummyTextInputGenerator, + DummySeq2SeqDecoderTextInputGenerator, + DummySeq2SeqPastKeyValuesGenerator, DummyVisionInputGenerator, logging, ) -from .base import NeuronConfig, NeuronDecoderConfig +from .base import NeuronConfig, NeuronDecoderConfig, NeuronSeq2SeqConfigWithPast logger = logging.get_logger(__name__) @@ -61,3 +64,68 @@ class TextNeuronDecoderConfig(NeuronDecoderConfig): """ pass + + +class TextSeq2SeqNeuronConfig(NeuronConfig): + """ + Handles encoder-decoder-based text architectures. + """ + + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTextInputGenerator, + DummySeq2SeqDecoderTextInputGenerator, + DummySeq2SeqPastKeyValuesGenerator, + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = [] + # encoder + decoder without past + if "encoder" in self.MODEL_TYPE: + common_inputs = ["input_ids", "attention_mask"] + + # decoder with past + if "decoder" in self.MODEL_TYPE: + common_inputs = [ + "decoder_input_ids", + "decoder_attention_mask", + "encoder_hidden_states", + "encoder_attention_mask", + "beam_idx", + "beam_scores", + ] + + return common_inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + # encoder + decoder without past + if "encoder" in self.MODEL_TYPE: + common_outputs = ["past_key_values"] + # decoder with past + if "decoder" in self.MODEL_TYPE: + common_outputs = ["next_tokens", ""] + return common_outputs + + def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: + dummy_text_input_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0]( + self.task, self._normalized_config, **kwargs + ) + dummy_decoder_text_input_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[1]( + self.task, + self._normalized_config, + **kwargs, + ) + dummy_seq2seq_past_key_values_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[2]( + self.task, + self._normalized_config, + encoder_sequence_length=dummy_text_input_generator.sequence_length, + **kwargs, + ) + dummy_inputs_generators = [ + dummy_text_input_generator, + dummy_decoder_text_input_generator, + dummy_seq2seq_past_key_values_generator, + ] + + return dummy_inputs_generators \ No newline at end of file diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index d603d7379..31cbae6f9 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -35,6 +35,9 @@ TextNeuronDecoderConfig, 
VisionNeuronConfig, ) +from .model_wrappers import ( + UnetNeuronWrapper, +) if TYPE_CHECKING: @@ -278,26 +281,10 @@ def generate_dummy_inputs(self, return_tuple: bool = False, **kwargs): else: return dummy_inputs - class ModelWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, sample, timestep, encoder_hidden_states, text_embeds=None, time_ids=None): - out_tuple = self.model( - sample, - timestep.float().expand((sample.shape[0],)), - encoder_hidden_states, - added_cond_kwargs={"text_embeds": text_embeds, "time_ids": time_ids}, - return_dict=False, - ) - - return out_tuple - def check_model_inputs_order(self, model, dummy_inputs): return super().check_model_inputs_order( model=model, - custom_model_wrapper=self.ModelWrapper, + custom_model_wrapper=UnetNeuronWrapper, ) @@ -372,3 +359,12 @@ class GPT2NeuronConfig(TextNeuronDecoderConfig): @register_in_tasks_manager("llama", "text-generation") class LLamaNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "llama.model.LlamaForSampling" + + +@register_in_tasks_manager("t5", "text2text-generation") +class T5EncoderNeuronConfig(TextNeuronDecoderConfig): + ATOL_FOR_VALIDATION = 1e-3 + MANDATORY_AXES = ("batch_size", "sequence_length") + MODEL_TYPE = "t5-encoder" + + diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py new file mode 100644 index 000000000..d7f9d0ade --- /dev/null +++ b/optimum/exporters/neuron/model_wrappers.py @@ -0,0 +1,32 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model wrappers for Neuron export.""" +import torch + +class UnetNeuronWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, sample, timestep, encoder_hidden_states, text_embeds=None, time_ids=None): + out_tuple = self.model( + sample, + timestep.float().expand((sample.shape[0],)), + encoder_hidden_states, + added_cond_kwargs={"text_embeds": text_embeds, "time_ids": time_ids}, + return_dict=False, + ) + + return out_tuple \ No newline at end of file diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py new file mode 100644 index 000000000..555e77f56 --- /dev/null +++ b/optimum/neuron/modeling_seq2seq.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from abc import abstractmethod +from pathlib import Path +from tempfile import TemporaryDirectory +import torch +from transformers import AutoModelForSeq2SeqLM +from .modeling_base import NeuronBaseModel, NeuronConfig +from .generation import NeuronGenerationMixin +from .utils import ( + ENCODER_NAME, + DECODER_NAME, + NEURON_FILE_NAME, + is_neuronx_available, +) + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel + +if is_neuronx_available(): + torch_neuronx + + +class NeuronModelForConditionalGeneration(NeuronBaseModel): + base_model_prefix = "neuron_model" + + def __init__( + self, + encoder: torch.jit._script.ScriptModule, + decoder: torch.jit._script.ScriptModule, + config: "PretrainedConfig", + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + model_file_name: Optional[str] = None, + preprocessors: Optional[List] = None, + neuron_config: Optional["NeuronConfig"] = None, + **kwargs, + ): + pass + + @staticmethod + def load_model( + encoder_path: Union[str, Path], + decoder_path: Optional[Union[str, Path]] = None, + device_ids: Optional[List[int]] = None, + dynamic_batch_size: bool = False, + ): + pass + + def _save_pretrained( + self, + save_directory: Union[str, Path], + encoder_file_name: str = NEURON_FILE_NAME, + decoder_file_name: str = NEURON_FILE_NAME, + ): + """ + Saves the model encoder and decoder as well as their configuration files to a + directory, so that it can be re-loaded using the + [`~optimum.neuron.modeling_seq2seq.NeuronModelForSeq2SeqLM.from_pretrained`] class method. + + Args: + save_directory (`Union[str, Path`]): + The directory where to save the model files. + """ + pass + + @classmethod + def _from_pretrained( + cls, + model_id: Union[str, Path], + config: Dict[str, Any], + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + encoder_file_name: Optional[str] = NEURON_FILE_NAME, + decoder_file_name: Optional[str] = NEURON_FILE_NAME, + subfolder: str = "", + local_files_only: bool = False, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + device_ids: Optional[List[int]] = None, + **kwargs, + ): + pass + + @classmethod + def _from_transformers( + cls, + model_id: str, + config: "PretrainedConfig", + use_auth_token: Optional[Union[bool, str]] = None, + revision: str = "main", + force_download: bool = True, + cache_dir: Optional[str] = None, + subfolder: str = "", + local_files_only: bool = False, + trust_remote_code: bool = False, + task: Optional[str] = None, + auto_cast: Optional[str] = "matmul", + auto_cast_type: Optional[str] = "bf16", + disable_fast_relayout: Optional[bool] = False, + disable_fallback: bool = False, + dynamic_batch_size: bool = False, + device_ids: Optional[List[int]] = None, + ) -> "NeuronModelForConditionalGeneration": + pass + + +class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): + auto_model_class = AutoModelForSeq2SeqLM + main_input_name = "input_ids" + + + +class _NeuronSeq2SeqModelPart: + """ + For Seq2Seq architecture, we usually compile it to multiple neuron models. Each represents a part of the model. 
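+    For T5, for instance, the "encoder" Neuron model actually covers the encoder together with the initialization
+    of the decoder KV cache, while the "decoder" Neuron model covers the decoder with past key values (see
+    `NeuronEncoder` and `NeuronDecoder` below).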
+ """ + + def __init__( + self, + model: torch.jit._script.ScriptModule, + parent_model: NeuronBaseModel, + config: Optional["PretrainedConfig"] = None, + neuron_config: Optional["NeuronConfig"] = None, + model_type: str = "encoder", + device: Optional[int] = None, + ): + self.model = model + self.parent_model = parent_model + self.config = config + self.neuron_config = neuron_config + self.model_type = model_type + self.device = device + + @abstractmethod + def forward(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class NeuronEncoder(_NeuronSeq2SeqModelPart): + """ + Encoder part of the encoder-decoder model for Neuron inference. (Actually it's a monolith of encoder + decoder without past_key_values to workaround the control flow in the decoder). + """ + def __init__( + self, + model: torch.jit._script.ScriptModule, + parent_model: NeuronBaseModel, + config: Optional["PretrainedConfig"] = None, + neuron_config: Optional[Dict[str, str]] = None, + ): + super().__init__(model, parent_model, config, neuron_config, "encoder") + + def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor): + inputs = (input_ids, attention_mask, ) + outputs = self.model(*inputs) + return outputs + +class NeuronDecoder(_NeuronSeq2SeqModelPart): + """ + Decoder part of the encoder-decoder model for Neuron inference. (Actually it's decoder with past_key_values). + """ + def __init__( + self, + model: torch.jit._script.ScriptModule, + parent_model: NeuronBaseModel, + config: Optional["PretrainedConfig"] = None, + neuron_config: Optional[Dict[str, str]] = None, + ): + super().__init__(model, parent_model, config, neuron_config, "decoder") + + def forward( + self, + input_ids: torch.LongTensor, + decoder_attention_mask: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + encoder_attention_mask: torch.FloatTensor, + beam_idx: torch.LongTensor, + beam_scores: torch.FloatTensor, + ): + inputs = (input_ids, decoder_attention_mask, encoder_hidden_states, encoder_attention_mask, beam_idx, beam_scores) + outputs = self.model(*inputs) + return outputs \ No newline at end of file diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 559f501c3..8eee6dbe9 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -21,6 +21,8 @@ DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME, + ENCODER_NAME, + DECODER_NAME, ) from .import_utils import ( is_accelerate_available, diff --git a/optimum/neuron/utils/constant.py b/optimum/neuron/utils/constant.py index 7719ce8a2..edc6eebb8 100644 --- a/optimum/neuron/utils/constant.py +++ b/optimum/neuron/utils/constant.py @@ -15,6 +15,8 @@ """Constants used as default values.""" NEURON_FILE_NAME = "model.neuron" +ENCODER_NAME = "encoder" +DECODER_NAME = "decoder" DIFFUSION_MODEL_TEXT_ENCODER_NAME = "text_encoder" DIFFUSION_MODEL_TEXT_ENCODER_2_NAME = "text_encoder_2" DIFFUSION_MODEL_UNET_NAME = "unet" From aa5a3794f1cacda333449370a733d7cac4a6e56f Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 20 Oct 2023 22:11:01 +0000 Subject: [PATCH 02/30] update wrappers --- optimum/exporters/neuron/base.py | 4 +- optimum/exporters/neuron/convert.py | 4 +- optimum/exporters/neuron/model_configs.py | 28 ++- optimum/exporters/neuron/model_wrappers.py | 215 ++++++++++++++++++++- optimum/exporters/neuron/utils.py | 48 ++++- 5 files changed, 290 insertions(+), 9 deletions(-) diff --git 
a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 57ececa61..8255b1b4e 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -119,6 +119,7 @@ def __init__( audio_sequence_length: Optional[int] = None, point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, + num_beams: Optional[int] = None, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", @@ -147,6 +148,7 @@ def __init__( "audio_sequence_length": audio_sequence_length, "point_batch_size": point_batch_size, "nb_points_per_image": nb_points_per_image, + "num_beams": num_beams, } input_shapes = {} for name, value in axes_values.items(): @@ -290,7 +292,7 @@ def flatten_inputs(cls, inputs: Dict[str, Any]) -> Dict[str, Any]: flatten[name] = value return flatten - def check_model_inputs_order( + def patch_model( self, model: "PreTrainedModel", dummy_inputs: Optional[Dict[str, torch.Tensor]] = None, diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index c19d19530..70322634e 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -424,7 +424,7 @@ def export_neuronx( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs = config.flatten_inputs(dummy_inputs) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.check_model_inputs_order(model, dummy_inputs) + checked_model = config.patch_model(model, dummy_inputs) if auto_cast is not None: logger.info(f"Using Neuron: --auto-cast {auto_cast}") @@ -533,7 +533,7 @@ def export_neuron( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.check_model_inputs_order(model, dummy_inputs) + checked_model = config.patch_model(model, dummy_inputs) compiler_args = convert_neuronx_compiler_args_to_neuron(auto_cast, auto_cast_type, disable_fast_relayout) neuron_model = neuron.trace( diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 31cbae6f9..6c6ad0291 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -37,6 +37,8 @@ ) from .model_wrappers import ( UnetNeuronWrapper, + T5EncoderWrapper, + T5DecoderWrapper, ) @@ -281,8 +283,8 @@ def generate_dummy_inputs(self, return_tuple: bool = False, **kwargs): else: return dummy_inputs - def check_model_inputs_order(self, model, dummy_inputs): - return super().check_model_inputs_order( + def patch_model(self, model, dummy_inputs): + return super().patch_model( model=model, custom_model_wrapper=UnetNeuronWrapper, ) @@ -342,13 +344,13 @@ def inputs(self) -> List[str]: def outputs(self) -> List[str]: return ["sample"] - def check_model_inputs_order( + def patch_model( self, model: "VaeDecoder", dummy_inputs: Dict[str, torch.Tensor], **kwargs, ): - return super().check_model_inputs_order(model=model, dummy_inputs=dummy_inputs, forward_with_tuple=True) + return super().patch_model(model=model, dummy_inputs=dummy_inputs, forward_with_tuple=True) @register_in_tasks_manager("gpt2", "text-generation") @@ -367,4 +369,22 @@ class T5EncoderNeuronConfig(TextNeuronDecoderConfig): MANDATORY_AXES = ("batch_size", "sequence_length") MODEL_TYPE = "t5-encoder" + def patch_model(self, model, num_beams=1): + return super().patch_model( + model=model, + custom_model_wrapper=T5EncoderWrapper, + custom_wrapper_kwargs={"num_beams": num_beams} + ) + 
+@register_in_tasks_manager("t5", "text2text-generation") +class T5DecoderNeuronConfig(TextNeuronDecoderConfig): + ATOL_FOR_VALIDATION = 1e-3 + MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") + MODEL_TYPE = "t5-decoder" + + def patch_model(self, model, dummy_inputs): + return super().patch_model( + model=model, + custom_model_wrapper=T5DecoderWrapper, + ) \ No newline at end of file diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index d7f9d0ade..5eed4891f 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -13,8 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model wrappers for Neuron export.""" +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union import torch +from transformers.models.t5.modeling_t5 import T5Stack, T5LayerCrossAttention + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + + class UnetNeuronWrapper(torch.nn.Module): def __init__(self, model): super().__init__() @@ -29,4 +37,209 @@ def forward(self, sample, timestep, encoder_hidden_states, text_embeds=None, tim return_dict=False, ) - return out_tuple \ No newline at end of file + return out_tuple + + +# Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html +class T5EncoderWrapper(torch.nn.Module): + """Wrapper to trace the encoder and the kv cache initialization in the decoder.""" + def __init__( + self, + model: "PreTrainedModel" , + num_beams: int = 1, # defaults to greedy search + tp_degree=None, + ): + super().__init__() + self.model = model + self.config = model.config + self.num_beams = num_beams + self.device = "xla" + self.tp_degree = tp_degree + + def forward(self, input_ids, attention_mask): + # Infer shapes + batch_size = input_ids.shape[0] + sequence_length = input_ids.shape[1] + + encoder_output = self.model.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=False, + output_hidden_states=False + ) + + last_hidden_state = encoder_output["last_hidden_state"] + encoder_hidden_states = torch.concat([tensor.unsqueeze(0).repeat(self.num_beams, 1, 1) for tensor in last_hidden_state]) + + decoder_blocks = self.model.decoder.block + present_key_value_states_sa = [] + present_key_value_states_ca = [] + + for block in decoder_blocks: + + # Cross attention has to be initialized with the encoder hidden state + cross_attention: T5LayerCrossAttention = block.layer[1] + attention = cross_attention.EncDecAttention + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.config.num_heads, attention.key_value_proj_dim).transpose(1, 2) + + key_states = shape(attention.k(encoder_hidden_states)) + value_states = shape(attention.v(encoder_hidden_states)) + + # cross_attn_kv_state + present_key_value_states_ca.append(key_states) + present_key_value_states_ca.append(value_states) + + # Self attention kv states are initialized to zeros. This is done to keep the size of the kv cache tensor constant. + # The kv cache is padded here to keep a fixed shape. 
+ # [key states] + present_key_value_states_sa.append(torch.zeros(( + batch_size, + self.config.num_heads, + sequence_length-1, + self.config.d_kv), dtype=torch.float32, device=self.device)) + # [value states] + present_key_value_states_sa.append(torch.zeros(( + batch_size, + self.config.num_heads, + sequence_length-1, + self.config.d_kv), dtype=torch.float32, device=self.device)) + + return present_key_value_states_sa + present_key_value_states_ca + + +# Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html +class T5DecoderWrapper(torch.nn.Module): + """Wrapper to trace the decoder with past with a language head.""" + def __init__(self, + model: "PreTrainedModel" , + num_beams: int, + sequence_length: int, + tp_degree=None): + super().__init__() + self.model = model + self.config = model.config + self.num_beams = num_beams + self.device = "xla" + self.tp_degree = tp_degree + + # Initialize KV cache (num_beams, n_heads, seq_length, dim_per_head) + self.past_key_values_sa = torch.nn.ParameterList( + [torch.nn.Parameter( + torch.ones( + (num_beams, self.config.num_heads, sequence_length - 1, self.config.d_kv), + dtype=torch.float32 + ), + requires_grad=False + ) for _ in range(self.config.num_decoder_layers * 2)] + ) + self.past_key_values_ca = torch.nn.ParameterList( + [torch.nn.Parameter( + torch.ones( + (num_beams, self.config.num_heads, sequence_length, self.config.d_kv), + dtype=torch.float32 + ), + requires_grad=False + ) for _ in range(self.config.num_decoder_layers * 2)] + ) + + def update_past(self, past_key_values): + new_past_sa = [] + new_past_ca = [] + for past_layer in past_key_values: + new_past_layer = list(past_layer) + for i in range(len(new_past_layer[:2])): + new_past_layer[i] = past_layer[i][:, :, 1:] + new_past_sa += [new_past_layer[:2],] + new_past_ca += [new_past_layer[2:],] + return new_past_sa, new_past_ca + + def reorder_cache(self, past_key_values, beam_idx): + for i in range(len(past_key_values)): + gather_index = beam_idx.view([beam_idx.shape[0],1,1,1]).expand_as(past_key_values[i]) + past_key_values[i] = torch.gather(past_key_values[i], dim = 0, index=gather_index) + return past_key_values + + def forward( + self, + input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + beam_idx, + beam_scores, + **kwargs + ): + # Infer shapes + batch_size = input_ids.shape[0] or 1 + + if self.num_beams > 1: + # We reorder the cache based on the beams selected in each iteration. Required step for beam search. + past_key_values_sa = self.reorder_cache(self.past_key_values_sa, beam_idx) + past_key_values_ca = self.reorder_cache(self.past_key_values_ca, beam_idx) + else: + # We do not need to reorder for greedy sampling + past_key_values_sa = self.past_key_values_sa + past_key_values_ca = self.past_key_values_ca + + # The cache is stored in a flatten form. We order the cache per layer before passing it to the decoder. + # Each layer has 4 tensors, so we group by 4. 
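+        # For example, with 2 decoder layers the flat caches
+        #     past_key_values_sa = [sa_key_0, sa_value_0, sa_key_1, sa_value_1]
+        #     past_key_values_ca = [ca_key_0, ca_value_0, ca_key_1, ca_value_1]
+        # are regrouped below into
+        #     [[sa_key_0, sa_value_0, ca_key_0, ca_value_0], [sa_key_1, sa_value_1, ca_key_1, ca_value_1]]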
+ past_key_values = [[*past_key_values_sa[i*2:i*2+2], *past_key_values_ca[i*2:i*2+2]] for i in range(0, int(len(past_key_values_ca)/2))] + + decoder_output = self.model.decoder( + input_ids=input_ids, + attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + output_attentions=False, + output_hidden_states=False) + + last_hidden_state = decoder_output['last_hidden_state'] + past_key_values = decoder_output['past_key_values'] + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + last_hidden_state = last_hidden_state * (self.model.config.d_model**-0.5) + + lm_logits = self.model.lm_head(last_hidden_state) + + past_key_values_sa, past_key_values_ca = self.update_past(past_key_values) + + # We flatten the cache to a single array. This is required for the input output aliasing to work + past_key_values_sa = [vec for kv_per_layer in past_key_values_sa for vec in kv_per_layer] + past_key_values_ca = [vec for kv_per_layer in past_key_values_ca for vec in kv_per_layer] + + # We calculate topk inside the wrapper + next_token_logits = lm_logits[:, -1, :] + + if self.num_beams > 1: + # This section of beam search is run outside the decoder in the huggingface t5 implementation. + # To maximize the computation within the neuron device, we move this within the wrapper + logit_max, _ = torch.max(next_token_logits, dim=-1, keepdim=True) + logsumexp = torch.log(torch.exp(next_token_logits - logit_max).sum(dim=-1, keepdim=True)) + next_token_scores = next_token_logits - logit_max - logsumexp + next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, self.num_beams * vocab_size) + next_token_scores = next_token_scores * 1 + + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search) + next_token_scores, next_tokens = torch.topk( + next_token_scores, 2 * self.num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + next_tokens = next_tokens % vocab_size + + return [next_token_scores, next_tokens, next_indices] + past_key_values_sa + past_key_values_ca + else: + # Greedy + next_tokens = torch.argmax(next_token_logits, dim=-1) + return [next_tokens] + past_key_values_sa + past_key_values_ca \ No newline at end of file diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 6b69429cf..ead9ab71f 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -156,7 +156,7 @@ def get_stable_diffusion_models_for_export( Whether the Neuron compiled model supports dynamic batch size. Returns: - `Dict[str, Tuple[Union[`PreTrainedModel`, `ModelMixin`], `NeuronConfig`]: A Dict containing the model and + `Dict[str, Tuple[Union[`PreTrainedModel`, `ModelMixin`], `NeuronConfig`]`: A Dict containing the model and Neuron configs for the different components of the model. 
""" models_for_export = _get_submodels_for_export_stable_diffusion(pipeline=pipeline, task=task) @@ -320,3 +320,49 @@ def override_diffusers_2_0_attn_processors(model): elif isinstance(submodule.processor, AttnAddedKVProcessor2_0): submodule.set_processor(AttnAddedKVProcessor()) return model + + +def get_encoder_decoder_models_for_export( + model: "PreTrainedModel", + encoder_input_shapes: Dict[str, int], + decoder_input_shapes: Dict[str, int], + dynamic_batch_size: Optional[bool] = False, +) -> Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]: + """ + Returns the components of an encoder-decoder model and their subsequent neuron configs. + The encoder includes the compute of encoder hidden states and the initialization of KV + cache. The decoder the autoprogressive process of generating tokens, which takes past + key values as inputs to save the compute. + + Args: + model ("PreTrainedModel"): + The model to export. + encoder_input_shapes (`Dict[str, int]`): + Static shapes used for compiling the encoder. + decoder_input_shapes (`Dict[str, int]`): + Static shapes used for compiling the decoder. + dynamic_batch_size (`bool`, defaults to `False`): + Whether the Neuron compiled model supports dynamic batch size. + + Returns: + `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and + Neuron configs for the different components of the model. + """ + # Encoder + encoder = {"encoder": model.encoder, "decoder": model.decoder} + encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="neuron", task="feature-extraction" + ) + encoder_neuron_config = encoder_config_constructor( + text_encoder.config, + task="feature-extraction", + dynamic_batch_size=dynamic_batch_size, + **encoder_input_shapes, + ) + models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = (text_encoder, encoder_neuron_config) + + # Decoder + decoder = {"decoder": model.decoder, "lm_head": model.lm_head} + decoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="neuron", task="feature-extraction" + ) From 658087518d33fcf29d850fb365f4ec398797217b Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 23 Oct 2023 14:14:50 +0000 Subject: [PATCH 03/30] encoder support --- optimum/exporters/neuron/base.py | 4 +- optimum/exporters/neuron/config.py | 29 +++-- optimum/exporters/neuron/convert.py | 4 +- optimum/exporters/neuron/model_configs.py | 70 +++++++++--- optimum/exporters/neuron/model_wrappers.py | 121 +++++++++++---------- optimum/exporters/neuron/utils.py | 31 ++++-- optimum/neuron/modeling_seq2seq.py | 53 +++++---- optimum/neuron/utils/__init__.py | 4 +- 8 files changed, 190 insertions(+), 126 deletions(-) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 8255b1b4e..6414996f8 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -119,7 +119,6 @@ def __init__( audio_sequence_length: Optional[int] = None, point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, - num_beams: Optional[int] = None, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", @@ -148,7 +147,6 @@ def __init__( "audio_sequence_length": audio_sequence_length, "point_batch_size": point_batch_size, "nb_points_per_image": nb_points_per_image, - "num_beams": num_beams, } input_shapes = {} for name, value in axes_values.items(): @@ -292,7 +290,7 @@ def flatten_inputs(cls, inputs: Dict[str, Any]) -> 
Dict[str, Any]: flatten[name] = value return flatten - def patch_model( + def patch_model_for_export( self, model: "PreTrainedModel", dummy_inputs: Optional[Dict[str, torch.Tensor]] = None, diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 1564de82d..e9cbc37fe 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -16,17 +16,18 @@ Common Neuron configuration classes that handle most of the features for building model specific configurations. """ -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, List + from ...utils import ( - DummyInputGenerator, DummyBboxInputGenerator, - DummyTextInputGenerator, + DummyInputGenerator, DummySeq2SeqDecoderTextInputGenerator, DummySeq2SeqPastKeyValuesGenerator, + DummyTextInputGenerator, DummyVisionInputGenerator, logging, ) -from .base import NeuronConfig, NeuronDecoderConfig, NeuronSeq2SeqConfigWithPast +from .base import NeuronConfig, NeuronDecoderConfig logger = logging.get_logger(__name__) @@ -70,7 +71,7 @@ class TextSeq2SeqNeuronConfig(NeuronConfig): """ Handles encoder-decoder-based text architectures. """ - + DUMMY_INPUT_GENERATOR_CLASSES = ( DummyTextInputGenerator, DummySeq2SeqDecoderTextInputGenerator, @@ -87,26 +88,24 @@ def inputs(self) -> Dict[str, Dict[int, str]]: # decoder with past if "decoder" in self.MODEL_TYPE: common_inputs = [ - "decoder_input_ids", - "decoder_attention_mask", - "encoder_hidden_states", + "decoder_input_ids", + "decoder_attention_mask", + "encoder_hidden_states", "encoder_attention_mask", - "beam_idx", - "beam_scores", ] return common_inputs - + @property def outputs(self) -> Dict[str, Dict[int, str]]: # encoder + decoder without past if "encoder" in self.MODEL_TYPE: - common_outputs = ["past_key_values"] + common_outputs = ["present_key_values_self_attn", "past_key_values_cross_attn"] # decoder with past if "decoder" in self.MODEL_TYPE: - common_outputs = ["next_tokens", ""] + common_outputs = ["next_tokens", "past_key_values_self_attn", "past_key_values_cross_attn"] return common_outputs - + def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: dummy_text_input_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0]( self.task, self._normalized_config, **kwargs @@ -128,4 +127,4 @@ def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGen dummy_seq2seq_past_key_values_generator, ] - return dummy_inputs_generators \ No newline at end of file + return dummy_inputs_generators diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 70322634e..bd7a894bc 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -424,7 +424,7 @@ def export_neuronx( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs = config.flatten_inputs(dummy_inputs) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.patch_model(model, dummy_inputs) + checked_model = config.patch_model_for_export(model, dummy_inputs) if auto_cast is not None: logger.info(f"Using Neuron: --auto-cast {auto_cast}") @@ -533,7 +533,7 @@ def export_neuron( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.patch_model(model, dummy_inputs) + checked_model = config.patch_model_for_export(model, dummy_inputs) compiler_args = convert_neuronx_compiler_args_to_neuron(auto_cast, auto_cast_type, 
disable_fast_relayout) neuron_model = neuron.trace( diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 6c6ad0291..8edabd177 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -25,6 +25,7 @@ DummyVisionInputGenerator, NormalizedConfig, NormalizedConfigManager, + NormalizedSeq2SeqConfig, NormalizedTextAndVisionConfig, is_diffusers_available, ) @@ -33,12 +34,13 @@ TextAndVisionNeuronConfig, TextEncoderNeuronConfig, TextNeuronDecoderConfig, + TextSeq2SeqNeuronConfig, VisionNeuronConfig, ) from .model_wrappers import ( - UnetNeuronWrapper, - T5EncoderWrapper, T5DecoderWrapper, + T5EncoderWrapper, + UnetNeuronWrapper, ) @@ -344,13 +346,13 @@ def inputs(self) -> List[str]: def outputs(self) -> List[str]: return ["sample"] - def patch_model( + def patch_model_for_export( self, model: "VaeDecoder", dummy_inputs: Dict[str, torch.Tensor], **kwargs, ): - return super().patch_model(model=model, dummy_inputs=dummy_inputs, forward_with_tuple=True) + return super().patch_model_for_export(model=model, dummy_inputs=dummy_inputs, forward_with_tuple=True) @register_in_tasks_manager("gpt2", "text-generation") @@ -363,28 +365,60 @@ class LLamaNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "llama.model.LlamaForSampling" -@register_in_tasks_manager("t5", "text2text-generation") -class T5EncoderNeuronConfig(TextNeuronDecoderConfig): +@register_in_tasks_manager("t5-encoder", "text2text-generation") +class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 MANDATORY_AXES = ("batch_size", "sequence_length") MODEL_TYPE = "t5-encoder" - - def patch_model(self, model, num_beams=1): - return super().patch_model( + NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( + hidden_size="d_model", + num_attention_heads="num_heads", + encoder_num_layers="num_layers", + decoder_num_layers="num_decoder_layers", + key_value_dim="d_kv", + allow_new=True, + ) + + def generate_dummy_inputs(self, **kwargs): + dummy_inputs = super().generate_dummy_inputs(**kwargs) + + return dummy_inputs + + def patch_model_for_export(self, model, num_beams=1): + return super().patch_model_for_export( model=model, custom_model_wrapper=T5EncoderWrapper, - custom_wrapper_kwargs={"num_beams": num_beams} ) - - -@register_in_tasks_manager("t5", "text2text-generation") -class T5DecoderNeuronConfig(TextNeuronDecoderConfig): + + +@register_in_tasks_manager("t5-decoder", "text2text-generation") +class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 - MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") + MANDATORY_AXES = ("batch_size", "sequence_length") MODEL_TYPE = "t5-decoder" - - def patch_model(self, model, dummy_inputs): + NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( + hidden_size="d_model", + num_attention_heads="num_heads", + encoder_num_layers="num_layers", + decoder_num_layers="num_decoder_layers", + key_value_dim="d_kv", + allow_new=True, + ) + + @property + def inputs(self) -> List[str]: + common_inputs = super().inputs() + ["beam_idx", "beam_scores"] + + return common_inputs + + def patch_model_for_export(self, model, dummy_inputs): return super().patch_model( model=model, custom_model_wrapper=T5DecoderWrapper, - ) \ No newline at end of file + ) + + def generate_io_aliases(self, model, dummy_inputs): + return super().patch_model( + model=model, + custom_model_wrapper=T5DecoderWrapper, + ) diff --git a/optimum/exporters/neuron/model_wrappers.py 
b/optimum/exporters/neuron/model_wrappers.py index 5eed4891f..3c51668e2 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model wrappers for Neuron export.""" -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union -import torch +from typing import TYPE_CHECKING -from transformers.models.t5.modeling_t5 import T5Stack, T5LayerCrossAttention +import torch +from transformers.models.t5.modeling_t5 import T5LayerCrossAttention if TYPE_CHECKING: @@ -43,16 +43,15 @@ def forward(self, sample, timestep, encoder_hidden_states, text_embeds=None, tim # Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html class T5EncoderWrapper(torch.nn.Module): """Wrapper to trace the encoder and the kv cache initialization in the decoder.""" + def __init__( - self, - model: "PreTrainedModel" , - num_beams: int = 1, # defaults to greedy search + self, + model: "PreTrainedModel", tp_degree=None, ): super().__init__() self.model = model self.config = model.config - self.num_beams = num_beams self.device = "xla" self.tp_degree = tp_degree @@ -60,23 +59,18 @@ def forward(self, input_ids, attention_mask): # Infer shapes batch_size = input_ids.shape[0] sequence_length = input_ids.shape[1] - + encoder_output = self.model.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - output_attentions=False, - output_hidden_states=False + input_ids=input_ids, attention_mask=attention_mask, output_attentions=False, output_hidden_states=False ) last_hidden_state = encoder_output["last_hidden_state"] - encoder_hidden_states = torch.concat([tensor.unsqueeze(0).repeat(self.num_beams, 1, 1) for tensor in last_hidden_state]) decoder_blocks = self.model.decoder.block present_key_value_states_sa = [] present_key_value_states_ca = [] for block in decoder_blocks: - # Cross attention has to be initialized with the encoder hidden state cross_attention: T5LayerCrossAttention = block.layer[1] attention = cross_attention.EncDecAttention @@ -85,8 +79,8 @@ def shape(states): """projection""" return states.view(batch_size, -1, self.config.num_heads, attention.key_value_proj_dim).transpose(1, 2) - key_states = shape(attention.k(encoder_hidden_states)) - value_states = shape(attention.v(encoder_hidden_states)) + key_states = shape(attention.k(last_hidden_state)) + value_states = shape(attention.v(last_hidden_state)) # cross_attn_kv_state present_key_value_states_ca.append(key_states) @@ -95,17 +89,21 @@ def shape(states): # Self attention kv states are initialized to zeros. This is done to keep the size of the kv cache tensor constant. # The kv cache is padded here to keep a fixed shape. 
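+            # For example, with sequence_length=128 the self-attention cache below is allocated with 127
+            # positions: at each decoding step the decoder appends one freshly computed position and
+            # `T5DecoderWrapper.update_past` drops the oldest one, so the cache keeps a constant shape.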
# [key states] - present_key_value_states_sa.append(torch.zeros(( - batch_size, - self.config.num_heads, - sequence_length-1, - self.config.d_kv), dtype=torch.float32, device=self.device)) + present_key_value_states_sa.append( + torch.zeros( + (batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), + dtype=torch.float32, + device=self.device, + ) + ) # [value states] - present_key_value_states_sa.append(torch.zeros(( - batch_size, - self.config.num_heads, - sequence_length-1, - self.config.d_kv), dtype=torch.float32, device=self.device)) + present_key_value_states_sa.append( + torch.zeros( + (batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), + dtype=torch.float32, + device=self.device, + ) + ) return present_key_value_states_sa + present_key_value_states_ca @@ -113,36 +111,37 @@ def shape(states): # Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html class T5DecoderWrapper(torch.nn.Module): """Wrapper to trace the decoder with past with a language head.""" - def __init__(self, - model: "PreTrainedModel" , - num_beams: int, - sequence_length: int, - tp_degree=None): + + def __init__(self, model: "PreTrainedModel", num_beams: int, sequence_length: int, tp_degree=None): super().__init__() self.model = model self.config = model.config self.num_beams = num_beams self.device = "xla" self.tp_degree = tp_degree - + # Initialize KV cache (num_beams, n_heads, seq_length, dim_per_head) self.past_key_values_sa = torch.nn.ParameterList( - [torch.nn.Parameter( - torch.ones( - (num_beams, self.config.num_heads, sequence_length - 1, self.config.d_kv), - dtype=torch.float32 - ), - requires_grad=False - ) for _ in range(self.config.num_decoder_layers * 2)] + [ + torch.nn.Parameter( + torch.ones( + (num_beams, self.config.num_heads, sequence_length - 1, self.config.d_kv), dtype=torch.float32 + ), + requires_grad=False, + ) + for _ in range(self.config.num_decoder_layers * 2) + ] ) self.past_key_values_ca = torch.nn.ParameterList( - [torch.nn.Parameter( - torch.ones( - (num_beams, self.config.num_heads, sequence_length, self.config.d_kv), - dtype=torch.float32 - ), - requires_grad=False - ) for _ in range(self.config.num_decoder_layers * 2)] + [ + torch.nn.Parameter( + torch.ones( + (num_beams, self.config.num_heads, sequence_length, self.config.d_kv), dtype=torch.float32 + ), + requires_grad=False, + ) + for _ in range(self.config.num_decoder_layers * 2) + ] ) def update_past(self, past_key_values): @@ -152,14 +151,18 @@ def update_past(self, past_key_values): new_past_layer = list(past_layer) for i in range(len(new_past_layer[:2])): new_past_layer[i] = past_layer[i][:, :, 1:] - new_past_sa += [new_past_layer[:2],] - new_past_ca += [new_past_layer[2:],] + new_past_sa += [ + new_past_layer[:2], + ] + new_past_ca += [ + new_past_layer[2:], + ] return new_past_sa, new_past_ca def reorder_cache(self, past_key_values, beam_idx): for i in range(len(past_key_values)): - gather_index = beam_idx.view([beam_idx.shape[0],1,1,1]).expand_as(past_key_values[i]) - past_key_values[i] = torch.gather(past_key_values[i], dim = 0, index=gather_index) + gather_index = beam_idx.view([beam_idx.shape[0], 1, 1, 1]).expand_as(past_key_values[i]) + past_key_values[i] = torch.gather(past_key_values[i], dim=0, index=gather_index) return past_key_values def forward( @@ -170,11 +173,11 @@ def forward( encoder_attention_mask, beam_idx, beam_scores, - **kwargs + **kwargs, ): # Infer shapes batch_size = 
input_ids.shape[0] or 1 - + if self.num_beams > 1: # We reorder the cache based on the beams selected in each iteration. Required step for beam search. past_key_values_sa = self.reorder_cache(self.past_key_values_sa, beam_idx) @@ -186,7 +189,10 @@ def forward( # The cache is stored in a flatten form. We order the cache per layer before passing it to the decoder. # Each layer has 4 tensors, so we group by 4. - past_key_values = [[*past_key_values_sa[i*2:i*2+2], *past_key_values_ca[i*2:i*2+2]] for i in range(0, int(len(past_key_values_ca)/2))] + past_key_values = [ + [*past_key_values_sa[i * 2 : i * 2 + 2], *past_key_values_ca[i * 2 : i * 2 + 2]] + for i in range(0, int(len(past_key_values_ca) / 2)) + ] decoder_output = self.model.decoder( input_ids=input_ids, @@ -196,10 +202,11 @@ def forward( encoder_attention_mask=encoder_attention_mask, use_cache=True, output_attentions=False, - output_hidden_states=False) + output_hidden_states=False, + ) - last_hidden_state = decoder_output['last_hidden_state'] - past_key_values = decoder_output['past_key_values'] + last_hidden_state = decoder_output["last_hidden_state"] + past_key_values = decoder_output["past_key_values"] if self.config.tie_word_embeddings: # Rescale output before projecting on vocab @@ -242,4 +249,4 @@ def forward( else: # Greedy next_tokens = torch.argmax(next_token_logits, dim=-1) - return [next_tokens] + past_key_values_sa + past_key_values_ca \ No newline at end of file + return [next_tokens] + past_key_values_sa + past_key_values_ca diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index ead9ab71f..d4c9a85bf 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -23,11 +23,13 @@ from transformers import PretrainedConfig from ...neuron.utils import ( + DECODER_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, DIFFUSION_MODEL_TEXT_ENCODER_NAME, DIFFUSION_MODEL_UNET_NAME, DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, + ENCODER_NAME, get_attention_scores_sd, get_attention_scores_sdxl, ) @@ -331,7 +333,7 @@ def get_encoder_decoder_models_for_export( """ Returns the components of an encoder-decoder model and their subsequent neuron configs. The encoder includes the compute of encoder hidden states and the initialization of KV - cache. The decoder the autoprogressive process of generating tokens, which takes past + cache. The decoder the autoprogressive process of generating tokens, which takes past key values as inputs to save the compute. Args: @@ -348,21 +350,32 @@ def get_encoder_decoder_models_for_export( `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and Neuron configs for the different components of the model. 
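+        For T5, for instance, the returned dict maps `"encoder"` and `"decoder"` (i.e. `ENCODER_NAME` and
+        `DECODER_NAME`) to the model and its corresponding Neuron config.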
""" + models_for_export = [] + # Encoder - encoder = {"encoder": model.encoder, "decoder": model.decoder} + model_type = getattr(model.config, "model_type") + "-encoder" encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="neuron", task="feature-extraction" + exporter="neuron", model_type=model_type, task="text2text-generation" ) encoder_neuron_config = encoder_config_constructor( - text_encoder.config, - task="feature-extraction", + config=model.config, + task="text2text-generation", dynamic_batch_size=dynamic_batch_size, **encoder_input_shapes, ) - models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = (text_encoder, encoder_neuron_config) - + models_for_export[ENCODER_NAME] = (model, encoder_neuron_config) + # Decoder - decoder = {"decoder": model.decoder, "lm_head": model.lm_head} + model_type = getattr(model.config, "model_type") + "-decoder" decoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="neuron", task="feature-extraction" + exporter="neuron", model_type=model_type, task="text2text-generation" ) + decoder_neuron_config = decoder_config_constructor( + config=model.config, + task="text2text-generation", + dynamic_batch_size=dynamic_batch_size, + **decoder_input_shapes, + ) + models_for_export[DECODER_NAME] = (model, decoder_neuron_config) + + return models_for_export diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 555e77f56..f39505cba 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -13,31 +13,32 @@ # See the License for the specific language governing permissions and # limitations under the License. """NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + import torch from transformers import AutoModelForSeq2SeqLM -from .modeling_base import NeuronBaseModel, NeuronConfig + from .generation import NeuronGenerationMixin +from .modeling_base import NeuronBaseModel, NeuronConfig from .utils import ( - ENCODER_NAME, - DECODER_NAME, NEURON_FILE_NAME, is_neuronx_available, ) + if TYPE_CHECKING: - from transformers import PretrainedConfig, PreTrainedModel + from transformers import PretrainedConfig if is_neuronx_available(): - torch_neuronx + pass class NeuronModelForConditionalGeneration(NeuronBaseModel): base_model_prefix = "neuron_model" - + def __init__( self, encoder: torch.jit._script.ScriptModule, @@ -50,7 +51,7 @@ def __init__( **kwargs, ): pass - + @staticmethod def load_model( encoder_path: Union[str, Path], @@ -59,7 +60,7 @@ def load_model( dynamic_batch_size: bool = False, ): pass - + def _save_pretrained( self, save_directory: Union[str, Path], @@ -76,7 +77,7 @@ def _save_pretrained( The directory where to save the model files. 
""" pass - + @classmethod def _from_pretrained( cls, @@ -95,7 +96,7 @@ def _from_pretrained( **kwargs, ): pass - + @classmethod def _from_transformers( cls, @@ -116,13 +117,12 @@ def _from_transformers( dynamic_batch_size: bool = False, device_ids: Optional[List[int]] = None, ) -> "NeuronModelForConditionalGeneration": - pass + pass class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" - class _NeuronSeq2SeqModelPart: @@ -158,6 +158,7 @@ class NeuronEncoder(_NeuronSeq2SeqModelPart): """ Encoder part of the encoder-decoder model for Neuron inference. (Actually it's a monolith of encoder + decoder without past_key_values to workaround the control flow in the decoder). """ + def __init__( self, model: torch.jit._script.ScriptModule, @@ -166,16 +167,21 @@ def __init__( neuron_config: Optional[Dict[str, str]] = None, ): super().__init__(model, parent_model, config, neuron_config, "encoder") - + def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor): - inputs = (input_ids, attention_mask, ) + inputs = ( + input_ids, + attention_mask, + ) outputs = self.model(*inputs) return outputs + class NeuronDecoder(_NeuronSeq2SeqModelPart): """ Decoder part of the encoder-decoder model for Neuron inference. (Actually it's decoder with past_key_values). """ + def __init__( self, model: torch.jit._script.ScriptModule, @@ -184,16 +190,23 @@ def __init__( neuron_config: Optional[Dict[str, str]] = None, ): super().__init__(model, parent_model, config, neuron_config, "decoder") - + def forward( - self, - input_ids: torch.LongTensor, + self, + input_ids: torch.LongTensor, decoder_attention_mask: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, encoder_attention_mask: torch.FloatTensor, beam_idx: torch.LongTensor, beam_scores: torch.FloatTensor, ): - inputs = (input_ids, decoder_attention_mask, encoder_hidden_states, encoder_attention_mask, beam_idx, beam_scores) + inputs = ( + input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + beam_idx, + beam_scores, + ) outputs = self.model(*inputs) - return outputs \ No newline at end of file + return outputs diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 8eee6dbe9..96af3e158 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -15,14 +15,14 @@ from .argument_utils import convert_neuronx_compiler_args_to_neuron, store_compilation_config from .constant import ( + DECODER_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, DIFFUSION_MODEL_TEXT_ENCODER_NAME, DIFFUSION_MODEL_UNET_NAME, DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, - NEURON_FILE_NAME, ENCODER_NAME, - DECODER_NAME, + NEURON_FILE_NAME, ) from .import_utils import ( is_accelerate_available, From e997f5fcbae3eceeaa9f5f6e96be6528eb4af41b Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 24 Oct 2023 23:12:27 +0000 Subject: [PATCH 04/30] decoder export --- optimum/exporters/neuron/base.py | 2 + optimum/exporters/neuron/config.py | 2 +- optimum/exporters/neuron/convert.py | 15 +++++- optimum/exporters/neuron/model_configs.py | 58 ++++++++++++++++------ optimum/exporters/neuron/model_wrappers.py | 40 ++++++++++----- optimum/neuron/utils/__init__.py | 1 + optimum/neuron/utils/input_generators.py | 45 +++++++++++++++++ 7 files changed, 132 insertions(+), 31 deletions(-) create mode 100644 optimum/neuron/utils/input_generators.py diff --git 
a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 6414996f8..a2e2eb520 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -119,6 +119,7 @@ def __init__( audio_sequence_length: Optional[int] = None, point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, + num_beams: Optional[int] = None, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", @@ -147,6 +148,7 @@ def __init__( "audio_sequence_length": audio_sequence_length, "point_batch_size": point_batch_size, "nb_points_per_image": nb_points_per_image, + "num_beams": num_beams, } input_shapes = {} for name, value in axes_values.items(): diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index e9cbc37fe..3f18d67d6 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -91,7 +91,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: "decoder_input_ids", "decoder_attention_mask", "encoder_hidden_states", - "encoder_attention_mask", + "attention_mask", # TODO: replace with `encoder_attention_mask` after optimum 1.14 release ] return common_inputs diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index bd7a894bc..2466cf9c5 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -424,7 +424,13 @@ def export_neuronx( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs = config.flatten_inputs(dummy_inputs) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.patch_model_for_export(model, dummy_inputs) + + aliases = {} + if model.config.is_encoder_decoder: + checked_model = config.patch_model_for_export(model, **input_shapes) + aliases = config.generate_io_aliases(checked_model) + else: + checked_model = config.patch_model_for_export(model, dummy_inputs) if auto_cast is not None: logger.info(f"Using Neuron: --auto-cast {auto_cast}") @@ -440,7 +446,12 @@ def export_neuronx( # diffusers specific compiler_args = add_stable_diffusion_compiler_args(config, compiler_args) - neuron_model = neuronx.trace(checked_model, dummy_inputs_tuple, compiler_args=compiler_args) + neuron_model = neuronx.trace( + checked_model, + dummy_inputs_tuple, + compiler_args=compiler_args, + input_output_aliases=aliases, + ) if config.dynamic_batch_size is True: neuron_model = neuronx.dynamic_batch(neuron_model) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 8edabd177..92d566323 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -19,7 +19,9 @@ import torch +from ...neuron.utils import DummyBeamValuesGenerator from ...utils import ( + DummyInputGenerator, DummySeq2SeqDecoderTextInputGenerator, DummyTimestepInputGenerator, DummyVisionInputGenerator, @@ -368,7 +370,7 @@ class LLamaNeuronConfig(TextNeuronDecoderConfig): @register_in_tasks_manager("t5-encoder", "text2text-generation") class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 - MANDATORY_AXES = ("batch_size", "sequence_length") + MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-encoder" NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="d_model", @@ -379,22 +381,20 @@ class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): allow_new=True, ) - def generate_dummy_inputs(self, **kwargs): - dummy_inputs = 
super().generate_dummy_inputs(**kwargs) - - return dummy_inputs - - def patch_model_for_export(self, model, num_beams=1): + def patch_model_for_export(self, model, **kwargs): + num_beams = kwargs.pop("num_beams", 1) return super().patch_model_for_export( model=model, custom_model_wrapper=T5EncoderWrapper, + custom_wrapper_kwargs={"num_beams": num_beams}, ) @register_in_tasks_manager("t5-decoder", "text2text-generation") class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 - MANDATORY_AXES = ("batch_size", "sequence_length") + DUMMY_INPUT_GENERATOR_CLASSES = TextSeq2SeqNeuronConfig.DUMMY_INPUT_GENERATOR_CLASSES + (DummyBeamValuesGenerator,) + MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-decoder" NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="d_model", @@ -407,18 +407,46 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): @property def inputs(self) -> List[str]: - common_inputs = super().inputs() + ["beam_idx", "beam_scores"] + common_inputs = super().inputs + ["beam_idx", "beam_scores"] return common_inputs - def patch_model_for_export(self, model, dummy_inputs): - return super().patch_model( - model=model, - custom_model_wrapper=T5DecoderWrapper, + def generate_dummy_inputs(self, **kwargs): + batch_size = kwargs.pop("batch_size") * kwargs.get("num_beams") + dummy_inputs = super().generate_dummy_inputs(batch_size=batch_size, **kwargs) + dummy_inputs["decoder_input_ids"] = dummy_inputs["decoder_input_ids"][:, :1] # sequence_length = 1 + dummy_inputs["encoder_hidden_states"] = dummy_inputs["encoder_hidden_states"][0] + + return dummy_inputs + + def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: + dummy_inputs_generators = super()._create_dummy_input_generator_classes(**kwargs) + dummy_beam_values_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[-1]( + self.task, + self._normalized_config, + num_beams=kwargs.pop("num_beams", 1), + **kwargs, ) + dummy_inputs_generators.append(dummy_beam_values_generator) + return dummy_inputs_generators - def generate_io_aliases(self, model, dummy_inputs): - return super().patch_model( + def patch_model_for_export(self, model, **kwargs): + return super().patch_model_for_export( model=model, custom_model_wrapper=T5DecoderWrapper, + custom_wrapper_kwargs={ + "batch_size": kwargs.pop("batch_size", 1), + "sequence_length": kwargs.pop("sequence_length", 1), + "num_beams": kwargs.pop("num_beams", 1), + }, ) + + def generate_io_aliases(self, model): + num_outputs_from_trace = 3 if model.num_beams > 1 else 1 + aliases = {} + for i in range(len(model.past_key_values_sa)): + aliases[model.past_key_values_sa[i]] = i + num_outputs_from_trace + for i in range(len(model.past_key_values_ca)): + aliases[model.past_key_values_ca[i]] = len(model.past_key_values_sa) + i + num_outputs_from_trace + + return aliases diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 3c51668e2..8477fdd07 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -47,11 +47,13 @@ class T5EncoderWrapper(torch.nn.Module): def __init__( self, model: "PreTrainedModel", + num_beams: int = 1, tp_degree=None, ): super().__init__() self.model = model self.config = model.config + self.num_beams = num_beams self.device = "xla" self.tp_degree = tp_degree @@ -65,6 +67,9 @@ def forward(self, input_ids, attention_mask): ) last_hidden_state = encoder_output["last_hidden_state"] + 
encoder_hidden_states = torch.concat( + [tensor.unsqueeze(0).repeat(self.num_beams, 1, 1) for tensor in last_hidden_state] + ) decoder_blocks = self.model.decoder.block present_key_value_states_sa = [] @@ -77,10 +82,12 @@ def forward(self, input_ids, attention_mask): def shape(states): """projection""" - return states.view(batch_size, -1, self.config.num_heads, attention.key_value_proj_dim).transpose(1, 2) + return states.view( + self.num_beams * batch_size, -1, self.config.num_heads, attention.key_value_proj_dim + ).transpose(1, 2) - key_states = shape(attention.k(last_hidden_state)) - value_states = shape(attention.v(last_hidden_state)) + key_states = shape(attention.k(encoder_hidden_states)) + value_states = shape(attention.v(encoder_hidden_states)) # cross_attn_kv_state present_key_value_states_ca.append(key_states) @@ -91,7 +98,7 @@ def shape(states): # [key states] present_key_value_states_sa.append( torch.zeros( - (batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), + (self.num_beams * batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), dtype=torch.float32, device=self.device, ) @@ -99,7 +106,7 @@ def shape(states): # [value states] present_key_value_states_sa.append( torch.zeros( - (batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), + (self.num_beams * batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), dtype=torch.float32, device=self.device, ) @@ -110,12 +117,15 @@ def shape(states): # Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html class T5DecoderWrapper(torch.nn.Module): - """Wrapper to trace the decoder with past with a language head.""" + """Wrapper to trace the decoder with past keys values with a language head.""" - def __init__(self, model: "PreTrainedModel", num_beams: int, sequence_length: int, tp_degree=None): + def __init__( + self, model: "PreTrainedModel", batch_size: int, sequence_length: int, num_beams: int = 1, tp_degree=None + ): super().__init__() self.model = model self.config = model.config + self.batch_size = batch_size self.num_beams = num_beams self.device = "xla" self.tp_degree = tp_degree @@ -125,7 +135,13 @@ def __init__(self, model: "PreTrainedModel", num_beams: int, sequence_length: in [ torch.nn.Parameter( torch.ones( - (num_beams, self.config.num_heads, sequence_length - 1, self.config.d_kv), dtype=torch.float32 + ( + self.batch_size * self.num_beams, + self.config.num_heads, + sequence_length - 1, + self.config.d_kv, + ), + dtype=torch.float32, ), requires_grad=False, ) @@ -136,7 +152,8 @@ def __init__(self, model: "PreTrainedModel", num_beams: int, sequence_length: in [ torch.nn.Parameter( torch.ones( - (num_beams, self.config.num_heads, sequence_length, self.config.d_kv), dtype=torch.float32 + (self.batch_size * self.num_beams, self.config.num_heads, sequence_length, self.config.d_kv), + dtype=torch.float32, ), requires_grad=False, ) @@ -175,9 +192,6 @@ def forward( beam_scores, **kwargs, ): - # Infer shapes - batch_size = input_ids.shape[0] or 1 - if self.num_beams > 1: # We reorder the cache based on the beams selected in each iteration. Required step for beam search. 
past_key_values_sa = self.reorder_cache(self.past_key_values_sa, beam_idx) @@ -234,7 +248,7 @@ def forward( # reshape for beam search vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, self.num_beams * vocab_size) + next_token_scores = next_token_scores.view(self.batch_size, self.num_beams * vocab_size) next_token_scores = next_token_scores * 1 # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search) diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 96af3e158..c859ba71b 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -33,6 +33,7 @@ is_torch_xla_available, is_transformers_neuronx_available, ) +from .input_generators import DummyBeamValuesGenerator from .optimization_utils import get_attention_scores_sd, get_attention_scores_sdxl from .patching import DynamicPatch, ModelPatcher, Patcher, patch_everywhere, patch_within_function from .training_utils import ( diff --git a/optimum/neuron/utils/input_generators.py b/optimum/neuron/utils/input_generators.py new file mode 100644 index 000000000..1616123a9 --- /dev/null +++ b/optimum/neuron/utils/input_generators.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Dummy input generation classes.""" +import torch + +from ...utils import DTYPE_MAPPER, DummyInputGenerator, NormalizedTextConfig + + +class DummyBeamValuesGenerator(DummyInputGenerator): + """ + Generates dummy beam search inputs. 
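+
+    For example, with `num_beams=4` the generator below returns `beam_idx = tensor([0, 1, 2, 3])` and
+    `beam_scores = tensor([0., 0., 0., 0.])`; only the shapes and dtypes matter for tracing.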
+ """ + + SUPPORTED_INPUT_NAMES = ( + "beam_idx", + "beam_scores", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + num_beams: int = 1, + **kwargs, + ): + self.task = task + self.num_beams = num_beams + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "beam_idx": + return torch.arange(0, self.num_beams, dtype=DTYPE_MAPPER.pt(int_dtype)) + elif input_name == "beam_scores": + return torch.zeros((self.num_beams,), dtype=DTYPE_MAPPER.pt(float_dtype)) From 7621e39ea652910aded31a28da17ec2618738f15 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 25 Oct 2023 10:53:20 +0000 Subject: [PATCH 05/30] CLI support --- optimum/commands/export/neuronx.py | 5 + optimum/exporters/neuron/__main__.py | 178 +++++++++++++++------- optimum/exporters/neuron/config.py | 4 + optimum/exporters/neuron/convert.py | 3 +- optimum/exporters/neuron/model_configs.py | 8 + optimum/exporters/neuron/utils.py | 24 ++- 6 files changed, 156 insertions(+), 66 deletions(-) diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py index 72673b8a4..616cee693 100644 --- a/optimum/commands/export/neuronx.py +++ b/optimum/commands/export/neuronx.py @@ -102,6 +102,11 @@ def parse_args_neuronx(parser: "ArgumentParser"): type=int, help=f"Sequence length {doc_input}", ) + input_group.add_argument( + "--num_beams", + type=int, + help=f"Number of beams for beam search {doc_input}", + ) input_group.add_argument( "--num_choices", type=int, diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index bac4906da..f0402552d 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -25,11 +25,13 @@ from transformers import AutoConfig from ...neuron.utils import ( + DECODER_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, DIFFUSION_MODEL_TEXT_ENCODER_NAME, DIFFUSION_MODEL_UNET_NAME, DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, + ENCODER_NAME, NEURON_FILE_NAME, is_neuron_available, is_neuronx_available, @@ -43,6 +45,7 @@ from .model_configs import * # noqa: F403 from .utils import ( build_stable_diffusion_components_mandatory_shapes, + get_encoder_decoder_models_for_export, get_stable_diffusion_models_for_export, ) @@ -63,8 +66,10 @@ if TYPE_CHECKING: + from transformers import PreTrainedModel + if is_diffusers_available(): - from diffusers import StableDiffusionPipeline + from diffusers import DiffusionPipeline, StableDiffusionPipeline logger = logging.get_logger() @@ -102,7 +107,11 @@ def infer_task(task: str, model_name_or_path: str) -> str: def normalize_input_shapes(task: str, args: argparse.Namespace) -> Dict[str, int]: config = AutoConfig.from_pretrained(args.model) + model_type = config.model_type.replace("_", "-") + if config.is_encoder_decoder: + model_type = model_type + "-encoder" + neuron_config_constructor = TasksManager.get_exporter_config_constructor( model_type=model_type, exporter="neuron", task=task ) @@ -172,6 +181,113 @@ def infer_stable_diffusion_shapes_from_diffusers( return input_shapes +def _get_submodels_and_neuron_configs( + model: Union["PreTrainedModel", "DiffusionPipeline"], + input_shapes: Dict[str, int], + task: str, + output: Path, + dynamic_batch_size: bool = False, + model_name_or_path: Optional[Union[str, Path]] = None, +): + is_stable_diffusion = "stable-diffusion" in task + is_encoder_decoder = model.config.is_encoder_decoder + + if is_stable_diffusion: + return 
_get_submodels_and_neuron_configs_for_stable_diffusion( + model, input_shapes, task, output, dynamic_batch_size + ) + elif is_encoder_decoder: + return _get_submodels_and_neuron_configs_for_encoder_decoder( + model, input_shapes, task, output, dynamic_batch_size, model_name_or_path + ) + else: + neuron_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="neuron", task=task + ) + neuron_config = neuron_config_constructor(model.config, dynamic_batch_size=dynamic_batch_size, **input_shapes) + model_name = model.name_or_path.split("/")[-1] + output_model_names = {model_name: "model.neuron"} + models_and_neuron_configs = {model_name: (model, neuron_config)} + maybe_save_preprocessors(model_name_or_path, output) + return models_and_neuron_configs, output_model_names + + +def _get_submodels_and_neuron_configs_for_stable_diffusion( + model: Union["PreTrainedModel", "DiffusionPipeline"], + input_shapes: Dict[str, int], + task: str, + output: Path, + dynamic_batch_size: bool = False, +): + check_compiler_compatibility_for_stable_diffusion() + if is_neuron_available(): + raise RuntimeError( + "Stable diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead." + ) + input_shapes = infer_stable_diffusion_shapes_from_diffusers(input_shapes, model) + + # Saving the model config and preprocessor as this is needed sometimes. + model.scheduler.save_pretrained(output.joinpath("scheduler")) + if hasattr(model, "tokenizer") and model.tokenizer is not None: + model.tokenizer.save_pretrained(output.joinpath("tokenizer")) + if hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None: + model.tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + if hasattr(model, "feature_extractor"): + model.feature_extractor.save_pretrained(output.joinpath("feature_extractor")) + model.save_config(output) + + models_and_neuron_configs = get_stable_diffusion_models_for_export( + pipeline=model, + task=task, + dynamic_batch_size=dynamic_batch_size, + **input_shapes, + ) + output_model_names = { + DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME), + DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), + DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), + } + if hasattr(model, "text_encoder") and model.text_encoder is not None: + output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = os.path.join( + DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME + ) + if hasattr(model, "text_encoder_2") and model.text_encoder_2 is not None: + output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = os.path.join( + DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME + ) + del model + + return models_and_neuron_configs, output_model_names + + +def _get_submodels_and_neuron_configs_for_encoder_decoder( + model: Union["PreTrainedModel", "DiffusionPipeline"], + input_shapes: Dict[str, int], + task: str, + output: Path, + dynamic_batch_size: bool = False, + model_name_or_path: Optional[Union[str, Path]] = None, +): + if is_neuron_available(): + raise RuntimeError( + "Encoder-decoder models export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead." 
+ ) + + models_and_neuron_configs = get_encoder_decoder_models_for_export( + model=model, + task=task, + dynamic_batch_size=dynamic_batch_size, + input_shapes=input_shapes, + ) + output_model_names = { + ENCODER_NAME: os.path.join(ENCODER_NAME, NEURON_FILE_NAME), + DECODER_NAME: os.path.join(DECODER_NAME, NEURON_FILE_NAME), + } + maybe_save_preprocessors(model_name_or_path, output) + + return models_and_neuron_configs, output_model_names + + def main_export( model_name_or_path: str, output: Union[str, Path], @@ -194,6 +310,7 @@ def main_export( output.parent.mkdir(parents=True) task = TasksManager.map_from_synonym(task) + is_stable_diffusion = "stable-diffusion" in task model_kwargs = { "task": task, @@ -209,57 +326,14 @@ def main_export( } model = TasksManager.get_model_from_task(**model_kwargs) - is_stable_diffusion = "stable-diffusion" in task - if not is_stable_diffusion: - neuron_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="neuron", task=task - ) - neuron_config = neuron_config_constructor(model.config, dynamic_batch_size=dynamic_batch_size, **input_shapes) - if atol is None: - atol = neuron_config.ATOL_FOR_VALIDATION - model_name = model.name_or_path.split("/")[-1] - output_model_names = {model_name: "model.neuron"} - models_and_neuron_configs = {model_name: (model, neuron_config)} - maybe_save_preprocessors(model, output.parent) - - if is_stable_diffusion: - check_compiler_compatibility_for_stable_diffusion() - if is_neuron_available(): - raise RuntimeError( - "Stable diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead." - ) - input_shapes = infer_stable_diffusion_shapes_from_diffusers(input_shapes, model) - - # Saving the model config and preprocessor as this is needed sometimes. 
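A minimal sketch of how the new encoder/decoder export path can be driven programmatically, assuming a T5 checkpoint and placeholder static shapes (passing an empty `compiler_kwargs` is an assumption here, meant to defer to the compiler defaults):

    from pathlib import Path

    from optimum.exporters.neuron import main_export

    # Compiles the encoder and the decoder of the seq2seq model into two separate Neuron artifacts.
    main_export(
        model_name_or_path="t5-small",      # placeholder checkpoint
        output=Path("t5_small_neuron"),
        compiler_kwargs={},                 # assumption: compiler defaults are acceptable
        task="text2text-generation",
        batch_size=1,
        sequence_length=64,
        num_beams=4,                        # mandatory axis for t5-encoder / t5-decoder
    )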
- model.scheduler.save_pretrained(output.joinpath("scheduler")) - if hasattr(model, "tokenizer") and model.tokenizer is not None: - model.tokenizer.save_pretrained(output.joinpath("tokenizer")) - if hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None: - model.tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) - if hasattr(model, "feature_extractor"): - model.feature_extractor.save_pretrained(output.joinpath("feature_extractor")) - model.save_config(output) - - models_and_neuron_configs = get_stable_diffusion_models_for_export( - pipeline=model, - task=task, - dynamic_batch_size=dynamic_batch_size, - **input_shapes, - ) - output_model_names = { - DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), - } - if hasattr(model, "text_encoder") and model.text_encoder is not None: - output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = os.path.join( - DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME - ) - if hasattr(model, "text_encoder_2") and model.text_encoder_2 is not None: - output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = os.path.join( - DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME - ) - del model + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs( + model=model, + input_shapes=input_shapes, + task=task, + output=output, + dynamic_batch_size=dynamic_batch_size, + model_name_or_path=model_name_or_path, + ) _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 3f18d67d6..42fc7d593 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -78,6 +78,10 @@ class TextSeq2SeqNeuronConfig(NeuronConfig): DummySeq2SeqPastKeyValuesGenerator, ) + @property + def is_decoder(self) -> bool: + raise NotImplementedError() + @property def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = [] diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 2466cf9c5..185bbf1d3 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -428,7 +428,8 @@ def export_neuronx( aliases = {} if model.config.is_encoder_decoder: checked_model = config.patch_model_for_export(model, **input_shapes) - aliases = config.generate_io_aliases(checked_model) + if getattr(config, "is_decoder", False): + aliases = config.generate_io_aliases(checked_model) else: checked_model = config.patch_model_for_export(model, dummy_inputs) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 92d566323..3d3bd3395 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -381,6 +381,10 @@ class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): allow_new=True, ) + @property + def is_decoder(self) -> bool: + return False + def patch_model_for_export(self, model, **kwargs): num_beams = kwargs.pop("num_beams", 1) return super().patch_model_for_export( @@ -405,6 +409,10 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): allow_new=True, ) + @property + def is_decoder(self) -> bool: + return True + @property def inputs(self) -> List[str]: common_inputs = super().inputs + ["beam_idx", "beam_scores"] diff --git 
a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index d4c9a85bf..04cec9208 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -326,8 +326,8 @@ def override_diffusers_2_0_attn_processors(model): def get_encoder_decoder_models_for_export( model: "PreTrainedModel", - encoder_input_shapes: Dict[str, int], - decoder_input_shapes: Dict[str, int], + task: str, + input_shapes: Dict[str, int], dynamic_batch_size: Optional[bool] = False, ) -> Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]: """ @@ -339,10 +339,8 @@ def get_encoder_decoder_models_for_export( Args: model ("PreTrainedModel"): The model to export. - encoder_input_shapes (`Dict[str, int]`): - Static shapes used for compiling the encoder. - decoder_input_shapes (`Dict[str, int]`): - Static shapes used for compiling the decoder. + input_shapes (`Dict[str, int]`): + Static shapes used for compiling the encoder and the decoder. dynamic_batch_size (`bool`, defaults to `False`): Whether the Neuron compiled model supports dynamic batch size. @@ -350,31 +348,31 @@ def get_encoder_decoder_models_for_export( `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and Neuron configs for the different components of the model. """ - models_for_export = [] + models_for_export = {} # Encoder model_type = getattr(model.config, "model_type") + "-encoder" encoder_config_constructor = TasksManager.get_exporter_config_constructor( - exporter="neuron", model_type=model_type, task="text2text-generation" + exporter="neuron", model_type=model_type, task=task ) encoder_neuron_config = encoder_config_constructor( config=model.config, - task="text2text-generation", + task=task, dynamic_batch_size=dynamic_batch_size, - **encoder_input_shapes, + **input_shapes, ) models_for_export[ENCODER_NAME] = (model, encoder_neuron_config) # Decoder model_type = getattr(model.config, "model_type") + "-decoder" decoder_config_constructor = TasksManager.get_exporter_config_constructor( - exporter="neuron", model_type=model_type, task="text2text-generation" + exporter="neuron", model_type=model_type, task=task ) decoder_neuron_config = decoder_config_constructor( config=model.config, - task="text2text-generation", + task=task, dynamic_batch_size=dynamic_batch_size, - **decoder_input_shapes, + **input_shapes, ) models_for_export[DECODER_NAME] = (model, decoder_neuron_config) From 1eaa54adccc87e73a3421eb5d6840393b09827c1 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 25 Oct 2023 22:55:44 +0000 Subject: [PATCH 06/30] validation --- optimum/exporters/neuron/config.py | 21 +++++- optimum/exporters/neuron/convert.py | 12 ++-- optimum/exporters/neuron/model_configs.py | 13 ++-- optimum/exporters/neuron/model_wrappers.py | 83 +++++++++++++++------- 4 files changed, 91 insertions(+), 38 deletions(-) diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 42fc7d593..82cbf4450 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -88,7 +88,6 @@ def inputs(self) -> Dict[str, Dict[int, str]]: # encoder + decoder without past if "encoder" in self.MODEL_TYPE: common_inputs = ["input_ids", "attention_mask"] - # decoder with past if "decoder" in self.MODEL_TYPE: common_inputs = [ @@ -102,12 +101,28 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: + common_outputs = [] # encoder + decoder without past if "encoder" in self.MODEL_TYPE: - common_outputs = 
["present_key_values_self_attn", "past_key_values_cross_attn"] + common_outputs = ( + [f"present.{idx}.self.key" for idx in range(self._config.num_decoder_layers)] + + [f"present.{idx}.self.value" for idx in range(self._config.num_decoder_layers)] + + [f"present.{idx}.cross.key" for idx in range(self._config.num_decoder_layers)] + + [f"present.{idx}.cross.value" for idx in range(self._config.num_decoder_layers)] + ) # decoder with past if "decoder" in self.MODEL_TYPE: - common_outputs = ["next_tokens", "past_key_values_self_attn", "past_key_values_cross_attn"] + beam_outputs = ( + ["next_token_scores", "next_tokens", "next_indices"] if self.num_beams > 1 else ["next_tokens"] + ) + # for i in range(self._config.num_decoder_layers): + common_outputs = ( + beam_outputs + + [f"past.{idx}.self.key" for idx in range(self._config.num_decoder_layers)] + + [f"past.{idx}.self.value" for idx in range(self._config.num_decoder_layers)] + + [f"past.{idx}.cross.key" for idx in range(self._config.num_decoder_layers)] + + [f"past.{idx}.cross.value" for idx in range(self._config.num_decoder_layers)] + ) return common_outputs def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 185bbf1d3..c4072cc2a 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -169,8 +169,12 @@ def validate_model_outputs( with torch.no_grad(): reference_model.eval() ref_inputs = config.generate_dummy_inputs(return_tuple=False, **input_shapes) - if hasattr(config._config, "_class_name") and "AutoencoderKL" in config._config._class_name: - # VAE components for stable diffusion + if reference_model.config.is_encoder_decoder: + reference_model = config.patch_model_for_export(reference_model, device="cpu", **input_shapes) + if ( + hasattr(config._config, "_class_name") and "AutoencoderKL" in config._config._class_name + ) or reference_model.config.is_encoder_decoder: + # VAE components for stable diffusion or Encoder-Decoder models ref_inputs = tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) neuron_inputs = ref_inputs @@ -217,9 +221,9 @@ def validate_model_outputs( # Check the shape and values match shape_failures = [] value_failures = [] - for name, output in zip(neuron_output_names_list, neuron_outputs): + for i, (name, output) in enumerate(zip(neuron_output_names_list, neuron_outputs)): if isinstance(output, torch.Tensor): - ref_output = ref_outputs[name].numpy() + ref_output = ref_outputs[name].numpy() if isinstance(ref_outputs, Dict) else ref_outputs[i].numpy() output = output.numpy() elif isinstance(output, tuple): # eg. `hidden_states` of `AutoencoderKL` is a tuple of tensors. 
ref_output = torch.stack(ref_outputs[name]).numpy() diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 3d3bd3395..0b251cc27 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -372,6 +372,7 @@ class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-encoder" + CUSTOM_MODEL_WRAPPER = T5EncoderWrapper NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="d_model", num_attention_heads="num_heads", @@ -385,12 +386,12 @@ class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): def is_decoder(self) -> bool: return False - def patch_model_for_export(self, model, **kwargs): + def patch_model_for_export(self, model, device="xla", **kwargs): num_beams = kwargs.pop("num_beams", 1) return super().patch_model_for_export( model=model, - custom_model_wrapper=T5EncoderWrapper, - custom_wrapper_kwargs={"num_beams": num_beams}, + custom_model_wrapper=self.CUSTOM_MODEL_WRAPPER, + custom_wrapper_kwargs={"num_beams": num_beams, "device": device}, ) @@ -400,6 +401,7 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): DUMMY_INPUT_GENERATOR_CLASSES = TextSeq2SeqNeuronConfig.DUMMY_INPUT_GENERATOR_CLASSES + (DummyBeamValuesGenerator,) MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-decoder" + CUSTOM_MODEL_WRAPPER = T5DecoderWrapper NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="d_model", num_attention_heads="num_heads", @@ -438,11 +440,12 @@ def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGen dummy_inputs_generators.append(dummy_beam_values_generator) return dummy_inputs_generators - def patch_model_for_export(self, model, **kwargs): + def patch_model_for_export(self, model, device="xla", **kwargs): return super().patch_model_for_export( model=model, - custom_model_wrapper=T5DecoderWrapper, + custom_model_wrapper=self.CUSTOM_MODEL_WRAPPER, custom_wrapper_kwargs={ + "device": device, "batch_size": kwargs.pop("batch_size", 1), "sequence_length": kwargs.pop("sequence_length", 1), "num_beams": kwargs.pop("num_beams", 1), diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 8477fdd07..6bc869203 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -48,13 +48,14 @@ def __init__( self, model: "PreTrainedModel", num_beams: int = 1, + device: str = "xla", tp_degree=None, ): super().__init__() self.model = model self.config = model.config self.num_beams = num_beams - self.device = "xla" + self.device = device self.tp_degree = tp_degree def forward(self, input_ids, attention_mask): @@ -120,46 +121,72 @@ class T5DecoderWrapper(torch.nn.Module): """Wrapper to trace the decoder with past keys values with a language head.""" def __init__( - self, model: "PreTrainedModel", batch_size: int, sequence_length: int, num_beams: int = 1, tp_degree=None + self, + model: "PreTrainedModel", + batch_size: int, + sequence_length: int, + num_beams: int = 1, + device: str = "xla", + tp_degree=None, ): super().__init__() self.model = model self.config = model.config self.batch_size = batch_size + self.sequence_length = sequence_length self.num_beams = num_beams - self.device = "xla" + self.device = device self.tp_degree = tp_degree # Initialize KV cache (num_beams, n_heads, seq_length, dim_per_head) - 
self.past_key_values_sa = torch.nn.ParameterList( - [ - torch.nn.Parameter( - torch.ones( - ( - self.batch_size * self.num_beams, - self.config.num_heads, - sequence_length - 1, - self.config.d_kv, - ), - dtype=torch.float32, - ), - requires_grad=False, + if device == "cpu": + self.past_key_values_sa = [ + torch.ones( + (num_beams, self.config.num_heads, self.sequence_length - 1, self.config.d_kv), dtype=torch.float32 ) for _ in range(self.config.num_decoder_layers * 2) ] - ) - self.past_key_values_ca = torch.nn.ParameterList( - [ - torch.nn.Parameter( - torch.ones( - (self.batch_size * self.num_beams, self.config.num_heads, sequence_length, self.config.d_kv), - dtype=torch.float32, - ), - requires_grad=False, + self.past_key_values_ca = [ + torch.ones( + (num_beams, self.config.num_heads, self.sequence_length, self.config.d_kv), dtype=torch.float32 ) for _ in range(self.config.num_decoder_layers * 2) ] - ) + elif device == "xla": + self.past_key_values_sa = torch.nn.ParameterList( + [ + torch.nn.Parameter( + torch.ones( + ( + self.batch_size * self.num_beams, + self.config.num_heads, + sequence_length - 1, + self.config.d_kv, + ), + dtype=torch.float32, + ), + requires_grad=False, + ) + for _ in range(self.config.num_decoder_layers * 2) + ] + ) + self.past_key_values_ca = torch.nn.ParameterList( + [ + torch.nn.Parameter( + torch.ones( + ( + self.batch_size * self.num_beams, + self.config.num_heads, + sequence_length, + self.config.d_kv, + ), + dtype=torch.float32, + ), + requires_grad=False, + ) + for _ in range(self.config.num_decoder_layers * 2) + ] + ) def update_past(self, past_key_values): new_past_sa = [] @@ -235,6 +262,10 @@ def forward( past_key_values_sa = [vec for kv_per_layer in past_key_values_sa for vec in kv_per_layer] past_key_values_ca = [vec for kv_per_layer in past_key_values_ca for vec in kv_per_layer] + if self.device == "cpu": + self.past_key_values_sa = past_key_values_sa + self.past_key_values_ca = past_key_values_ca + # We calculate topk inside the wrapper next_token_logits = lm_logits[:, -1, :] From 2231afbdd9db0dd9b07ffd357947bed7cfec3b35 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 26 Oct 2023 17:24:28 +0000 Subject: [PATCH 07/30] add seq2seq base model --- optimum/exporters/neuron/utils.py | 11 ++ optimum/neuron/__init__.py | 2 + optimum/neuron/modeling_diffusion.py | 2 + optimum/neuron/modeling_seq2seq.py | 156 +++++++++++++++++++++++---- 4 files changed, 149 insertions(+), 22 deletions(-) diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 04cec9208..29e898f2d 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -324,6 +324,15 @@ def override_diffusers_2_0_attn_processors(model): return model +def check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes): + mandatory_shapes = neuron_config_constructor.func.get_mandatory_axes_for_task(task) + for name in mandatory_shapes: + if input_shapes.get(name, None) is None: + raise AttributeError( + f"Cannot find the value of `{name}` which is mandatory for exporting the model to the neuron format, please set the value explicitly." 
+ ) + + def get_encoder_decoder_models_for_export( model: "PreTrainedModel", task: str, @@ -355,6 +364,7 @@ def get_encoder_decoder_models_for_export( encoder_config_constructor = TasksManager.get_exporter_config_constructor( exporter="neuron", model_type=model_type, task=task ) + check_mandatory_input_shapes(encoder_config_constructor, task, input_shapes) encoder_neuron_config = encoder_config_constructor( config=model.config, task=task, @@ -368,6 +378,7 @@ def get_encoder_decoder_models_for_export( decoder_config_constructor = TasksManager.get_exporter_config_constructor( exporter="neuron", model_type=model_type, task=task ) + check_mandatory_input_shapes(encoder_config_constructor, task, input_shapes) decoder_neuron_config = decoder_config_constructor( config=model.config, task=task, diff --git a/optimum/neuron/__init__.py b/optimum/neuron/__init__.py index 12dcb93a9..1398d3f8a 100644 --- a/optimum/neuron/__init__.py +++ b/optimum/neuron/__init__.py @@ -41,6 +41,7 @@ "NeuronStableDiffusionXLInpaintPipeline", ], "modeling_decoder": ["NeuronDecoderModel"], + "modeling_seq2seq": ["NeuronModelForSeq2SeqLM"], "accelerate": [ "NeuronAccelerator", "NeuronAcceleratorState", @@ -71,6 +72,7 @@ NeuronStableDiffusionXLInpaintPipeline, NeuronStableDiffusionXLPipeline, ) + from .modeling_seq2seq import NeuronModelForSeq2SeqLM from .pipelines import pipeline from .trainers import NeuronTrainer, Seq2SeqNeuronTrainer from .training_args import NeuronTrainingArguments, Seq2SeqNeuronTrainingArguments diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index d2e947d47..5f0befdd1 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -360,6 +360,7 @@ def _from_pretrained( config: Dict[str, Any], use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, + force_download: bool = False, cache_dir: Optional[str] = None, text_encoder_file_name: Optional[str] = NEURON_FILE_NAME, text_encoder_2_file_name: Optional[str] = NEURON_FILE_NAME, @@ -400,6 +401,7 @@ def _from_pretrained( local_files_only=local_files_only, use_auth_token=use_auth_token, revision=revision, + force_download=force_download, allow_patterns=allow_patterns, ignore_patterns=["*.msgpack", "*.safetensors", "*.bin"], ) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index f39505cba..39346fd3d 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -13,17 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
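Note on the expected artifact layout: the module added below loads the two compiled graphs from the same directory structure that the export path above writes, i.e. — assuming `ENCODER_NAME`/`DECODER_NAME` resolve to `encoder`/`decoder` and `NEURON_FILE_NAME` to `model.neuron` (their definitions live in constant.py and are not shown in this series) — an `encoder/` folder and a `decoder/` folder, each holding a `config.json` next to the compiled `model.neuron`, with an optional generation config looked up under the decoder subfolder.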
"""NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +import os from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union import torch -from transformers import AutoModelForSeq2SeqLM +from huggingface_hub import snapshot_download +from transformers import AutoConfig, AutoModelForSeq2SeqLM +from ..exporters.neuron import ( + NeuronConfig, + main_export, +) +from ..exporters.neuron.model_configs import * # noqa: F403 +from ..exporters.tasks import TasksManager +from ..utils.save_utils import maybe_load_preprocessors from .generation import NeuronGenerationMixin -from .modeling_base import NeuronBaseModel, NeuronConfig +from .modeling_base import NeuronBaseModel from .utils import ( + DECODER_NAME, + ENCODER_NAME, NEURON_FILE_NAME, is_neuronx_available, ) @@ -43,23 +54,34 @@ def __init__( self, encoder: torch.jit._script.ScriptModule, decoder: torch.jit._script.ScriptModule, - config: "PretrainedConfig", + configs: Optional[Dict[str, "PretrainedConfig"]] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - model_file_name: Optional[str] = None, + encoder_file_name: Optional[str] = NEURON_FILE_NAME, + decoder_file_name: Optional[str] = NEURON_FILE_NAME, preprocessors: Optional[List] = None, - neuron_config: Optional["NeuronConfig"] = None, + neuron_configs: Optional[Dict[str, "NeuronConfig"]] = None, **kwargs, ): - pass - - @staticmethod - def load_model( - encoder_path: Union[str, Path], - decoder_path: Optional[Union[str, Path]] = None, - device_ids: Optional[List[int]] = None, - dynamic_batch_size: bool = False, - ): - pass + self.encoder = NeuronEncoder( + encoder, + self, + self.configs[ENCODER_NAME], + self.neuron_configs[ENCODER_NAME], + ) + self.decoder = NeuronEncoder( + decoder, + self, + self.configs[DECODER_NAME], + self.neuron_configs[DECODER_NAME], + ) + self.configs = configs + self.neuron_configs = neuron_configs + self.dynamic_batch_size = all( + neuron_config._config.neuron["dynamic_batch_size"] for neuron_config in self.neuron_configs.values() + ) + self._attributes_init(model_save_dir, preprocessors, **kwargs) + self.encoder_file_name = encoder_file_name + self.decoder_file_name = decoder_file_name def _save_pretrained( self, @@ -76,13 +98,14 @@ def _save_pretrained( save_directory (`Union[str, Path`]): The directory where to save the model files. 
""" - pass + save_directory = Path(save_directory) + # TODO @classmethod def _from_pretrained( cls, model_id: Union[str, Path], - config: Dict[str, Any], + config: "PretrainedConfig", use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -92,10 +115,63 @@ def _from_pretrained( subfolder: str = "", local_files_only: bool = False, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - device_ids: Optional[List[int]] = None, **kwargs, ): - pass + import pdb + + pdb.set_trace() + patterns = {ENCODER_NAME, DECODER_NAME} + + if not os.path.isdir(model_id): + allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} + # Downloads all repo's files matching the allowed patterns + model_id = snapshot_download( + model_id, + cache_dir=cache_dir, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + allow_patterns=allow_patterns, + ignore_patterns=["*.msgpack", "*.safetensors", "*.bin"], # only download *.neuron artifacts + ) + + preprocessors = maybe_load_preprocessors(model_id, subfolder=subfolder) + + new_model_save_dir = Path(model_id) + + model_and_config_save_paths = { + "encoder": ( + new_model_save_dir / ENCODER_NAME / encoder_file_name, + new_model_save_dir / ENCODER_NAME / cls.config_name, + ), + "decoder": ( + new_model_save_dir / DECODER_NAME / decoder_file_name, + new_model_save_dir / DECODER_NAME / cls.config_name, + ), + } + + # Re-build pretrained configs and neuron configs + configs, neuron_configs = {}, {} + for name, file_paths in model_and_config_save_paths.items(): + if file_paths[1].is_file(): + model_config = AutoConfig.from_json_file(file_paths[1]) + configs[name] = model_config + neuron_configs[name] = cls._neuron_config_init(model_config) + + encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) + decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) + + return cls( + encoder=encoder, + decoder=decoder, + configs=configs, + model_save_dir=model_save_dir, + encoder_file_name=encoder_file_name, + decoder_file_name=decoder_file_name, + preprocessors=preprocessors, + neuron_configs=neuron_configs, + ) @classmethod def _from_transformers( @@ -115,9 +191,45 @@ def _from_transformers( disable_fast_relayout: Optional[bool] = False, disable_fallback: bool = False, dynamic_batch_size: bool = False, - device_ids: Optional[List[int]] = None, + **kwargs_shapes, ) -> "NeuronModelForConditionalGeneration": - pass + if task is None: + task = TasksManager.infer_task_from_model(cls.auto_model_class) + + # Get compilation arguments + auto_cast_type = None if auto_cast is None else auto_cast_type + compiler_kwargs = { + "auto_cast": auto_cast, + "auto_cast_type": auto_cast_type, + "disable_fast_relayout": disable_fast_relayout, + "disable_fallback": disable_fallback, + } + + save_dir = TemporaryDirectory() + save_dir_path = Path(save_dir.name) + + main_export( + model_name_or_path=model_id, + output=save_dir_path, + compiler_kwargs=compiler_kwargs, + task=task, + dynamic_batch_size=dynamic_batch_size, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + subfolder=subfolder, + revision=revision, + force_download=force_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + do_validation=False, + **kwargs_shapes, + ) + + return cls._from_pretrained( + model_id=save_dir_path, + config=config, + model_save_dir=save_dir, + ) class 
NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): From 72ed695cceb4b247eb9106c30b7195032ee14d5a Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 30 Oct 2023 18:04:38 +0000 Subject: [PATCH 08/30] modeling export and loading --- optimum/commands/export/neuronx.py | 1 + optimum/exporters/neuron/__main__.py | 2 +- optimum/exporters/neuron/convert.py | 4 +- optimum/neuron/modeling_base.py | 3 +- optimum/neuron/modeling_diffusion.py | 14 ++--- optimum/neuron/modeling_seq2seq.py | 86 +++++++++++++++++++++++--- optimum/neuron/utils/argument_utils.py | 12 ++-- 7 files changed, 99 insertions(+), 23 deletions(-) diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py index 616cee693..d73f252bb 100644 --- a/optimum/commands/export/neuronx.py +++ b/optimum/commands/export/neuronx.py @@ -105,6 +105,7 @@ def parse_args_neuronx(parser: "ArgumentParser"): input_group.add_argument( "--num_beams", type=int, + default=1, help=f"Number of beams for beam search {doc_input}", ) input_group.add_argument( diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index f0402552d..c671f9cb0 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -261,7 +261,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( def _get_submodels_and_neuron_configs_for_encoder_decoder( - model: Union["PreTrainedModel", "DiffusionPipeline"], + model: "PreTrainedModel", input_shapes: Dict[str, int], task: str, output: Path, diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index c4072cc2a..da4667fb0 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -328,7 +328,7 @@ def export_models( if is_diffusers_available() and isinstance(model_config, FrozenDict): model_config = OrderedDict(model_config) model_config = DiffusersPretrainedConfig.from_dict(model_config) - + model_config = store_compilation_config( config=model_config, input_shapes=sub_neuron_config.input_shapes, @@ -343,6 +343,8 @@ def export_models( ) if isinstance(model_config, PretrainedConfig): model_config = DiffusersPretrainedConfig.from_dict(model_config.__dict__) + import pdb + pdb.set_trace() model_config.save_pretrained(output_path.parent) except Exception as e: failed_models.append((i, model_name)) diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 7e6e3da3f..fa21a16c1 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -408,8 +408,9 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronConfig": # Neuron config constructuor task = getattr(config, "task") or TasksManager.infer_task_from_model(cls.auto_model_class) task = TasksManager.map_from_synonym(task) + model_type = neuron_configs.get("model_type", None) or config.model_type neuron_config_constructor = TasksManager.get_exporter_config_constructor( - model_type=config.model_type, exporter="neuron", task=task + model_type=model_type, exporter="neuron", task=task ) return neuron_config_constructor( diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 5f0befdd1..d7ee37ab1 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -294,6 +294,12 @@ def _save_pretrained( """ Saves the model to the serialized format optimized for Neuron devices. 
""" + if self.model_and_config_save_paths is None: + logger.warning( + "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." + ) + return + save_directory = Path(save_directory) if not self.model_and_config_save_paths.get(DIFFUSION_MODEL_VAE_ENCODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DIFFUSION_MODEL_VAE_ENCODER_NAME) @@ -304,13 +310,7 @@ def _save_pretrained( if not self.model_and_config_save_paths.get(DIFFUSION_MODEL_TEXT_ENCODER_2_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DIFFUSION_MODEL_TEXT_ENCODER_2_NAME) - if self.model_and_config_save_paths is None: - logger.warning( - "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." - ) - return - else: - logger.info(f"Saving the {tuple(self.model_and_config_save_paths.keys())}...") + logger.info(f"Saving the {tuple(self.model_and_config_save_paths.keys())}...") dst_paths = { DIFFUSION_MODEL_TEXT_ENCODER_NAME: save_directory diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 39346fd3d..9bb3df27c 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -14,6 +14,8 @@ # limitations under the License. """NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" import os +import shutil +import logging from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -21,7 +23,7 @@ import torch from huggingface_hub import snapshot_download -from transformers import AutoConfig, AutoModelForSeq2SeqLM +from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig from ..exporters.neuron import ( NeuronConfig, @@ -46,9 +48,12 @@ if is_neuronx_available(): pass +logger = logging.getLogger(__name__) + class NeuronModelForConditionalGeneration(NeuronBaseModel): base_model_prefix = "neuron_model" + config_name = "config.json" def __init__( self, @@ -60,6 +65,7 @@ def __init__( decoder_file_name: Optional[str] = NEURON_FILE_NAME, preprocessors: Optional[List] = None, neuron_configs: Optional[Dict[str, "NeuronConfig"]] = None, + generation_config: Optional[GenerationConfig] = None, **kwargs, ): self.encoder = NeuronEncoder( @@ -82,6 +88,10 @@ def __init__( self._attributes_init(model_save_dir, preprocessors, **kwargs) self.encoder_file_name = encoder_file_name self.decoder_file_name = decoder_file_name + + if generation_config is None: + generation_config = GenerationConfig.from_model_config(self.configs[DECODER_NAME]) + self.generation_config = generation_config def _save_pretrained( self, @@ -98,8 +108,49 @@ def _save_pretrained( save_directory (`Union[str, Path`]): The directory where to save the model files. """ + if self.model_and_config_save_paths is None: + logger.warning( + "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." 
+ ) + return + save_directory = Path(save_directory) - # TODO + if not self.model_and_config_save_paths.get(ENCODER_NAME)[0].is_file(): + self.model_and_config_save_paths.pop(ENCODER_NAME) + + if not self.model_and_config_save_paths.get(DECODER_NAME)[0].is_file(): + self.model_and_config_save_paths.pop(DECODER_NAME) + + dst_paths = { + ENCODER_NAME: save_directory / ENCODER_NAME / encoder_file_name, + DECODER_NAME: save_directory / DECODER_NAME / decoder_file_name, + } + + model_src_to_dst_path = { + self.model_and_config_save_paths[model_name][0]: dst_paths[model_name] + for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) + } + # save + config_src_to_dst_path = { + self.model_and_config_save_paths[model_name][1]: dst_paths[model_name].parent / self.config_name + for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) + } + + src_paths = list(model_src_to_dst_path.keys()) + list(config_src_to_dst_path.keys()) + dst_paths = list(model_src_to_dst_path.values()) + list(config_src_to_dst_path.values()) + + for src_path, dst_path in zip(src_paths, dst_paths): + dst_path.parent.mkdir(parents=True, exist_ok=True) + if src_path.is_file(): + shutil.copyfile(src_path, dst_path) + + src_paths = [Path(path) for path in self.onnx_paths] + dst_paths = [save_directory / path.name for path in src_paths] + + if self.tokenizer is not None: + self.tokenizer.save_pretrained(save_directory) + + self.generation_config.save_pretrained(save_directory) @classmethod def _from_pretrained( @@ -117,13 +168,9 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - import pdb - - pdb.set_trace() - patterns = {ENCODER_NAME, DECODER_NAME} + model_id = str(model_id) if not os.path.isdir(model_id): - allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} # Downloads all repo's files matching the allowed patterns model_id = snapshot_download( model_id, @@ -132,7 +179,6 @@ def _from_pretrained( use_auth_token=use_auth_token, revision=revision, force_download=force_download, - allow_patterns=allow_patterns, ignore_patterns=["*.msgpack", "*.safetensors", "*.bin"], # only download *.neuron artifacts ) @@ -155,12 +201,33 @@ def _from_pretrained( configs, neuron_configs = {}, {} for name, file_paths in model_and_config_save_paths.items(): if file_paths[1].is_file(): - model_config = AutoConfig.from_json_file(file_paths[1]) + model_config = AutoConfig.from_pretrained(file_paths[1]) configs[name] = model_config neuron_configs[name] = cls._neuron_config_init(model_config) encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) + + # TODO: Debug num_beams unmatched issue + import pdb + pdb.set_trace() + + if model_save_dir is None: + model_save_dir = new_model_save_dir + + generation_config = None + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=os.path.join(subfolder, DECODER_NAME), + ) + except OSError: + logger.info("Generation config file not found, using a generation config created from the model config.") return cls( encoder=encoder, @@ -171,6 +238,7 @@ def _from_pretrained( decoder_file_name=decoder_file_name, preprocessors=preprocessors, neuron_configs=neuron_configs, + generation_config=generation_config, 
) @classmethod diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index 68c79b684..eb24d169f 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -172,6 +172,13 @@ def store_compilation_config( config_args["input_names"] = input_names config_args["output_names"] = output_names + + original_model_type = getattr(config, "model_type", None) + neuron_model_type = str(model_type).replace("_", "-") + if original_model_type is None: + update_func("model_type", neuron_model_type) # Add model_type to the config if it doesn't exist before, eg. submodel of Stable Diffusion. + elif neuron_model_type != original_model_type: + config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. update_func("neuron", config_args) @@ -179,10 +186,7 @@ def store_compilation_config( import diffusers update_func("_diffusers_version", diffusers.__version__) - - model_type = getattr(config, "model_type", None) or model_type - model_type = str(model_type).replace("_", "-") - update_func("model_type", model_type) + update_func("task", task) return config From 16ddeeb4aa83430ec9b6771503841d7c38f01086 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Sun, 5 Nov 2023 16:43:41 +0000 Subject: [PATCH 09/30] fix style --- optimum/exporters/neuron/convert.py | 7 +++--- optimum/exporters/neuron/model_configs.py | 7 ++++++ optimum/neuron/modeling_diffusion.py | 2 +- optimum/neuron/modeling_seq2seq.py | 27 ++++++++++++++--------- optimum/neuron/utils/argument_utils.py | 8 ++++--- 5 files changed, 33 insertions(+), 18 deletions(-) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index da4667fb0..6d2c20071 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -328,7 +328,7 @@ def export_models( if is_diffusers_available() and isinstance(model_config, FrozenDict): model_config = OrderedDict(model_config) model_config = DiffusersPretrainedConfig.from_dict(model_config) - + model_config = store_compilation_config( config=model_config, input_shapes=sub_neuron_config.input_shapes, @@ -343,8 +343,6 @@ def export_models( ) if isinstance(model_config, PretrainedConfig): model_config = DiffusersPretrainedConfig.from_dict(model_config.__dict__) - import pdb - pdb.set_trace() model_config.save_pretrained(output_path.parent) except Exception as e: failed_models.append((i, model_name)) @@ -453,6 +451,9 @@ def export_neuronx( # diffusers specific compiler_args = add_stable_diffusion_compiler_args(config, compiler_args) + import pdb + + pdb.set_trace() neuron_model = neuronx.trace( checked_model, dummy_inputs_tuple, diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index a92681b43..4ea1beff6 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -394,6 +394,13 @@ def patch_model_for_export(self, model, device="xla", **kwargs): custom_wrapper_kwargs={"num_beams": num_beams, "device": device}, ) + # def generate_dummy_inputs(self, **kwargs): + # batch_size = kwargs.pop("batch_size") * kwargs.get("num_beams") + # dummy_inputs = super().generate_dummy_inputs(batch_size=batch_size, **kwargs) + + # return dummy_inputs + + @register_in_tasks_manager("opt", "text-generation") class OPTNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "opt.model.OPTForSampling" diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 
d7ee37ab1..569191387 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -299,7 +299,7 @@ def _save_pretrained( "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." ) return - + save_directory = Path(save_directory) if not self.model_and_config_save_paths.get(DIFFUSION_MODEL_VAE_ENCODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DIFFUSION_MODEL_VAE_ENCODER_NAME) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 9bb3df27c..89e9c86c5 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +import logging import os import shutil -import logging from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -88,7 +88,7 @@ def __init__( self._attributes_init(model_save_dir, preprocessors, **kwargs) self.encoder_file_name = encoder_file_name self.decoder_file_name = decoder_file_name - + if generation_config is None: generation_config = GenerationConfig.from_model_config(self.configs[DECODER_NAME]) self.generation_config = generation_config @@ -113,19 +113,19 @@ def _save_pretrained( "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." ) return - + save_directory = Path(save_directory) if not self.model_and_config_save_paths.get(ENCODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(ENCODER_NAME) if not self.model_and_config_save_paths.get(DECODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DECODER_NAME) - + dst_paths = { ENCODER_NAME: save_directory / ENCODER_NAME / encoder_file_name, DECODER_NAME: save_directory / DECODER_NAME / decoder_file_name, } - + model_src_to_dst_path = { self.model_and_config_save_paths[model_name][0]: dst_paths[model_name] for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) @@ -135,7 +135,7 @@ def _save_pretrained( self.model_and_config_save_paths[model_name][1]: dst_paths[model_name].parent / self.config_name for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) } - + src_paths = list(model_src_to_dst_path.keys()) + list(config_src_to_dst_path.keys()) dst_paths = list(model_src_to_dst_path.values()) + list(config_src_to_dst_path.values()) @@ -143,10 +143,10 @@ def _save_pretrained( dst_path.parent.mkdir(parents=True, exist_ok=True) if src_path.is_file(): shutil.copyfile(src_path, dst_path) - + src_paths = [Path(path) for path in self.onnx_paths] dst_paths = [save_directory / path.name for path in src_paths] - + if self.tokenizer is not None: self.tokenizer.save_pretrained(save_directory) @@ -205,16 +205,21 @@ def _from_pretrained( configs[name] = model_config neuron_configs[name] = cls._neuron_config_init(model_config) + # TODO: Debug num_beams unmatched issue + import pdb + + pdb.set_trace() encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) - + # TODO: Debug num_beams unmatched issue import pdb + pdb.set_trace() - + if model_save_dir is None: model_save_dir = new_model_save_dir - + generation_config = None try: generation_config = GenerationConfig.from_pretrained( diff --git a/optimum/neuron/utils/argument_utils.py 
b/optimum/neuron/utils/argument_utils.py index eb24d169f..b7e9b4ab0 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -172,11 +172,13 @@ def store_compilation_config( config_args["input_names"] = input_names config_args["output_names"] = output_names - + original_model_type = getattr(config, "model_type", None) neuron_model_type = str(model_type).replace("_", "-") if original_model_type is None: - update_func("model_type", neuron_model_type) # Add model_type to the config if it doesn't exist before, eg. submodel of Stable Diffusion. + update_func( + "model_type", neuron_model_type + ) # Add model_type to the config if it doesn't exist before, eg. submodel of Stable Diffusion. elif neuron_model_type != original_model_type: config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. @@ -186,7 +188,7 @@ def store_compilation_config( import diffusers update_func("_diffusers_version", diffusers.__version__) - + update_func("task", task) return config From 3efdbc860f2621f57b8a5a5bf7d6e6d9af9a3f6e Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Sun, 5 Nov 2023 22:25:40 +0000 Subject: [PATCH 10/30] finish base modeling funcs --- optimum/exporters/neuron/convert.py | 3 -- optimum/neuron/modeling_base.py | 3 -- optimum/neuron/modeling_seq2seq.py | 64 ++++++++++++----------------- 3 files changed, 27 insertions(+), 43 deletions(-) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 6d2c20071..c4072cc2a 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -451,9 +451,6 @@ def export_neuronx( # diffusers specific compiler_args = add_stable_diffusion_compiler_args(config, compiler_args) - import pdb - - pdb.set_trace() neuron_model = neuronx.trace( checked_model, dummy_inputs_tuple, diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index fa21a16c1..05790c084 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -373,9 +373,6 @@ def _attributes_init( self.preprocessors = preprocessors if preprocessors is not None else [] - self.input_names = getattr(self.config, "input_names", []) - self.output_names = getattr(self.config, "output_names", []) - # Registers the NeuronModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating # a pipeline https://github.com/huggingface/transformers/blob/3d3204c025b6b5de013e07dd364208e28b4d9589/src/transformers/pipelines/base.py#L940 AutoConfig.register(self.model_type, AutoConfig) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 89e9c86c5..5c614073f 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -19,7 +19,7 @@ from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch from huggingface_hub import snapshot_download @@ -46,7 +46,7 @@ from transformers import PretrainedConfig if is_neuronx_available(): - pass + import torch_neuronx logger = logging.getLogger(__name__) @@ -66,8 +66,13 @@ def __init__( preprocessors: Optional[List] = None, neuron_configs: Optional[Dict[str, "NeuronConfig"]] = None, generation_config: Optional[GenerationConfig] = None, + model_and_config_save_paths: Optional[Dict[str, Tuple[str, Path]]] = None, **kwargs, ): + self.configs = 
configs + self.neuron_configs = neuron_configs + self._attributes_init(model_save_dir, preprocessors, **kwargs) + self.model_and_config_save_paths = model_and_config_save_paths if model_and_config_save_paths else None self.encoder = NeuronEncoder( encoder, self, @@ -80,12 +85,9 @@ def __init__( self.configs[DECODER_NAME], self.neuron_configs[DECODER_NAME], ) - self.configs = configs - self.neuron_configs = neuron_configs self.dynamic_batch_size = all( neuron_config._config.neuron["dynamic_batch_size"] for neuron_config in self.neuron_configs.values() ) - self._attributes_init(model_save_dir, preprocessors, **kwargs) self.encoder_file_name = encoder_file_name self.decoder_file_name = decoder_file_name @@ -121,35 +123,20 @@ def _save_pretrained( if not self.model_and_config_save_paths.get(DECODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DECODER_NAME) - dst_paths = { - ENCODER_NAME: save_directory / ENCODER_NAME / encoder_file_name, - DECODER_NAME: save_directory / DECODER_NAME / decoder_file_name, - } - - model_src_to_dst_path = { - self.model_and_config_save_paths[model_name][0]: dst_paths[model_name] - for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) - } - # save - config_src_to_dst_path = { - self.model_and_config_save_paths[model_name][1]: dst_paths[model_name].parent / self.config_name - for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) - } - - src_paths = list(model_src_to_dst_path.keys()) + list(config_src_to_dst_path.keys()) - dst_paths = list(model_src_to_dst_path.values()) + list(config_src_to_dst_path.values()) + dst_paths = [ + save_directory / ENCODER_NAME / encoder_file_name, + save_directory / DECODER_NAME / decoder_file_name, + ] + src_paths = [ + Path(self.model_and_config_save_paths[model_name][0]) + for model_name in set(self.model_and_config_save_paths.keys()).intersection([ENCODER_NAME, DECODER_NAME]) + ] for src_path, dst_path in zip(src_paths, dst_paths): dst_path.parent.mkdir(parents=True, exist_ok=True) if src_path.is_file(): shutil.copyfile(src_path, dst_path) - src_paths = [Path(path) for path in self.onnx_paths] - dst_paths = [save_directory / path.name for path in src_paths] - - if self.tokenizer is not None: - self.tokenizer.save_pretrained(save_directory) - self.generation_config.save_pretrained(save_directory) @classmethod @@ -205,17 +192,9 @@ def _from_pretrained( configs[name] = model_config neuron_configs[name] = cls._neuron_config_init(model_config) - # TODO: Debug num_beams unmatched issue - import pdb - - pdb.set_trace() encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) - - # TODO: Debug num_beams unmatched issue - import pdb - - pdb.set_trace() + torch_neuronx.move_trace_to_device(decoder, 0) if model_save_dir is None: model_save_dir = new_model_save_dir @@ -244,6 +223,7 @@ def _from_pretrained( preprocessors=preprocessors, neuron_configs=neuron_configs, generation_config=generation_config, + model_and_config_save_paths=model_and_config_save_paths, ) @classmethod @@ -266,6 +246,11 @@ def _from_transformers( dynamic_batch_size: bool = False, **kwargs_shapes, ) -> "NeuronModelForConditionalGeneration": + if dynamic_batch_size is True: + logger.warning( + "Sequence-to-sequence models don't support dynamic batch size yet, `dynamic_batch_size` will be set to False." 
+ ) + if task is None: task = TasksManager.infer_task_from_model(cls.auto_model_class) @@ -304,6 +289,11 @@ def _from_transformers( model_save_dir=save_dir, ) + def _save_config(self, save_directory): + save_directory = Path(save_directory) + self.configs[ENCODER_NAME].save_pretrained(save_directory / ENCODER_NAME) + self.configs[DECODER_NAME].save_pretrained(save_directory / DECODER_NAME) + class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM From cdc885eba6ac0b6ce6340dc4209420201d2ff39f Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 6 Nov 2023 09:58:48 +0000 Subject: [PATCH 11/30] quick test inference --- optimum/neuron/modeling_seq2seq.py | 450 ++++++++++++++++++++++++++++- 1 file changed, 446 insertions(+), 4 deletions(-) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 5c614073f..2332b3a3b 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -19,11 +19,25 @@ from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch from huggingface_hub import snapshot_download -from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig +from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig, PreTrainedTokenizerBase +from transformers.generation.beam_search import BeamScorer +from transformers.generation.logits_process import ( + LogitsProcessorList, +) +from transformers.generation.stopping_criteria import ( + MaxLengthCriteria, + MaxTimeCriteria, + StoppingCriteriaList, +) +from transformers.generation.utils import ( + BeamSearchOutput, + GreedySearchOutput, +) +from transformers.modeling_outputs import ModelOutput, Seq2SeqLMOutput from ..exporters.neuron import ( NeuronConfig, @@ -291,14 +305,442 @@ def _from_transformers( def _save_config(self, save_directory): save_directory = Path(save_directory) - self.configs[ENCODER_NAME].save_pretrained(save_directory / ENCODER_NAME) - self.configs[DECODER_NAME].save_pretrained(save_directory / DECODER_NAME) + config = self.configs[ENCODER_NAME].copy() + encoder_neuron_config = self.configs[ENCODER_NAME].neuron + decoder_neuron_config = self.configs[DECODER_NAME].neuron + # TODO: Combine encoder decoder config and save in root + combined_config_args = {} + config.__setattr__("neuron", combined_config_args) + config.save_pretrained(save_directory / ENCODER_NAME) class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" + def _prepare_encoder_decoder_kwargs_for_generation( + self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None + ) -> Dict[str, Any]: + encoder = self.get_encoder() + model_kwargs["encoder_outputs"]: ModelOutput = encoder(inputs_tensor, model_kwargs["attention_mask"]) + return model_kwargs + + def _update_model_kwargs_for_xla_generation( + self, + model_kwargs: Dict[str, Any], + batch_size: int, + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + max_length: Optional[int] = None, + seq_length: Optional[int] = None, + use_cache: bool = True, + ) -> Dict[str, Any]: + def _update_attention(model_kwargs, is_encoder_decoder): + """Updates the appropriate attention mask -- encoder-decoder 
models use `decoder_attention_mask`""" + + attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" + attention_mask = model_kwargs.pop(attention_mask_name) + attention_mask_update_slice = torch.ones( + (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device + ) + attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) + mask = {attention_mask_name: attention_mask} + return mask + + mask = _update_attention(model_kwargs, is_encoder_decoder) + # sets the updated variables (mask and past_key_values) + model_kwargs.update(mask) + + # Set a mock cache tensor + model_kwargs["past_key_values"] = torch.tensor([]) + + return model_kwargs + + def _reorder_cache(self, past_key_values, beam_idx): + """ + This is needed for beam search and not greedy sampling + We reorder the cache within the trace so we can skip it in modelling_t5.py. So we override the _reorder_cache + """ + self.beam_idx = beam_idx + return past_key_values + + def generate( + self, + tokenizer: "PreTrainedTokenizerBase", + prompt: str, + max_length: int, + num_beams: int, + num_return_sequences: int, + device: str, + ): + batch_encoding = tokenizer( + prompt, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt" + ) + + past_key_values = self.encoder(batch_encoding["input_ids"], batch_encoding["attention_mask"]) + + decoder_attention_mask = torch.cat( + [torch.zeros((1, max_length - 1), dtype=torch.int32), torch.ones((1, 1), dtype=torch.int32)], axis=1 + ) + + # copy the new cache state to the decoder + if device == "xla": + for state, tensor in zip(self.decoder.parameters(), past_key_values): + state.copy_(tensor) + else: + # First half of the cache is self attention and the rest is cross attention + self.decoder.past_key_values_sa = past_key_values[: len(past_key_values) // 2] + self.decoder.past_key_values_ca = past_key_values[len(past_key_values) // 2 :] + + output = super().generate( + **batch_encoding, + max_length=max_length, + num_beams=num_beams, + num_return_sequences=num_return_sequences, + do_sample=False, + use_cache=True, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs={"last_hidden_state": torch.ones((1, 128, 1))}, + ) # Pass fake encoder_outputs so the transfomers code will not invoke the encoder + return output + + def forward( + self, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + beam_scores=None, + **kwargs, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + hidden_states = encoder_outputs["last_hidden_state"] + + if not hasattr(self, "beam_idx"): + # Infering the number of beams from the attention mask + num_beams = attention_mask.shape[0] + self.beam_idx = torch.arange(0, num_beams, dtype=torch.int64) + + decoder_outputs = self.decoder( + decoder_input_ids, decoder_attention_mask, hidden_states, attention_mask, self.beam_idx, beam_scores + ) + + # lm_logits = decoder_outputs[0] + next_token_scores = decoder_outputs[0] + next_tokens = decoder_outputs[1] + next_indices = decoder_outputs[2] + + return next_token_scores, next_tokens, next_indices + + def beam_search( + self, + input_ids: torch.LongTensor, + beam_scorer: "BeamScorer", + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + 
pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = False, + seq_length: Optional[int] = None, + **model_kwargs, + ) -> Union[BeamSearchOutput, torch.LongTensor]: + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + # Overwrite cur_len + cur_len = seq_length + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + beam_indices = ( + tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + ) + + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores_device = "cpu" + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + while True: + # prepare model inputs + # From max_length-sized input_ids, select first + # cur_len - 1 values. 
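+                # `input_ids` stays padded to `max_length`, so instead of a dynamic slice we
+                # build (row, column) index pairs and gather the token at column `cur_len - 1`
+                # for every beam; this avoids data-dependent shapes in the traced graph.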
+ update_indices = torch.stack( + [torch.arange(input_ids.size(0)), torch.tensor(cur_len - 1).repeat(input_ids.size(0))], dim=-1 + ) + input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] + model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) + + next_token_scores, next_tokens, next_indices = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + beam_scores=beam_scores, + ) + + # stateless + beam_outputs = beam_scorer.process( + input_ids.to("cpu")[:, :cur_len], + next_token_scores.to("cpu"), + next_tokens.to("cpu"), + next_indices.to("cpu"), + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + ) + + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + update_indices = torch.stack( + [torch.arange(batch_beam_size), torch.tensor(cur_len - 1).repeat(batch_beam_size)], dim=-1 + ) + update_indices_2 = torch.stack( + [torch.arange(batch_beam_size), torch.tensor(cur_len).repeat(batch_beam_size)], dim=-1 + ) + # First select beam_indices + device = input_ids.device + beam_idx_device = beam_idx.to(device=input_ids.device) + input_ids[:, :] = input_ids[beam_idx_device.long(), :] + + # Then append new tokens + input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = ( + beam_next_tokens.unsqueeze(-1).to(device).to(torch.long) + ) + input_ids = input_ids * 1 # Hack to materialize tensor + + # update generated ids, model inputs, and length for next step + model_kwargs = self._update_model_kwargs_for_xla_generation( + model_kwargs, + batch_size=batch_beam_size, + is_encoder_decoder=self.config.is_encoder_decoder, + max_length=stopping_criteria.max_length, + seq_length=cur_len, + use_cache=model_kwargs["use_cache"], + ) + if model_kwargs["past_key_values"] is not None: + model_kwargs["past_key_values"] = self._reorder_cache( + model_kwargs["past_key_values"], beam_idx.to(torch.int64) + ) + + if return_dict_in_generate and output_scores: + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) + + # increase cur_len + cur_len = cur_len + 1 + + # stop when each sentence is finished, or if we exceed the maximum length + stop_criterion_1 = beam_scorer.is_done + if isinstance(stopping_criteria, list): + if len(stopping_criteria) == 1: + stopping_criteria = stopping_criteria[0] + + # Cases that can be handled in XLA without requiring + # non-padded input_ids + if isinstance(stopping_criteria, MaxLengthCriteria): + stop_criterion_2 = cur_len >= stopping_criteria.max_length + elif isinstance(stopping_criteria, MaxTimeCriteria): + stop_criterion_2 = stopping_criteria(input_ids, scores) + else: + # Other cases will be handled on CPU + batch_size, _ = input_ids.shape + input_ids_cpu = input_ids.to("cpu") + mask = torch.cat( + [torch.ones(batch_size, cur_len), torch.zeros(batch_size, input_ids.shape[1] - cur_len)], dim=1 + ).bool() + input_ids_cpu = torch.masked_select(input_ids_cpu, mask).reshape((batch_size, cur_len)) + scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores + stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) + + if stop_criterion_1 or stop_criterion_2: + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids.to("cpu"), + beam_scores.to("cpu"), + next_tokens.to("cpu"), + next_indices.to("cpu"), + 
pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + ) + + for k, v in sequence_outputs.items(): + if type(v) == torch.Tensor: + sequence_outputs[k] = sequence_outputs[k].to(input_ids.device) + + return sequence_outputs["sequences"] + + def greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional["LogitsProcessorList"] = None, + stopping_criteria: Optional["StoppingCriteriaList"] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + seq_length: Optional[int] = int, + streamer: Optional["BaseStreamer"] = None, + **model_kwargs, + ) -> Union[GreedySearchOutput, torch.LongTensor]: + """ + Overriding greedy sampling to use next tokens returned from neuron device instead of logits. + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + use_cache = model_kwargs["use_cache"] if "use_cache" in model_kwargs else False + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # keep track of which sequences are already finished + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + + this_peer_finished = False # used by synced_gpus only + while True: + # prepare model inputs + # From max_length-sized input_ids, select first + # seq_length - 1 values. 
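+                # On the first step there is no cache yet, so the whole prefix up to
+                # `seq_length` is selected; on later steps only the token at column
+                # `seq_length - 1` is gathered with static index pairs, keeping the padded
+                # `input_ids` at a fixed shape.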
+ + if model_kwargs.get("past_key_values") is None: + input_ids_ = input_ids[:, :seq_length] + else: + update_indices = torch.stack( + [torch.arange(input_ids.size(0)), torch.tensor(seq_length - 1).repeat(input_ids.size(0))], + dim=-1, + ) + input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] + + model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) + + # forward pass to get next token + output = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + next_tokens = output[0] + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + + batch_size, _ = input_ids.shape + update_indices = torch.stack( + [torch.arange(batch_size), torch.tensor(seq_length).repeat(batch_size)], dim=-1 + ) + input_ids[update_indices[:, 0], update_indices[:, 1]] = next_tokens[:] + model_kwargs = self._update_model_kwargs_for_xla_generation( + model_kwargs, + batch_size=batch_size, + is_encoder_decoder=self.config.is_encoder_decoder, + max_length=stopping_criteria.max_length, + seq_length=seq_length, + use_cache=use_cache, + ) + + seq_length += 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + + # stop when each sentence is finished, or if we exceed the maximum length + stop_criterion_1 = unfinished_sequences.max() == 0 + + if isinstance(stopping_criteria, list): + if len(stopping_criteria) == 1: + stopping_criteria = stopping_criteria[0] + + # Cases that can be handled in XLA without requiring + # non-padded input_ids + if isinstance(stopping_criteria, MaxLengthCriteria): + stop_criterion_2 = seq_length >= stopping_criteria.max_length + elif isinstance(stopping_criteria, MaxTimeCriteria): + stop_criterion_2 = stopping_criteria(input_ids, scores) + else: + # Other cases will be handled on CPU + batch_size, _ = input_ids.shape + mask = torch.cat( + [torch.ones(batch_size, seq_length), torch.zeros(batch_size, input_ids.shape[1] - seq_length)], + dim=1, + ).bool() + input_ids_cpu = torch.masked_select(input_ids, mask).reshape((batch_size, seq_length)).to("cpu") + scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores + stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) + + if stop_criterion_1 or stop_criterion_2: + this_peer_finished = True + + if this_peer_finished: + break + + if streamer is not None: + streamer.end() + + return input_ids + class _NeuronSeq2SeqModelPart: """ From 2384e522c8e118ea46a380e40691a2059f0c0ab6 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 7 Nov 2023 23:50:10 +0000 Subject: [PATCH 12/30] fix config loding --- optimum/neuron/modeling_seq2seq.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 2332b3a3b..344d6dad5 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -305,13 +305,27 @@ def _from_transformers( def _save_config(self, 
save_directory): save_directory = Path(save_directory) - config = self.configs[ENCODER_NAME].copy() - encoder_neuron_config = self.configs[ENCODER_NAME].neuron - decoder_neuron_config = self.configs[DECODER_NAME].neuron - # TODO: Combine encoder decoder config and save in root - combined_config_args = {} - config.__setattr__("neuron", combined_config_args) - config.save_pretrained(save_directory / ENCODER_NAME) + self.configs[ENCODER_NAME].save_pretrained(save_directory / ENCODER_NAME) + self.configs[DECODER_NAME].save_pretrained(save_directory / DECODER_NAME) + combined_config = self._combine_encoder_decoder_config( + encoder_config=self.configs[ENCODER_NAME], + decoder_config=self.configs[DECODER_NAME], + ) + combined_config.save_pretrained(save_directory) + + def _combine_encoder_decoder_config(self, encoder_config: "PretrainedConfig", decoder_config: "PretrainedConfig"): + encoder_neuron_config = encoder_config.neuron + decoder_neuron_config = decoder_config.neuron + + encoder_neuron_config["encoder_input_names"] = encoder_neuron_config.pop("input_names") + encoder_neuron_config["encoder_output_names"] = encoder_neuron_config.pop("output_names") + decoder_neuron_config["decoder_input_names"] = decoder_neuron_config.pop("input_names") + decoder_neuron_config["decoder_output_names"] = decoder_neuron_config.pop("output_names") + + neuron_config = encoder_neuron_config.update(decoder_neuron_config) + encoder_config.__setattr__("neuron", neuron_config) + + return encoder_config class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): From ae9df1add5e98573d6f98250f7b600c45939a8bf Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 9 Nov 2023 00:26:55 +0000 Subject: [PATCH 13/30] finish modeling, works --- optimum/exporters/neuron/__main__.py | 6 +- optimum/neuron/generation/utils.py | 2 +- optimum/neuron/modeling_base.py | 9 +- optimum/neuron/modeling_seq2seq.py | 269 +++++++++++++++++---------- 4 files changed, 179 insertions(+), 107 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index c671f9cb0..ecdf76ba5 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -193,11 +193,11 @@ def _get_submodels_and_neuron_configs( is_encoder_decoder = model.config.is_encoder_decoder if is_stable_diffusion: - return _get_submodels_and_neuron_configs_for_stable_diffusion( + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_diffusion( model, input_shapes, task, output, dynamic_batch_size ) elif is_encoder_decoder: - return _get_submodels_and_neuron_configs_for_encoder_decoder( + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_encoder_decoder( model, input_shapes, task, output, dynamic_batch_size, model_name_or_path ) else: @@ -209,7 +209,7 @@ def _get_submodels_and_neuron_configs( output_model_names = {model_name: "model.neuron"} models_and_neuron_configs = {model_name: (model, neuron_config)} maybe_save_preprocessors(model_name_or_path, output) - return models_and_neuron_configs, output_model_names + return models_and_neuron_configs, output_model_names def _get_submodels_and_neuron_configs_for_stable_diffusion( diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index ce6f93e8b..81a5c3fa2 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -967,7 +967,7 @@ def generate( "`streamer` cannot be used with beam search (yet!). 
Make sure that `num_beams` is set to 1." ) - if self.device.type != input_ids.device.type: + if hasattr(self, "device") and self.device.type != input_ids.device.type: warnings.warn( "You are calling .generate() with the `input_ids` being on a device type different" f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 05790c084..0d26adaee 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -480,10 +480,15 @@ def _pad_to_compiled_shape(self, inputs: Dict[str, "torch.Tensor"]): # Pad to batch size: dimension 0 (pad_token_id can't be 0) padding = (0,) * len(padding) - if self.neuron_config.dynamic_batch_size is True and input_tensor.size(0) % target_shapes[0] == 0: + is_encoder_decoder = getattr(self.config, "is_encoder_decoder", False) + if ( + not is_encoder_decoder + and self.neuron_config.dynamic_batch_size is True + and input_tensor.size(0) % target_shapes[0] == 0 + ): inputs[input_name] = input_tensor continue - elif self.neuron_config.dynamic_batch_size is True: + elif not is_encoder_decoder and self.neuron_config.dynamic_batch_size is True: target_shape = (input_tensor.size(0) // target_shapes[0] + 1) * target_shapes[0] to_pad = target_shape - input_tensor.size(0) else: diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 344d6dad5..1ae49e721 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. """NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +import copy import logging import os import shutil from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch from huggingface_hub import snapshot_download -from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig, PreTrainedTokenizerBase +from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig from transformers.generation.beam_search import BeamScorer from transformers.generation.logits_process import ( LogitsProcessorList, @@ -37,7 +38,7 @@ BeamSearchOutput, GreedySearchOutput, ) -from transformers.modeling_outputs import ModelOutput, Seq2SeqLMOutput +from transformers.modeling_outputs import Seq2SeqLMOutput from ..exporters.neuron import ( NeuronConfig, @@ -57,7 +58,8 @@ if TYPE_CHECKING: - from transformers import PretrainedConfig + from transformers import PretrainedConfig, PreTrainedModel + from transformers.generation.streamers import BaseStreamer if is_neuronx_available(): import torch_neuronx @@ -73,18 +75,23 @@ def __init__( self, encoder: torch.jit._script.ScriptModule, decoder: torch.jit._script.ScriptModule, - configs: Optional[Dict[str, "PretrainedConfig"]] = None, + config: "PretrainedConfig", model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, encoder_file_name: Optional[str] = NEURON_FILE_NAME, decoder_file_name: Optional[str] = NEURON_FILE_NAME, preprocessors: Optional[List] = None, neuron_configs: Optional[Dict[str, "NeuronConfig"]] = None, + configs: Optional[Dict[str, "PretrainedConfig"]] = None, generation_config: Optional[GenerationConfig] = None, model_and_config_save_paths: 
Optional[Dict[str, Tuple[str, Path]]] = None, **kwargs, ): + self.config = config self.configs = configs self.neuron_configs = neuron_configs + self.input_static_shapes = NeuronModelForConditionalGeneration.get_input_static_shapes( + self.neuron_configs[ENCODER_NAME] + ) # only for the encoder self._attributes_init(model_save_dir, preprocessors, **kwargs) self.model_and_config_save_paths = model_and_config_save_paths if model_and_config_save_paths else None self.encoder = NeuronEncoder( @@ -93,7 +100,7 @@ def __init__( self.configs[ENCODER_NAME], self.neuron_configs[ENCODER_NAME], ) - self.decoder = NeuronEncoder( + self.decoder = NeuronDecoder( decoder, self, self.configs[DECODER_NAME], @@ -142,8 +149,8 @@ def _save_pretrained( save_directory / DECODER_NAME / decoder_file_name, ] src_paths = [ - Path(self.model_and_config_save_paths[model_name][0]) - for model_name in set(self.model_and_config_save_paths.keys()).intersection([ENCODER_NAME, DECODER_NAME]) + Path(self.model_and_config_save_paths[ENCODER_NAME][0]), + Path(self.model_and_config_save_paths[DECODER_NAME][0]), ] for src_path, dst_path in zip(src_paths, dst_paths): @@ -206,8 +213,13 @@ def _from_pretrained( configs[name] = model_config neuron_configs[name] = cls._neuron_config_init(model_config) - encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) - decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) + # Initialize Neuron Runtime before loading models + runtime = torch.classes.neuron.Runtime() + runtime.initialize() + runtime.set_default_neuron_cores(0, 1) + + encoder = cls.load_model(model_and_config_save_paths[ENCODER_NAME][0]) + decoder = cls.load_model(model_and_config_save_paths[DECODER_NAME][0]) torch_neuronx.move_trace_to_device(decoder, 0) if model_save_dir is None: @@ -230,12 +242,13 @@ def _from_pretrained( return cls( encoder=encoder, decoder=decoder, - configs=configs, + config=config, model_save_dir=model_save_dir, encoder_file_name=encoder_file_name, decoder_file_name=decoder_file_name, preprocessors=preprocessors, neuron_configs=neuron_configs, + configs=configs, generation_config=generation_config, model_and_config_save_paths=model_and_config_save_paths, ) @@ -316,108 +329,30 @@ def _save_config(self, save_directory): def _combine_encoder_decoder_config(self, encoder_config: "PretrainedConfig", decoder_config: "PretrainedConfig"): encoder_neuron_config = encoder_config.neuron decoder_neuron_config = decoder_config.neuron + combined_config = copy.deepcopy(encoder_config) encoder_neuron_config["encoder_input_names"] = encoder_neuron_config.pop("input_names") encoder_neuron_config["encoder_output_names"] = encoder_neuron_config.pop("output_names") decoder_neuron_config["decoder_input_names"] = decoder_neuron_config.pop("input_names") decoder_neuron_config["decoder_output_names"] = decoder_neuron_config.pop("output_names") - neuron_config = encoder_neuron_config.update(decoder_neuron_config) - encoder_config.__setattr__("neuron", neuron_config) + encoder_neuron_config.update(decoder_neuron_config) + encoder_neuron_config.pop("model_type") + combined_config.__setattr__("neuron", encoder_neuron_config) - return encoder_config + return combined_config + + def can_generate(self): + logger.warning( + "NeuronModelForConditionalGeneration is an abstract class and is not meant to be used for generation. Please use NeuronModelForSeq2SeqLM instead." 
+ ) + return False class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" - def _prepare_encoder_decoder_kwargs_for_generation( - self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None - ) -> Dict[str, Any]: - encoder = self.get_encoder() - model_kwargs["encoder_outputs"]: ModelOutput = encoder(inputs_tensor, model_kwargs["attention_mask"]) - return model_kwargs - - def _update_model_kwargs_for_xla_generation( - self, - model_kwargs: Dict[str, Any], - batch_size: int, - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - max_length: Optional[int] = None, - seq_length: Optional[int] = None, - use_cache: bool = True, - ) -> Dict[str, Any]: - def _update_attention(model_kwargs, is_encoder_decoder): - """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" - - attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" - attention_mask = model_kwargs.pop(attention_mask_name) - attention_mask_update_slice = torch.ones( - (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device - ) - attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) - mask = {attention_mask_name: attention_mask} - return mask - - mask = _update_attention(model_kwargs, is_encoder_decoder) - # sets the updated variables (mask and past_key_values) - model_kwargs.update(mask) - - # Set a mock cache tensor - model_kwargs["past_key_values"] = torch.tensor([]) - - return model_kwargs - - def _reorder_cache(self, past_key_values, beam_idx): - """ - This is needed for beam search and not greedy sampling - We reorder the cache within the trace so we can skip it in modelling_t5.py. 
So we override the _reorder_cache - """ - self.beam_idx = beam_idx - return past_key_values - - def generate( - self, - tokenizer: "PreTrainedTokenizerBase", - prompt: str, - max_length: int, - num_beams: int, - num_return_sequences: int, - device: str, - ): - batch_encoding = tokenizer( - prompt, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt" - ) - - past_key_values = self.encoder(batch_encoding["input_ids"], batch_encoding["attention_mask"]) - - decoder_attention_mask = torch.cat( - [torch.zeros((1, max_length - 1), dtype=torch.int32), torch.ones((1, 1), dtype=torch.int32)], axis=1 - ) - - # copy the new cache state to the decoder - if device == "xla": - for state, tensor in zip(self.decoder.parameters(), past_key_values): - state.copy_(tensor) - else: - # First half of the cache is self attention and the rest is cross attention - self.decoder.past_key_values_sa = past_key_values[: len(past_key_values) // 2] - self.decoder.past_key_values_ca = past_key_values[len(past_key_values) // 2 :] - - output = super().generate( - **batch_encoding, - max_length=max_length, - num_beams=num_beams, - num_return_sequences=num_return_sequences, - do_sample=False, - use_cache=True, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs={"last_hidden_state": torch.ones((1, 128, 1))}, - ) # Pass fake encoder_outputs so the transfomers code will not invoke the encoder - return output - def forward( self, attention_mask: Optional[torch.FloatTensor] = None, @@ -438,13 +373,59 @@ def forward( decoder_input_ids, decoder_attention_mask, hidden_states, attention_mask, self.beam_idx, beam_scores ) - # lm_logits = decoder_outputs[0] next_token_scores = decoder_outputs[0] next_tokens = decoder_outputs[1] next_indices = decoder_outputs[2] return next_token_scores, next_tokens, next_indices + def generate( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + num_return_sequences: Optional[int] = None, + device: str = "xla", + **kwargs, + ): + max_length = self.neuron_configs[ENCODER_NAME].sequence_length + num_beams = self.neuron_configs[ENCODER_NAME].num_beams + batch_size = self.neuron_configs[ENCODER_NAME].batch_size + + inputs = {"input_ids": input_ids} + if attention_mask is not None: + inputs["attention_mask"] = attention_mask + inputs = self._pad_to_compiled_shape(inputs) + + past_key_values = self.encoder(**inputs) + + decoder_attention_mask = torch.cat( + [torch.zeros((batch_size, max_length - 1), dtype=torch.int64), torch.ones((1, 1), dtype=torch.int64)], + axis=1, + ) + + # copy the new cache state to the decoder + for state, tensor in zip(self.decoder.model.parameters(), past_key_values): + state.copy_(tensor) + + output = super().generate( + **inputs, + max_length=max_length, + num_beams=num_beams, + num_return_sequences=num_return_sequences, + do_sample=False, + use_cache=True, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, + ) # Pass fake encoder_outputs so the transfomers code will not invoke the encoder + return output + def beam_search( self, 
input_ids: torch.LongTensor, @@ -642,6 +623,10 @@ def greedy_search( logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() use_cache = model_kwargs["use_cache"] if "use_cache" in model_kwargs else False stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + from transformers.generation.stopping_criteria import validate_stopping_criteria + + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id if isinstance(eos_token_id, int): @@ -755,6 +740,86 @@ def greedy_search( return input_ids + def _reorder_cache(self, past_key_values, beam_idx): + """ + The cache was reordered during the tracing of the decoder so we can skip it here. This is needed for beam search and not greedy sampling. + """ + self.beam_idx = beam_idx + return past_key_values + + def get_encoder(self) -> "NeuronEncoder": + return self.encoder + + def _update_model_kwargs_for_xla_generation( + self, + model_kwargs: Dict[str, Any], + batch_size: int, + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + max_length: Optional[int] = None, + seq_length: Optional[int] = None, + use_cache: bool = True, + ) -> Dict[str, Any]: + def _update_attention(model_kwargs, is_encoder_decoder): + """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" + + attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" + attention_mask = model_kwargs.pop(attention_mask_name) + attention_mask_update_slice = torch.ones( + (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device + ) + attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) + mask = {attention_mask_name: attention_mask} + return mask + + mask = _update_attention(model_kwargs, is_encoder_decoder) + # sets the updated variables (mask and past_key_values) + model_kwargs.update(mask) + + # Set a mock cache tensor + model_kwargs["past_key_values"] = torch.tensor([]) + + return model_kwargs + + # Override to cut the input_ids to just last token + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + decoder_attention_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids as past is cached + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "decoder_attention_mask": decoder_attention_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + def _validate_static_shape(self, input_shapes: List[int], target_shapes: List[int]) -> bool: + """ + Checks if a input needs to be padded. 
+ """ + return input_shapes == target_shapes + + def can_generate(self): + """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" + return True + class _NeuronSeq2SeqModelPart: """ @@ -790,6 +855,8 @@ class NeuronEncoder(_NeuronSeq2SeqModelPart): Encoder part of the encoder-decoder model for Neuron inference. (Actually it's a monolith of encoder + decoder without past_key_values to workaround the control flow in the decoder). """ + main_input_name = "input_ids" + def __init__( self, model: torch.jit._script.ScriptModule, From a3784cf9e5805962e05f8851b3689d52ceb7f262 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 9 Nov 2023 14:47:47 +0000 Subject: [PATCH 14/30] add part of tests --- optimum/exporters/neuron/__main__.py | 6 +- optimum/neuron/modeling_seq2seq.py | 26 +++---- tests/cli/test_export_cli.py | 29 ++++++++ tests/exporters/exporters_utils.py | 4 ++ tests/exporters/test_export.py | 102 +++++++++++++++------------ tests/generation/conftest.py | 45 ++++++++++-- tests/generation/test_export.py | 75 +++++++++++++------- tests/generation/test_generate.py | 12 ++-- tests/generation/test_hub.py | 78 +++++++++++++------- 9 files changed, 255 insertions(+), 122 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index ecdf76ba5..324c678ac 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Union from requests.exceptions import ConnectionError as RequestsConnectionError -from transformers import AutoConfig +from transformers import AutoConfig, PretrainedConfig from ...neuron.utils import ( DECODER_NAME, @@ -190,7 +190,9 @@ def _get_submodels_and_neuron_configs( model_name_or_path: Optional[Union[str, Path]] = None, ): is_stable_diffusion = "stable-diffusion" in task - is_encoder_decoder = model.config.is_encoder_decoder + is_encoder_decoder = ( + getattr(model.config, "is_encoder_decoder", False) if isinstance(model.config, PretrainedConfig) else False + ) if is_stable_diffusion: models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_diffusion( diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 1ae49e721..6a10ea7a5 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -387,11 +387,9 @@ def generate( logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - synced_gpus: Optional[bool] = None, assistant_model: Optional["PreTrainedModel"] = None, streamer: Optional["BaseStreamer"] = None, num_return_sequences: Optional[int] = None, - device: str = "xla", **kwargs, ): max_length = self.neuron_configs[ENCODER_NAME].sequence_length @@ -416,14 +414,21 @@ def generate( output = super().generate( **inputs, + generation_config=generation_config, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + assistant_model=assistant_model, + streamer=streamer, + num_return_sequences=num_return_sequences, max_length=max_length, num_beams=num_beams, - num_return_sequences=num_return_sequences, - do_sample=False, - use_cache=True, + do_sample=kwargs.pop("do_sample", False), + use_cache=kwargs.pop("use_cache", True), decoder_attention_mask=decoder_attention_mask, 
+ # Pass fake encoder_outputs so the transfomers code will not invoke the encoder encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, - ) # Pass fake encoder_outputs so the transfomers code will not invoke the encoder + ) return output def beam_search( @@ -432,7 +437,6 @@ def beam_search( beam_scorer: "BeamScorer", logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[Union[int, List[int]]] = None, output_attentions: Optional[bool] = None, @@ -443,6 +447,9 @@ def beam_search( seq_length: Optional[int] = None, **model_kwargs, ) -> Union[BeamSearchOutput, torch.LongTensor]: + """ + Overriding beam search to use next_token_scores returned from neuron device instead of logits. + """ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id @@ -580,8 +587,6 @@ def beam_search( if stop_criterion_1 or stop_criterion_2: if not synced_gpus: break - else: - this_peer_finished = True sequence_outputs = beam_scorer.finalize( input_ids.to("cpu"), @@ -642,9 +647,6 @@ def greedy_search( # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None # keep track of which sequences are already finished unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) diff --git a/tests/cli/test_export_cli.py b/tests/cli/test_export_cli.py index b0ed83121..d6c5d38f1 100644 --- a/tests/cli/test_export_cli.py +++ b/tests/cli/test_export_cli.py @@ -213,3 +213,32 @@ def test_stable_diffusion_xl(self): shell=False, check=True, ) + + @requires_neuronx + def test_t5(self): + model_id = "hf-internal-testing/tiny-random-t5" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + [ + "optimum-cli", + "export", + "neuron", + "--model", + model_id, + "--task", + "text2text-generation", + "--batch_size", + "1", + "--sequence_length", + "18", + "--num_beams", + "4", + "--auto_cast", + "matmul", + "--auto_cast_type", + "bf16", + tempdir, + ], + shell=False, + check=True, + ) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 55ed9fed5..7ad87ae36 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -32,6 +32,10 @@ "xlm-roberta": "hf-internal-testing/tiny-xlm-roberta", } +ENCODER_DECODER_MODELS_TINY = { + "t5": "hf-internal-testing/tiny-random-t5", +} + STABLE_DIFFUSION_MODELS_TINY = { "stable-diffusion": ["hf-internal-testing/tiny-stable-diffusion-torch"], "stable-diffusion-xl": ["echarlaix/tiny-random-stable-diffusion-xl"], diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index de1fa0dc0..2bd1981ee 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -14,7 +14,6 @@ # limitations under the License. 
import copy -import os import random import unittest from pathlib import Path @@ -22,7 +21,7 @@ from typing import Dict, Optional from parameterized import parameterized -from transformers import AutoConfig, set_seed +from transformers import AutoConfig, AutoModelForSeq2SeqLM, set_seed from transformers.testing_utils import require_vision from optimum.exporters.neuron import ( @@ -30,25 +29,17 @@ build_stable_diffusion_components_mandatory_shapes, export, export_models, - get_stable_diffusion_models_for_export, validate_model_outputs, validate_models_outputs, ) +from optimum.exporters.neuron.__main__ import _get_submodels_and_neuron_configs from optimum.exporters.neuron.model_configs import * # noqa: F403 from optimum.exporters.tasks import TasksManager -from optimum.neuron.utils import ( - DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, - DIFFUSION_MODEL_TEXT_ENCODER_NAME, - DIFFUSION_MODEL_UNET_NAME, - DIFFUSION_MODEL_VAE_DECODER_NAME, - DIFFUSION_MODEL_VAE_ENCODER_NAME, - NEURON_FILE_NAME, -) from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available, logging from optimum.utils.testing_utils import require_diffusers -from .exporters_utils import EXPORT_MODELS_TINY, STABLE_DIFFUSION_MODELS_TINY +from .exporters_utils import ENCODER_DECODER_MODELS_TINY, EXPORT_MODELS_TINY, STABLE_DIFFUSION_MODELS_TINY if is_diffusers_available(): @@ -164,29 +155,23 @@ class NeuronStableDiffusionExportTestCase(unittest.TestCase): """ @parameterized.expand(STABLE_DIFFUSION_MODELS_TINY["stable-diffusion"]) - def test_export_for_stable_diffusion_models(self, model_name): + def test_export_for_stable_diffusion_models(self, model_id): set_seed(SEED) # prepare neuron config / models - pipe = StableDiffusionPipeline.from_pretrained(model_name) + model = StableDiffusionPipeline.from_pretrained(model_id) input_shapes = build_stable_diffusion_components_mandatory_shapes( - **{"batch_size": 1, "height": 64, "width": 64} - ) - models_and_neuron_configs = get_stable_diffusion_models_for_export( - pipeline=pipe, - task="stable-diffusion", - dynamic_batch_size=False, - **input_shapes, + **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 4} ) - output_model_names = { - DIFFUSION_MODEL_TEXT_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), - } - with TemporaryDirectory() as tmpdirname: + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs( + model=model, + input_shapes=input_shapes, + task="stable-diffusion", + output=Path(tmpdirname), + model_name_or_path=model_id, + ) _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, output_dir=Path(tmpdirname), @@ -200,30 +185,59 @@ def test_export_for_stable_diffusion_models(self, model_name): ) @parameterized.expand(STABLE_DIFFUSION_MODELS_TINY["stable-diffusion-xl"]) - def test_export_for_stable_diffusion_xl_models(self, model_name): + def test_export_for_stable_diffusion_xl_models(self, model_id): set_seed(SEED) # prepare neuron config / models - pipe = StableDiffusionXLPipeline.from_pretrained(model_name) + model = StableDiffusionXLPipeline.from_pretrained(model_id) input_shapes = 
build_stable_diffusion_components_mandatory_shapes( - **{"batch_size": 1, "height": 64, "width": 64} - ) - models_and_neuron_configs = get_stable_diffusion_models_for_export( - pipeline=pipe, - task="stable-diffusion-xl", - dynamic_batch_size=False, - **input_shapes, + **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 4} ) - output_model_names = { - DIFFUSION_MODEL_TEXT_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_TEXT_ENCODER_2_NAME: os.path.join(DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), - } + with TemporaryDirectory() as tmpdirname: + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs( + model=model, + input_shapes=input_shapes, + task="stable-diffusion-xl", + output=Path(tmpdirname), + model_name_or_path=model_id, + ) + _, neuron_outputs = export_models( + models_and_neuron_configs=models_and_neuron_configs, + output_dir=Path(tmpdirname), + output_file_names=output_model_names, + ) + validate_models_outputs( + models_and_neuron_configs=models_and_neuron_configs, + neuron_named_outputs=neuron_outputs, + output_dir=Path(tmpdirname), + neuron_files_subpaths=output_model_names, + ) + + +@is_inferentia_test +@requires_neuronx +class NeuronEncoderDecoderExportTestCase(unittest.TestCase): + """ + Integration tests ensuring encoder-decoder models are correctly exported. + """ + + @parameterized.expand(ENCODER_DECODER_MODELS_TINY.items()) + def test_export_for_encoder_decoder_models(self, model_name, model_id): + set_seed(SEED) + + # prepare neuron config / models + model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + input_shapes = {"batch_size": 1, "sequence_length": 18, "num_beams": 4} with TemporaryDirectory() as tmpdirname: + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs( + model=model, + input_shapes=input_shapes, + task="text2text-generation", + output=Path(tmpdirname), + model_name_or_path=model_id, + ) _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, output_dir=Path(tmpdirname), diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py index 3997bc9a6..ccf40d151 100644 --- a/tests/generation/conftest.py +++ b/tests/generation/conftest.py @@ -17,7 +17,7 @@ import pytest from transformers import AutoTokenizer -from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import requires_neuronx from optimum.utils.testing_utils import USER @@ -29,24 +29,32 @@ "llama": "dacorvo/tiny-random-llama", "opt": "hf-internal-testing/tiny-random-OPTForCausalLM", } +SEQ2SEQ_MODEL_NAMES = { + "t5": "hf-internal-testing/tiny-random-t5", +} @pytest.fixture(scope="module", params=[DECODER_MODEL_NAMES[model_arch] for model_arch in DECODER_MODEL_ARCHITECTURES]) -def export_model_id(request): +def export_decoder_id(request): + return request.param + + +@pytest.fixture(scope="module", params=[SEQ2SEQ_MODEL_NAMES[model_arch] for model_arch in SEQ2SEQ_MODEL_NAMES]) +def export_seq2seq_id(request): return request.param @pytest.fixture(scope="module") @requires_neuronx -def neuron_model_path(export_model_id): 
+def neuron_decoder_path(export_decoder_id): model = NeuronModelForCausalLM.from_pretrained( - export_model_id, export=True, batch_size=1, sequence_length=100, num_cores=2 + export_decoder_id, export=True, batch_size=1, sequence_length=100, num_cores=2 ) model_dir = TemporaryDirectory() model_path = model_dir.name model.save_pretrained(model_path) del model - tokenizer = AutoTokenizer.from_pretrained(export_model_id) + tokenizer = AutoTokenizer.from_pretrained(export_decoder_id) tokenizer.save_pretrained(model_path) del tokenizer # Yield instead of returning to keep a reference to the temporary directory. @@ -56,8 +64,31 @@ def neuron_model_path(export_model_id): @pytest.fixture(scope="module") -def neuron_push_id(export_model_id): - model_name = export_model_id.split("/")[-1] +@requires_neuronx +def neuron_seq2seq_path(export_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, export=True, batch_size=1, sequence_length=32, num_beams=4 + ) + model_dir = TemporaryDirectory() + model_path = model_dir.name + model.save_pretrained(model_path) + del model + # Yield instead of returning to keep a reference to the temporary directory. + # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + yield model_path + + +@pytest.fixture(scope="module") +def neuron_push_decoder_id(export_decoder_id): + model_name = export_decoder_id.split("/")[-1] + repo_id = f"{USER}/{model_name}-neuronx" + return repo_id + + +@pytest.fixture(scope="module") +def neuron_push_seq2seq_id(export_seq2seq_id): + model_name = export_seq2seq_id.split("/")[-1] repo_id = f"{USER}/{model_name}-neuronx" return repo_id diff --git a/tests/generation/test_export.py b/tests/generation/test_export.py index e4eaef935..9b88b1515 100644 --- a/tests/generation/test_export.py +++ b/tests/generation/test_export.py @@ -16,34 +16,59 @@ import pytest from generation_utils import check_neuron_model -from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx -@pytest.mark.parametrize( - "batch_size, sequence_length, num_cores, auto_cast_type", - [ - [1, 100, 2, "fp32"], - [1, 100, 2, "fp16"], - [2, 100, 2, "fp16"], - ], -) -@is_inferentia_test -@requires_neuronx -def test_model_export(export_model_id, batch_size, sequence_length, num_cores, auto_cast_type): - model = NeuronModelForCausalLM.from_pretrained( - export_model_id, - export=True, - batch_size=batch_size, - sequence_length=sequence_length, - num_cores=num_cores, - auto_cast_type=auto_cast_type, +class DecoderTests: + @pytest.mark.parametrize( + "batch_size, sequence_length, num_cores, auto_cast_type", + [ + [1, 100, 2, "fp32"], + [1, 100, 2, "fp16"], + [2, 100, 2, "fp16"], + ], ) - check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type) + @is_inferentia_test + @requires_neuronx + def test_decoder_export(export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type): + model = NeuronModelForCausalLM.from_pretrained( + export_decoder_id, + export=True, + batch_size=batch_size, + sequence_length=sequence_length, + num_cores=num_cores, + auto_cast_type=auto_cast_type, + ) + check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type) + @is_inferentia_test + @requires_neuronx + def test_model_from_path(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + 
check_neuron_model(model) -@is_inferentia_test -@requires_neuronx -def test_model_from_path(neuron_model_path): - model = NeuronModelForCausalLM.from_pretrained(neuron_model_path) - check_neuron_model(model) + +class Seq2SeqTests: + @pytest.mark.parametrize( + "batch_size, sequence_length, num_beams", + [ + [1, 32, 1], + [1, 32, 4], + ], + ) + @is_inferentia_test + @requires_neuronx + def test_seq2seq_export(export_seq2seq_id, batch_size, sequence_length, num_beams): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, + export=True, + batch_size=batch_size, + sequence_length=sequence_length, + num_beams=num_beams, + ) + + @is_inferentia_test + @requires_neuronx + def test_model_from_path(neuron_seq2seq_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) diff --git a/tests/generation/test_generate.py b/tests/generation/test_generate.py index 47eecb8a7..04ec9b9ea 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -40,17 +40,17 @@ def _test_model_generation(model, tokenizer, batch_size, input_length, **gen_kwa ) @is_inferentia_test @requires_neuronx -def test_model_generation(neuron_model_path, gen_kwargs): - model = NeuronModelForCausalLM.from_pretrained(neuron_model_path) - tokenizer = AutoTokenizer.from_pretrained(neuron_model_path) +def test_model_generation(neuron_decoder_path, gen_kwargs): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) _test_model_generation(model, tokenizer, model.batch_size, 10, **gen_kwargs) @is_inferentia_test @requires_neuronx -def test_model_generation_input_dimensions(neuron_model_path): - model = NeuronModelForCausalLM.from_pretrained(neuron_model_path) - tokenizer = AutoTokenizer.from_pretrained(neuron_model_path) +def test_model_generation_input_dimensions(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) # Using valid input dimensions _test_model_generation(model, tokenizer, model.batch_size, model.max_length // 2) # Using an incompatible batch_size diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py index 2966e0199..e8f717677 100644 --- a/tests/generation/test_hub.py +++ b/tests/generation/test_hub.py @@ -18,33 +18,59 @@ from huggingface_hub import HfApi from transformers.testing_utils import ENDPOINT_STAGING -from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx from optimum.utils.testing_utils import TOKEN -@is_inferentia_test -@requires_neuronx -def test_model_from_hub(): - model = NeuronModelForCausalLM.from_pretrained( - "dacorvo/tiny-random-gpt2-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" - ) - check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32") - - -@is_inferentia_test -@requires_neuronx -def test_push_to_hub(neuron_model_path, neuron_push_id): - model = NeuronModelForCausalLM.from_pretrained(neuron_model_path) - model.push_to_hub(neuron_model_path, neuron_push_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - try: - hub_files_info = api.list_files_info(neuron_push_id) - hub_files_path = [info.rfilename for info in hub_files_info] - for path, _, files in os.walk(neuron_model_path): 
- for name in files: - local_file_path = os.path.join(path, name) - hub_file_path = os.path.relpath(local_file_path, neuron_model_path) - assert hub_file_path in hub_files_path - finally: - api.delete_repo(neuron_push_id) +class DecoderTests: + @is_inferentia_test + @requires_neuronx + def test_model_from_hub(): + model = NeuronModelForCausalLM.from_pretrained( + "dacorvo/tiny-random-gpt2-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" + ) + check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32") + + @is_inferentia_test + @requires_neuronx + def test_push_to_hub(neuron_decoder_path, neuron_push_decoder_id): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + model.push_to_hub(neuron_decoder_path, neuron_push_decoder_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) + api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + try: + hub_files_info = api.list_files_info(neuron_push_decoder_id) + hub_files_path = [info.rfilename for info in hub_files_info] + for path, _, files in os.walk(neuron_decoder_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_decoder_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_decoder_id) + + +class Seq2SeqTests: + @is_inferentia_test + @requires_neuronx + def test_model_from_hub(): + model = NeuronModelForSeq2SeqLM.from_pretrained( + "Jingya/tiny-random-t5-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" + ) + + @is_inferentia_test + @requires_neuronx + def test_push_seq2seq_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_push_seq2seq_id) + model.push_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) + api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + try: + hub_files_info = api.list_files_info(neuron_push_seq2seq_id) + hub_files_path = [info.rfilename for info in hub_files_info] + for path, _, files in os.walk(neuron_seq2seq_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_seq2seq_id) From 308c08e838509e5d734de61b88c9aa664d40bfe5 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 9 Nov 2023 15:45:11 +0000 Subject: [PATCH 15/30] tests done --- tests/generation/conftest.py | 18 +++++- tests/generation/test_export.py | 94 +++++++++++++++--------------- tests/generation/test_generate.py | 24 +++++++- tests/generation/test_hub.py | 95 ++++++++++++++++--------------- 4 files changed, 135 insertions(+), 96 deletions(-) diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py index ccf40d151..85f203f85 100644 --- a/tests/generation/conftest.py +++ b/tests/generation/conftest.py @@ -67,7 +67,23 @@ def neuron_decoder_path(export_decoder_id): @requires_neuronx def neuron_seq2seq_path(export_seq2seq_id): model = NeuronModelForSeq2SeqLM.from_pretrained( - export_seq2seq_id, export=True, batch_size=1, sequence_length=32, num_beams=4 + export_seq2seq_id, export=True, batch_size=1, sequence_length=64, num_beams=4 + ) + model_dir = TemporaryDirectory() + model_path = model_dir.name + model.save_pretrained(model_path) + del model + # Yield instead of returning to keep a reference to the temporary directory. 
+ # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + yield model_path + + +@pytest.fixture(scope="module") +@requires_neuronx +def neuron_seq2seq_greedy_path(export_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, export=True, batch_size=1, sequence_length=64, num_beams=1 ) model_dir = TemporaryDirectory() model_path = model_dir.name diff --git a/tests/generation/test_export.py b/tests/generation/test_export.py index 9b88b1515..32c53c4a4 100644 --- a/tests/generation/test_export.py +++ b/tests/generation/test_export.py @@ -20,55 +20,57 @@ from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx -class DecoderTests: - @pytest.mark.parametrize( - "batch_size, sequence_length, num_cores, auto_cast_type", - [ - [1, 100, 2, "fp32"], - [1, 100, 2, "fp16"], - [2, 100, 2, "fp16"], - ], +@pytest.mark.parametrize( + "batch_size, sequence_length, num_cores, auto_cast_type", + [ + [1, 100, 2, "fp32"], + [1, 100, 2, "fp16"], + [2, 100, 2, "fp16"], + ], +) +@is_inferentia_test +@requires_neuronx +def test_decoder_export(export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type): + model = NeuronModelForCausalLM.from_pretrained( + export_decoder_id, + export=True, + batch_size=batch_size, + sequence_length=sequence_length, + num_cores=num_cores, + auto_cast_type=auto_cast_type, ) - @is_inferentia_test - @requires_neuronx - def test_decoder_export(export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type): - model = NeuronModelForCausalLM.from_pretrained( - export_decoder_id, - export=True, - batch_size=batch_size, - sequence_length=sequence_length, - num_cores=num_cores, - auto_cast_type=auto_cast_type, - ) - check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type) + check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type) - @is_inferentia_test - @requires_neuronx - def test_model_from_path(neuron_decoder_path): - model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) - check_neuron_model(model) +@is_inferentia_test +@requires_neuronx +def test_model_from_path(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + check_neuron_model(model) -class Seq2SeqTests: - @pytest.mark.parametrize( - "batch_size, sequence_length, num_beams", - [ - [1, 32, 1], - [1, 32, 4], - ], + +@pytest.mark.parametrize( + "batch_size, sequence_length, num_beams", + [ + [1, 64, 1], + [1, 64, 4], + ], +) +@is_inferentia_test +@requires_neuronx +def test_seq2seq_export(export_seq2seq_id, batch_size, sequence_length, num_beams): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, + export=True, + batch_size=batch_size, + sequence_length=sequence_length, + num_beams=num_beams, ) - @is_inferentia_test - @requires_neuronx - def test_seq2seq_export(export_seq2seq_id, batch_size, sequence_length, num_beams): - model = NeuronModelForSeq2SeqLM.from_pretrained( - export_seq2seq_id, - export=True, - batch_size=batch_size, - sequence_length=sequence_length, - num_beams=num_beams, - ) + return model + - @is_inferentia_test - @requires_neuronx - def test_model_from_path(neuron_seq2seq_path): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) +@is_inferentia_test +@requires_neuronx +def test_seq2seq_model_from_path(neuron_seq2seq_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) + return model diff --git 
a/tests/generation/test_generate.py b/tests/generation/test_generate.py index 04ec9b9ea..06cbed335 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -17,7 +17,7 @@ import torch from transformers import AutoTokenizer -from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx @@ -40,7 +40,7 @@ def _test_model_generation(model, tokenizer, batch_size, input_length, **gen_kwa ) @is_inferentia_test @requires_neuronx -def test_model_generation(neuron_decoder_path, gen_kwargs): +def test_decoder_generation(neuron_decoder_path, gen_kwargs): model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) _test_model_generation(model, tokenizer, model.batch_size, 10, **gen_kwargs) @@ -59,3 +59,23 @@ def test_model_generation_input_dimensions(neuron_decoder_path): # Using an incompatible input length with pytest.raises(ValueError, match="The input sequence length"): _test_model_generation(model, tokenizer, model.batch_size, input_length=model.max_length * 2) + + +@is_inferentia_test +@requires_neuronx +def test_seq2seq_generation_beam(neuron_seq2seq_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_path) + inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") + output = model.generate(**inputs, num_return_sequences=1) + return output + + +@is_inferentia_test +@requires_neuronx +def test_seq2seq_generation_greedy(neuron_seq2seq_greedy_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_greedy_path) + inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") + output = model.generate(**inputs, num_return_sequences=1) + return output diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py index e8f717677..ff8e90615 100644 --- a/tests/generation/test_hub.py +++ b/tests/generation/test_hub.py @@ -23,54 +23,55 @@ from optimum.utils.testing_utils import TOKEN -class DecoderTests: - @is_inferentia_test - @requires_neuronx - def test_model_from_hub(): - model = NeuronModelForCausalLM.from_pretrained( - "dacorvo/tiny-random-gpt2-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" - ) - check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32") +@is_inferentia_test +@requires_neuronx +def test_model_from_hub(): + model = NeuronModelForCausalLM.from_pretrained( + "dacorvo/tiny-random-gpt2-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" + ) + check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32") - @is_inferentia_test - @requires_neuronx - def test_push_to_hub(neuron_decoder_path, neuron_push_decoder_id): - model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) - model.push_to_hub(neuron_decoder_path, neuron_push_decoder_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - try: - hub_files_info = api.list_files_info(neuron_push_decoder_id) - hub_files_path = [info.rfilename for info in hub_files_info] - for path, _, files in os.walk(neuron_decoder_path): - for name in files: - local_file_path = os.path.join(path, name) - 
hub_file_path = os.path.relpath(local_file_path, neuron_decoder_path) - assert hub_file_path in hub_files_path - finally: - api.delete_repo(neuron_push_decoder_id) +@is_inferentia_test +@requires_neuronx +def test_push_to_hub(neuron_decoder_path, neuron_push_decoder_id): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + model.push_to_hub(neuron_decoder_path, neuron_push_decoder_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) + api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + try: + hub_files_info = api.list_files_info(neuron_push_decoder_id) + hub_files_path = [info.rfilename for info in hub_files_info] + for path, _, files in os.walk(neuron_decoder_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_decoder_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_decoder_id) -class Seq2SeqTests: - @is_inferentia_test - @requires_neuronx - def test_model_from_hub(): - model = NeuronModelForSeq2SeqLM.from_pretrained( - "Jingya/tiny-random-t5-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" - ) - @is_inferentia_test - @requires_neuronx - def test_push_seq2seq_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_push_seq2seq_id) - model.push_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - try: - hub_files_info = api.list_files_info(neuron_push_seq2seq_id) - hub_files_path = [info.rfilename for info in hub_files_info] - for path, _, files in os.walk(neuron_seq2seq_path): - for name in files: - local_file_path = os.path.join(path, name) - hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_path) - assert hub_file_path in hub_files_path - finally: - api.delete_repo(neuron_push_seq2seq_id) +@is_inferentia_test +@requires_neuronx +def test_seq2seq_model_from_hub(): + model = NeuronModelForSeq2SeqLM.from_pretrained( + "Jingya/tiny-random-t5-neuronx", revision="ce617676ce12a19df7c6bd523c69b83447fa036b" + ) + return model + + +@is_inferentia_test +@requires_neuronx +def test_push_seq2seq_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) + model.push_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) + api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + try: + hub_files_info = api.list_files_info(neuron_push_seq2seq_id) + hub_files_path = [info.rfilename for info in hub_files_info] + for path, _, files in os.walk(neuron_seq2seq_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_seq2seq_id) From 12e931190a3261c73a3bb5b4e027577e5080d9b9 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 17 Nov 2023 15:25:04 +0000 Subject: [PATCH 16/30] apply some suggestions --- optimum/exporters/neuron/model_configs.py | 6 ------ optimum/neuron/modeling_seq2seq.py | 1 - 2 files changed, 7 deletions(-) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 4ea1beff6..ddb1a1eb8 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -394,12 +394,6 @@ def patch_model_for_export(self, model, 
device="xla", **kwargs): custom_wrapper_kwargs={"num_beams": num_beams, "device": device}, ) - # def generate_dummy_inputs(self, **kwargs): - # batch_size = kwargs.pop("batch_size") * kwargs.get("num_beams") - # dummy_inputs = super().generate_dummy_inputs(batch_size=batch_size, **kwargs) - - # return dummy_inputs - @register_in_tasks_manager("opt", "text-generation") class OPTNeuronConfig(TextNeuronDecoderConfig): diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 6a10ea7a5..ac926caa1 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -485,7 +485,6 @@ def beam_search( # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens # of the first beam are considered to avoid sampling the exact same tokens across all beams. - # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores_device = "cpu" beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) beam_scores[:, 1:] = -1e9 From 13445c9d622674ab3ba2b92c6534e51be71fe82e Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 22 Nov 2023 15:47:36 +0000 Subject: [PATCH 17/30] fix style --- optimum/exporters/neuron/__main__.py | 2 +- optimum/exporters/neuron/model_configs.py | 4 +++- optimum/exporters/neuron/model_wrappers.py | 6 +++--- optimum/exporters/neuron/utils.py | 3 ++- tests/exporters/test_export.py | 4 +++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 78e042a1d..70abd9619 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -223,7 +223,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( output: Path, dynamic_batch_size: bool = False, submodels: Dict[str, Union[Path, str]] = None, -): +): model = replace_stable_diffusion_submodels(model, submodels) check_compiler_compatibility_for_stable_diffusion() if is_neuron_available(): diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index d6b16cb62..060e6ee01 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -450,7 +450,9 @@ def patch_model_for_export(self, model, device="xla", **kwargs): batch_size = kwargs.pop("batch_size", 1) sequence_length = kwargs.pop("sequence_length", 1) num_beams = kwargs.pop("num_beams", 1) - return self.CUSTOM_MODEL_WRAPPER(model, batch_size=batch_size, sequence_length=sequence_length, num_beams=num_beams, device=device) + return self.CUSTOM_MODEL_WRAPPER( + model, batch_size=batch_size, sequence_length=sequence_length, num_beams=num_beams, device=device + ) def generate_io_aliases(self, model): num_outputs_from_trace = 3 if model.num_beams > 1 else 1 diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 2ef95e875..3e27b4765 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -25,9 +25,9 @@ class UnetNeuronWrapper(torch.nn.Module): def __init__(self, model, input_names: List[str]): - super().__init__() - self.model = model - self.input_names = input_names + super().__init__() + self.model = model + self.input_names = input_names def forward(self, *inputs): if len(inputs) != len(self.input_names): diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 
f11fcb1c2..81a474374 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -334,6 +334,7 @@ def check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes): f"Cannot find the value of `{name}` which is mandatory for exporting the model to the neuron format, please set the value explicitly." ) + def replace_stable_diffusion_submodels(pipeline, submodels): if submodels is not None: unet_id = submodels.pop("unet", None) @@ -343,6 +344,7 @@ def replace_stable_diffusion_submodels(pipeline, submodels): return pipeline + def get_encoder_decoder_models_for_export( model: "PreTrainedModel", task: str, @@ -398,4 +400,3 @@ def get_encoder_decoder_models_for_export( models_for_export[DECODER_NAME] = (model, decoder_neuron_config) return models_for_export - diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index 4a491f162..76b24a560 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -154,7 +154,9 @@ class NeuronStableDiffusionExportTestCase(unittest.TestCase): Integration tests ensuring stable diffusion models are correctly exported. """ - @parameterized.expand([STABLE_DIFFUSION_MODELS_TINY["stable-diffusion"], STABLE_DIFFUSION_MODELS_TINY["latent-consistency"]]) + @parameterized.expand( + [STABLE_DIFFUSION_MODELS_TINY["stable-diffusion"], STABLE_DIFFUSION_MODELS_TINY["latent-consistency"]] + ) def test_export_for_stable_diffusion_models(self, model_id): set_seed(SEED) From ded43a4e4722e696901f239ce5f48176abdd9a18 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 23 Nov 2023 14:32:20 +0000 Subject: [PATCH 18/30] address part of comments --- docs/source/tutorials/stable_diffusion.mdx | 2 +- optimum/exporters/neuron/base.py | 2 +- optimum/exporters/neuron/config.py | 11 +++-------- optimum/exporters/neuron/convert.py | 4 +--- optimum/exporters/neuron/model_configs.py | 1 - optimum/exporters/neuron/model_wrappers.py | 7 ++++--- optimum/neuron/utils/input_generators.py | 1 + 7 files changed, 11 insertions(+), 17 deletions(-) diff --git a/docs/source/tutorials/stable_diffusion.mdx b/docs/source/tutorials/stable_diffusion.mdx index c115dd760..5d6a734b6 100644 --- a/docs/source/tutorials/stable_diffusion.mdx +++ b/docs/source/tutorials/stable_diffusion.mdx @@ -357,7 +357,7 @@ To avoid Neuron device out of memory, it's suggested to finish all base inferenc Latent Consistency Models (LCMs) were proposed in [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference by Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, and Hang Zhao](https://huggingface.co/papers/2310.04378). LCMs enable inference with fewer steps on any pre-trained LDMs, including Stable Diffusion and SDXL. In `optimum-neuron`, you can: - - Use the class `NeuronLatentConsistencyModelPipeline` to compile and run inference of LCMs distilled from Stable Diffusion (SD) models, + - Use the class `NeuronLatentConsistencyModelPipeline` to compile and run inference of LCMs distilled from Stable Diffusion (SD) models. - And continue to use the class `NeuronStableDiffusionXLPipeline` for LCMs distilled from SDXL models. 
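As a quick orientation before the complete examples below, here is a minimal sketch of the first option; the model id, compilation shapes, and generation arguments are illustrative and may need to be adapted:

```python
from optimum.neuron import NeuronLatentConsistencyModelPipeline

# Compile the LCM once with static input shapes (illustrative values).
pipe = NeuronLatentConsistencyModelPipeline.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7",  # an LCM distilled from a Stable Diffusion model
    export=True,
    batch_size=1,
    height=768,
    width=768,
    num_images_per_prompt=1,
)
pipe.save_pretrained("lcm_dreamshaper_neuronx/")

# Reload the compiled pipeline and run inference with only a few denoising steps.
pipe = NeuronLatentConsistencyModelPipeline.from_pretrained("lcm_dreamshaper_neuronx/")
image = pipe(prompt="a photo of an astronaut riding a horse on mars", num_inference_steps=4).images[0]
```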
Here are examples to compile the LCMs of Stable Diffusion ( [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) ) and Stable Diffusion XL( [latent-consistency/lcm-sdxl](https://huggingface.co/latent-consistency/lcm-sdxl) ), and then run inference on AWS Inferentia 2 : diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 5548dc4b0..6b005869f 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -119,7 +119,7 @@ def __init__( audio_sequence_length: Optional[int] = None, point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, - num_beams: Optional[int] = None, + num_beams: int = 1, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 82cbf4450..fccac7e39 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -16,7 +16,7 @@ Common Neuron configuration classes that handle most of the features for building model specific configurations. """ -from typing import Dict, List +from typing import List from ...utils import ( DummyBboxInputGenerator, @@ -79,11 +79,7 @@ class TextSeq2SeqNeuronConfig(NeuronConfig): ) @property - def is_decoder(self) -> bool: - raise NotImplementedError() - - @property - def inputs(self) -> Dict[str, Dict[int, str]]: + def inputs(self) -> List[str]: common_inputs = [] # encoder + decoder without past if "encoder" in self.MODEL_TYPE: @@ -100,7 +96,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: return common_inputs @property - def outputs(self) -> Dict[str, Dict[int, str]]: + def outputs(self) -> List[str]: common_outputs = [] # encoder + decoder without past if "encoder" in self.MODEL_TYPE: @@ -115,7 +111,6 @@ def outputs(self) -> Dict[str, Dict[int, str]]: beam_outputs = ( ["next_token_scores", "next_tokens", "next_indices"] if self.num_beams > 1 else ["next_tokens"] ) - # for i in range(self._config.num_decoder_layers): common_outputs = ( beam_outputs + [f"past.{idx}.self.key" for idx in range(self._config.num_decoder_layers)] diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 85d7ba124..9cace43f7 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -171,9 +171,7 @@ def validate_model_outputs( ref_inputs = config.generate_dummy_inputs(return_tuple=False, **input_shapes) if reference_model.config.is_encoder_decoder: reference_model = config.patch_model_for_export(reference_model, device="cpu", **input_shapes) - if ( - hasattr(config._config, "_class_name") and "AutoencoderKL" in config._config._class_name - ) or reference_model.config.is_encoder_decoder: + if "AutoencoderKL" in getattr(config._config, "_class_name", "") or reference_model.config.is_encoder_decoder: # VAE components for stable diffusion or Encoder-Decoder models ref_inputs = tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 906e53764..e4dda2fa5 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -432,7 +432,6 @@ def is_decoder(self) -> bool: @property def inputs(self) -> List[str]: common_inputs = super().inputs + ["beam_idx", "beam_scores"] - return common_inputs def generate_dummy_inputs(self, **kwargs): diff --git 
a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 3e27b4765..abc63c114 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model wrappers for Neuron export.""" -from typing import TYPE_CHECKING, List + +from typing import TYPE_CHECKING, List, Optional import torch from transformers.models.t5.modeling_t5 import T5LayerCrossAttention @@ -65,7 +66,7 @@ def __init__( model: "PreTrainedModel", num_beams: int = 1, device: str = "xla", - tp_degree=None, + tp_degree: Optional[int] = None, ): super().__init__() self.model = model @@ -143,7 +144,7 @@ def __init__( sequence_length: int, num_beams: int = 1, device: str = "xla", - tp_degree=None, + tp_degree: Optional[int] = None, ): super().__init__() self.model = model diff --git a/optimum/neuron/utils/input_generators.py b/optimum/neuron/utils/input_generators.py index 1616123a9..91a1657d9 100644 --- a/optimum/neuron/utils/input_generators.py +++ b/optimum/neuron/utils/input_generators.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Dummy input generation classes.""" + import torch from ...utils import DTYPE_MAPPER, DummyInputGenerator, NormalizedTextConfig From 994374bd0e35d6867dadeeeed89735061367593c Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 23 Nov 2023 19:10:04 +0000 Subject: [PATCH 19/30] apply some suggestions --- optimum/neuron/generation/utils.py | 170 +++++++++++++++-------------- optimum/neuron/modeling_seq2seq.py | 106 ++++-------------- 2 files changed, 112 insertions(+), 164 deletions(-) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 81a5c3fa2..9ab87e914 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -82,6 +82,91 @@ class NeuronGenerationMixin(GenerationMixin): learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies). 
""" + @staticmethod + def _initialize_attention( + model_kwargs, + num_padding_values, + batch_size, + device, + is_encoder_decoder, + ): + """Initializes the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" + if is_encoder_decoder: + # One 1 for decoder_start_token_id, 0s for the currently-unfilled locations in the past_key_values tensor, + # 1s for the actual input_ids + decoder_attention_mask = torch.cat( + [ + torch.zeros((batch_size, num_padding_values), dtype=torch.int32), + torch.ones((batch_size, 2), dtype=torch.int32), + ], + axis=1, + ).to(device) + mask = {"decoder_attention_mask": decoder_attention_mask} + else: + attention_mask = model_kwargs.pop("attention_mask") + # 0s for the currently-unfilled locations in the past_key_values tensor, 1s for the actual input_ids + attention_mask = torch.cat( + [ + torch.zeros( + (batch_size, num_padding_values), dtype=attention_mask.dtype, device=attention_mask.device + ), + attention_mask, + torch.ones((batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device), + ], + axis=1, + ) + mask = {"attention_mask": attention_mask} + + return mask + + @staticmethod + def _update_attention(model_kwargs, batch_size, is_encoder_decoder): + """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" + + attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" + attention_mask = model_kwargs.pop(attention_mask_name) + attention_mask_update_slice = torch.ones( + (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device + ) + attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) + mask = {attention_mask_name: attention_mask} + return mask + + @staticmethod + def _initialize_past(past_key_values, num_padding_values): + """Initialize past_key_values with zeros -- the structure depends on `batch_axis`""" + + new_past = () + for past_layer in past_key_values: + new_past_layer = list(past_layer) + for i in range(len(new_past_layer[:2])): + b, n_heads, _, head_dim = past_layer[i].shape + new_past_layer[i] = torch.cat( + [ + torch.zeros( + (b, n_heads, num_padding_values, head_dim), + dtype=past_layer[i].dtype, + device=past_layer[i].device, + ), + past_layer[i], + ], + dim=2, + ) + new_past += (tuple(new_past_layer),) + + return new_past + + @staticmethod + def _update_past(past_key_values): + new_past = () + for past_layer in past_key_values: + new_past_layer = list(past_layer) + for i, _ in enumerate(new_past_layer[:2]): + new_past_layer[i] = past_layer[i][:, :, 1:] + new_past += (tuple(new_past_layer),) + + return new_past + def _update_model_kwargs_for_xla_generation( self, outputs: ModelOutput, @@ -93,81 +178,6 @@ def _update_model_kwargs_for_xla_generation( seq_length: Optional[int] = None, use_cache: bool = True, ) -> Dict[str, Any]: - def _initialize_attention(model_kwargs, num_padding_values, is_encoder_decoder): - """Initializes the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" - if is_encoder_decoder: - # One 1 for decoder_start_token_id, 0s for the currently-unfilled locations in the past_key_values tensor, - # 1s for the actual input_ids - decoder_attention_mask = torch.cat( - [ - torch.zeros((batch_size, num_padding_values), dtype=torch.int32), - torch.ones((batch_size, 2), dtype=torch.int32), - ], - axis=1, - ).to(outputs.logits.device) - mask = {"decoder_attention_mask": decoder_attention_mask} - else: - attention_mask = 
model_kwargs.pop("attention_mask") - # 0s for the currently-unfilled locations in the past_key_values tensor, 1s for the actual input_ids - attention_mask = torch.cat( - [ - torch.zeros( - (batch_size, num_padding_values), dtype=attention_mask.dtype, device=attention_mask.device - ), - attention_mask, - torch.ones((batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device), - ], - axis=1, - ) - mask = {"attention_mask": attention_mask} - - return mask - - def _update_attention(model_kwargs, is_encoder_decoder): - """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" - - attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" - attention_mask = model_kwargs.pop(attention_mask_name) - attention_mask_update_slice = torch.ones( - (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device - ) - attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) - mask = {attention_mask_name: attention_mask} - return mask - - def _initialize_past(past_key_values, num_padding_values): - """Initialize past_key_values with zeros -- the structure depends on `batch_axis`""" - - new_past = () - for past_layer in past_key_values: - new_past_layer = list(past_layer) - for i in range(len(new_past_layer[:2])): - b, n_heads, _, head_dim = past_layer[i].shape - new_past_layer[i] = torch.cat( - [ - torch.zeros( - (b, n_heads, num_padding_values, head_dim), - dtype=past_layer[i].dtype, - device=past_layer[i].device, - ), - past_layer[i], - ], - dim=2, - ) - new_past += (tuple(new_past_layer),) - - return new_past - - def _update_past(past_key_values): - new_past = () - for past_layer in past_key_values: - new_past_layer = list(past_layer) - for i, _ in enumerate(new_past_layer[:2]): - new_past_layer[i] = past_layer[i][:, :, 1:] - new_past += (tuple(new_past_layer),) - - return new_past - if use_cache: past_key_values = self._extract_past_from_model_output(outputs) if past_key_values is None: @@ -182,11 +192,13 @@ def _update_past(past_key_values): # previous autoregressive generation steps (step 0 has no past_key_values, step 1 has 1 past_key_values value, ..., the last step # has `max_length - 1` past_key_values values). num_padding_values = max_length - seq_length - mask = _initialize_attention(model_kwargs, num_padding_values, is_encoder_decoder) - new_past = _initialize_past(past_key_values, num_padding_values) + mask = self._initialize_attention( + model_kwargs, num_padding_values, batch_size, outputs.logits.device, is_encoder_decoder + ) + new_past = self._initialize_past(past_key_values, num_padding_values) else: - mask = _update_attention(model_kwargs, is_encoder_decoder) - new_past = _update_past(past_key_values) + mask = self._update_attention(model_kwargs, batch_size, is_encoder_decoder) + new_past = self._update_past(past_key_values) # sets the updated variables (mask and past_key_values) model_kwargs.update(mask) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index ac926caa1..6071aad1b 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +"""NeuroModelForXXX classes for seq2seq models' inference on Neuron devices.""" + import copy import logging import os import shutil -from abc import abstractmethod +from abc import ABC, abstractmethod from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union @@ -44,7 +45,6 @@ NeuronConfig, main_export, ) -from ..exporters.neuron.model_configs import * # noqa: F403 from ..exporters.tasks import TasksManager from ..utils.save_utils import maybe_load_preprocessors from .generation import NeuronGenerationMixin @@ -67,7 +67,7 @@ logger = logging.getLogger(__name__) -class NeuronModelForConditionalGeneration(NeuronBaseModel): +class NeuronModelForConditionalGeneration(NeuronBaseModel, ABC): base_model_prefix = "neuron_model" config_name = "config.json" @@ -130,6 +130,10 @@ def _save_pretrained( Args: save_directory (`Union[str, Path`]): The directory where to save the model files. + encoder_file_name (`str`, defaults to `NEURON_FILE_NAME`]): + The file name to save the encoder. + decoder_file_name (`str`, defaults to `NEURON_FILE_NAME`]): + The file name to save the decoder. """ if self.model_and_config_save_paths is None: logger.warning( @@ -342,12 +346,6 @@ def _combine_encoder_decoder_config(self, encoder_config: "PretrainedConfig", de return combined_config - def can_generate(self): - logger.warning( - "NeuronModelForConditionalGeneration is an abstract class and is not meant to be used for generation. Please use NeuronModelForSeq2SeqLM instead." - ) - return False - class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM @@ -360,7 +358,6 @@ def forward( decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, beam_scores=None, - **kwargs, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: hidden_states = encoder_outputs["last_hidden_state"] @@ -439,8 +436,6 @@ def beam_search( stopping_criteria: Optional[StoppingCriteriaList] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: Optional[bool] = False, @@ -450,19 +445,16 @@ def beam_search( """ Overriding beam search to use next_token_scores returned from neuron device instead of logits. """ - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + if logits_processor is not None: + logger.warning( + "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." 
+ ) stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) batch_size = len(beam_scorer._beam_hyps) num_beams = beam_scorer.num_beams @@ -500,13 +492,7 @@ def beam_search( input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) - next_token_scores, next_tokens, next_indices = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - beam_scores=beam_scores, - ) + next_token_scores, next_tokens, next_indices = self(**model_inputs, beam_scores=beam_scores) # stateless beam_outputs = beam_scorer.process( @@ -545,14 +531,8 @@ def beam_search( model_kwargs, batch_size=batch_beam_size, is_encoder_decoder=self.config.is_encoder_decoder, - max_length=stopping_criteria.max_length, - seq_length=cur_len, - use_cache=model_kwargs["use_cache"], ) - if model_kwargs["past_key_values"] is not None: - model_kwargs["past_key_values"] = self._reorder_cache( - model_kwargs["past_key_values"], beam_idx.to(torch.int64) - ) + self._reorder_cache(beam_idx.to(torch.int64)) if return_dict_in_generate and output_scores: beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) @@ -612,8 +592,6 @@ def greedy_search( max_length: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, seq_length: Optional[int] = int, @@ -624,8 +602,10 @@ def greedy_search( Overriding greedy sampling to use next tokens returned from neuron device instead of logits. """ # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - use_cache = model_kwargs["use_cache"] if "use_cache" in model_kwargs else False + if logits_processor is not None: + logger.warning( + "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." 
+ ) stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() if max_length is not None: from transformers.generation.stopping_criteria import validate_stopping_criteria @@ -637,12 +617,6 @@ def greedy_search( eos_token_id = [eos_token_id] eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None @@ -668,12 +642,7 @@ def greedy_search( model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) # forward pass to get next token - output = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + output = self(**model_inputs) next_tokens = output[0] # finished sentences should have their next token be a padding token @@ -693,9 +662,6 @@ def greedy_search( model_kwargs, batch_size=batch_size, is_encoder_decoder=self.config.is_encoder_decoder, - max_length=stopping_criteria.max_length, - seq_length=seq_length, - use_cache=use_cache, ) seq_length += 1 @@ -741,12 +707,11 @@ def greedy_search( return input_ids - def _reorder_cache(self, past_key_values, beam_idx): + def _reorder_cache(self, beam_idx): """ The cache was reordered during the tracing of the decoder so we can skip it here. This is needed for beam search and not greedy sampling. 
""" self.beam_idx = beam_idx - return past_key_values def get_encoder(self) -> "NeuronEncoder": return self.encoder @@ -756,43 +721,19 @@ def _update_model_kwargs_for_xla_generation( model_kwargs: Dict[str, Any], batch_size: int, is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - max_length: Optional[int] = None, - seq_length: Optional[int] = None, - use_cache: bool = True, ) -> Dict[str, Any]: - def _update_attention(model_kwargs, is_encoder_decoder): - """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" - - attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" - attention_mask = model_kwargs.pop(attention_mask_name) - attention_mask_update_slice = torch.ones( - (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device - ) - attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) - mask = {attention_mask_name: attention_mask} - return mask - - mask = _update_attention(model_kwargs, is_encoder_decoder) + mask = self._update_attention(model_kwargs, batch_size, is_encoder_decoder) # sets the updated variables (mask and past_key_values) model_kwargs.update(mask) - # Set a mock cache tensor - model_kwargs["past_key_values"] = torch.tensor([]) - return model_kwargs # Override to cut the input_ids to just last token def prepare_inputs_for_generation( self, input_ids, - past_key_values=None, attention_mask=None, - head_mask=None, - decoder_head_mask=None, decoder_attention_mask=None, - cross_attn_head_mask=None, - use_cache=None, encoder_outputs=None, **kwargs, ): @@ -801,14 +742,9 @@ def prepare_inputs_for_generation( return { "decoder_input_ids": input_ids, - "past_key_values": past_key_values, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, "decoder_attention_mask": decoder_attention_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, } def _validate_static_shape(self, input_shapes: List[int], target_shapes: List[int]) -> bool: From 5c55ec16c6715b1f633e704ede2280088570f242 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 24 Nov 2023 16:35:38 +0000 Subject: [PATCH 20/30] add pad left support and log --- optimum/neuron/modeling_base.py | 45 ++++++++++++++++++++++++------ optimum/neuron/modeling_seq2seq.py | 6 ++-- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 1bfb42627..7b3a28ecd 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -20,7 +20,7 @@ from contextlib import contextmanager from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union import torch from huggingface_hub import HfApi, HfFolder, hf_hub_download @@ -451,10 +451,19 @@ def _raise_if_invalid_padding(self, input_name, input_tensor, target_shapes, to_ f" than the static shapes used for compilation: {target_shapes}{extra}." ) - def _pad_to_compiled_shape(self, inputs: Dict[str, "torch.Tensor"]): + def _pad_to_compiled_shape( + self, inputs: Dict[str, "torch.Tensor"], padding_side: Literal["right", "left"] = "right" + ): """ Pads input tensors if they are not in valid shape. + + Args: + inputs (`Dict[str, "torch.Tensor"]`): + Dictionary of input torch tensors. 
+ padding_side (`Literal["right", "left"]`, defaults to "right"): + The side on which to apply the padding. """ + logger.info(f"Padding input tensors, the padding side is: {padding_side}.") for input_name, input_tensor in inputs.items(): target_shapes = self.input_static_shapes[input_name] padding = () @@ -466,7 +475,7 @@ def _pad_to_compiled_shape(self, inputs: Dict[str, "torch.Tensor"]): to_pad = target_shapes[i] - input_tensor.size(i) self._raise_if_invalid_padding(input_name, input_tensor, target_shapes, to_pad, i) - padding += (0, to_pad) + padding += (0, to_pad) if padding_side == "right" else (to_pad, 0) if ( self.preprocessors is not None @@ -496,7 +505,7 @@ def _pad_to_compiled_shape(self, inputs: Dict[str, "torch.Tensor"]): else: to_pad = target_shapes[0] - input_tensor.size(0) self._raise_if_invalid_padding(input_name, input_tensor, target_shapes, to_pad, 0) - padding += (0, to_pad) + padding += (0, to_pad) if padding_side == "right" else (to_pad, 0) pad_id = 1 inputs[input_name] = torch.nn.functional.pad(input_tensor, padding, mode="constant", value=pad_id) @@ -508,7 +517,13 @@ def neuron_padding_manager(self, inputs: Dict[str, "torch.Tensor"]): inputs = tuple(self._pad_to_compiled_shape(inputs).values()) yield inputs - def remove_padding(self, outputs: List[torch.Tensor], dims: List[int], indices: List[int]) -> List[torch.Tensor]: + @staticmethod + def remove_padding( + outputs: List[torch.Tensor], + dims: List[int], + indices: List[int], + padding_side: Literal["right", "left"] = "right", + ) -> List[torch.Tensor]: """ Removes padding from output tensors. @@ -519,12 +534,26 @@ def remove_padding(self, outputs: List[torch.Tensor], dims: List[int], indices: List of dimensions in which we slice a tensor. indices (`List[int]`): List of indices in which we slice a tensor along an axis. + padding_side (`Literal["right", "left"]`, defaults to "right"): + The side on which the padding has been applied. """ if len(dims) != len(indices): raise ValueError(f"The size of `dims`({len(dims)}) and indices`({len(indices)}) must be equal.") + for dim, indice in zip(dims, indices): - outputs = [ - torch.index_select(output_tensor, dim, torch.LongTensor(range(indice))) for output_tensor in outputs - ] + if padding_side == "right": + outputs = [ + torch.index_select(output_tensor, dim, torch.LongTensor(range(indice))) + for output_tensor in outputs + ] + elif padding_side == "left": + outputs = [ + torch.index_select( + output_tensor, + dim, + torch.LongTensor(range(output_tensor.shape[dim] - indice, output_tensor.shape[dim])), + ) + for output_tensor in outputs + ] return outputs diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 6071aad1b..ff3feb9ba 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -421,7 +421,9 @@ def generate( max_length=max_length, num_beams=num_beams, do_sample=kwargs.pop("do_sample", False), - use_cache=kwargs.pop("use_cache", True), + use_cache=kwargs.pop( + "use_cache", False + ), # `use_cache` is supported by default in `optimum-neuron`, set to False to avoid warning decoder_attention_mask=decoder_attention_mask, # Pass fake encoder_outputs so the transfomers code will not invoke the encoder encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, @@ -447,7 +449,7 @@ def beam_search( """ if logits_processor is not None: logger.warning( - "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. 
If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." + "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." ) stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id From 9396c7ac5c6552cc2ae0051d95a570d6f4588b08 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 27 Nov 2023 11:57:14 +0000 Subject: [PATCH 21/30] fix enable custom max length instead of real max length limit --- optimum/neuron/modeling_seq2seq.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index ff3feb9ba..23396002e 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -59,7 +59,6 @@ if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel - from transformers.generation.streamers import BaseStreamer if is_neuronx_available(): import torch_neuronx @@ -385,7 +384,6 @@ def generate( stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, num_return_sequences: Optional[int] = None, **kwargs, ): @@ -416,9 +414,8 @@ def generate( stopping_criteria=stopping_criteria, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, assistant_model=assistant_model, - streamer=streamer, num_return_sequences=num_return_sequences, - max_length=max_length, + max_length=kwargs.pop("max_length", None) or max_length, num_beams=num_beams, do_sample=kwargs.pop("do_sample", False), use_cache=kwargs.pop( @@ -597,7 +594,6 @@ def greedy_search( output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, seq_length: Optional[int] = int, - streamer: Optional["BaseStreamer"] = None, **model_kwargs, ) -> Union[GreedySearchOutput, torch.LongTensor]: """ @@ -704,9 +700,6 @@ def greedy_search( if this_peer_finished: break - if streamer is not None: - streamer.end() - return input_ids def _reorder_cache(self, beam_idx): From 9676a6543bc2fc29b9ec4a6f616955925d8d60f3 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 28 Nov 2023 18:35:29 +0000 Subject: [PATCH 22/30] reuse neuron gen mix --- optimum/neuron/generation/utils.py | 1155 ++++++++++++++-------------- optimum/neuron/modeling_seq2seq.py | 298 +------ 2 files changed, 576 insertions(+), 877 deletions(-) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 9ab87e914..11f64d88e 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -52,8 +52,7 @@ if TYPE_CHECKING: - from transformers.generation.streamers import BaseStreamer - from transformers.modeling_utils import PreTrainedModel + pass logger = logging.get_logger(__name__) @@ -272,419 +271,6 @@ def _expand_dict_for_generation(dict_to_expand): return input_ids, model_kwargs - def beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - 
max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - seq_length: Optional[int] = None, - **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search decoding** and - can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - seq_length: - Length of current input_ids sequence - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. 
- - Return: - [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - - >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - if len(stopping_criteria) == 0: - warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - # Overwrite 
cur_len - cur_len = seq_length - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None - ) - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - beam_scores_device = "cpu" - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False # used by synced_gpus only - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - # prepare model inputs - if model_kwargs["use_cache"]: - # From max_length-sized input_ids, select first - # cur_len - 1 values. 
- update_indices = torch.stack( - [torch.arange(input_ids.size(0)), torch.tensor(cur_len - 1).repeat(input_ids.size(0))], dim=-1 - ) - input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] - model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) - else: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - if not model_kwargs["use_cache"]: - one_hot = ( - torch.cat( - [ - torch.tensor([0]).repeat(1, cur_len - 1), - torch.tensor([1]).repeat(1, 1), - torch.tensor([0]).repeat(1, input_ids.size(1) - cur_len), - ], - dim=1, - ) - .to(device=outputs.logits.device) - .float() - ) - next_token_logits = torch.matmul(one_hot, outputs.logits) - next_token_logits = next_token_logits.squeeze(1) - else: - next_token_logits = outputs.logits[:, -1, :] - - # Manually compute log softmax - # log_softmax(vi) = vi - max(vi) - log(sum(exp(vi - max(vi)))) - logit_max, _ = torch.max(next_token_logits, dim=-1, keepdim=True) - logsumexp = torch.log(torch.exp(next_token_logits - logit_max).sum(dim=-1, keepdim=True)) - next_token_scores = next_token_logits - logit_max - logsumexp - # (batch_size * num_beams, vocab_size) - - xm.mark_step() - - # We don't want to change every single logit processor, so - # we peform this processing on CPU. - input_ids_ = input_ids.to("cpu")[:, :cur_len] - next_token_scores_ = next_token_scores.to("cpu") - next_token_scores_processed = logits_processor(input_ids_, next_token_scores_) - - next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores_processed,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) - next_token_scores = next_token_scores * 1 - - # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search) - next_token_scores, next_tokens = torch.topk( - next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True - ) - - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = beam_scorer.process( - input_ids.to("cpu")[:, :cur_len], - next_token_scores.to("cpu"), - next_tokens.to("cpu"), - next_indices.to("cpu"), - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - ) - - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - update_indices = torch.stack( - [torch.arange(batch_beam_size), torch.tensor(cur_len - 1).repeat(batch_beam_size)], dim=-1 - ) - update_indices_2 = torch.stack( - 
[torch.arange(batch_beam_size), torch.tensor(cur_len).repeat(batch_beam_size)], dim=-1 - ) - # First select beam_indices - device = input_ids.device - beam_idx_device = beam_idx.to(device=input_ids.device) - input_ids[:, :] = input_ids[beam_idx_device.long(), :] - - # Then append new tokens - input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = beam_next_tokens.unsqueeze(-1).to(device) - input_ids = input_ids * 1 # Hack to materialize tensor - - # update generated ids, model inputs, and length for next step - model_kwargs = self._update_model_kwargs_for_xla_generation( - outputs, - model_kwargs, - batch_size=batch_beam_size, - is_encoder_decoder=self.config.is_encoder_decoder, - max_length=stopping_criteria.max_length, - seq_length=cur_len, - use_cache=model_kwargs["use_cache"], - ) - if model_kwargs["past_key_values"] is not None: - model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx) - - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - - # increase cur_len - cur_len = cur_len + 1 - - # stop when each sentence is finished, or if we exceed the maximum length - stop_criterion_1 = beam_scorer.is_done - if isinstance(stopping_criteria, list): - if len(stopping_criteria) == 1: - stopping_criteria = stopping_criteria[0] - - # Cases that can be handled in XLA without requiring - # non-padded input_ids - if isinstance(stopping_criteria, MaxLengthCriteria): - stop_criterion_2 = cur_len >= stopping_criteria.max_length - elif isinstance(stopping_criteria, MaxTimeCriteria): - stop_criterion_2 = stopping_criteria(input_ids, scores) - else: - # Other cases will be handled on CPU - batch_size, _ = input_ids.shape - input_ids_cpu = input_ids.to("cpu") - mask = torch.cat( - [torch.ones(batch_size, cur_len), torch.zeros(batch_size, input_ids.shape[1] - cur_len)], dim=1 - ).bool() - input_ids_cpu = torch.masked_select(input_ids_cpu, mask).reshape((batch_size, cur_len)) - scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores - stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) - - if stop_criterion_1 or stop_criterion_2: - if not synced_gpus: - break - else: - this_peer_finished = True - - sequence_outputs = beam_scorer.finalize( - input_ids.to("cpu"), - beam_scores.to("cpu"), - next_tokens.to("cpu"), - next_indices.to("cpu"), - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=beam_indices, - ) - - for k, v in sequence_outputs.items(): - if type(v) == torch.Tensor: - sequence_outputs[k] = sequence_outputs[k].to(input_ids.device) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return BeamSearchDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - 
) - else: - return sequence_outputs["sequences"] - @torch.no_grad() def generate( self, @@ -694,8 +280,7 @@ def generate( stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, synced_gpus: Optional[bool] = None, - assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, + is_traced_inference: bool = False, **kwargs, ) -> Union[GenerateOutput, torch.LongTensor]: r""" @@ -714,23 +299,23 @@ def generate( Parameters: - inputs (`torch.Tensor` of varying shape depending on the modality, *optional*): + inputs (`Optional[torch.Tensor]`, defaults to `None`): The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs` should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. - generation_config (`~generation.GenerationConfig`, *optional*): + generation_config (`Optional[GenerationConfig]`, defaults to `None`): The generation configuration to be used as base parametrization for the generation call. `**kwargs` passed to generate matching the attributes of `generation_config` will override them. If `generation_config` is not provided, the default will be used, which had the following loading priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s default values, whose documentation should be checked to parameterize generation. - logits_processor (`LogitsProcessorList`, *optional*): + logits_processor (`Optional[LogitsProcessorList]`, defaults to `None`): Custom logits processors that complement the default logits processors built from arguments and generation config. If a logit processor is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. - stopping_criteria (`StoppingCriteriaList`, *optional*): + stopping_criteria (`Optional[StoppingCriteriaList]`, defaults to `None`): Custom stopping criteria that complement the default stopping criteria built from arguments and a generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. @@ -741,18 +326,13 @@ def generate( on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). - synced_gpus (`bool`, *optional*): + synced_gpus (`Optional[bool]`, defaults to `None`): Whether to continue running the while loop until max_length. Unless overridden this flag will be set to `True` under DeepSpeed ZeRO Stage 3 multiple GPUs environment to avoid hanging if one GPU finished generating before other GPUs. Otherwise it'll be set to `False`. - assistant_model (`PreTrainedModel`, *optional*): - An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model - is much faster than running generation with the model you're calling generate from. 
As such, the - assistant model should be much smaller. - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + is_traced_inference (`bool`, defaults to `False`): + Whether the decoder is traced or using XLA lazy tensor. If the decoder is traced, next tokens and the beam scores + are computed inside the decoder. kwargs: Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder @@ -832,9 +412,11 @@ def generate( # 4. Define other model kwargs model_kwargs["output_attentions"] = generation_config.output_attentions model_kwargs["output_hidden_states"] = generation_config.output_hidden_states - if generation_config.use_cache: + if generation_config.use_cache and not is_traced_inference: warnings.warn("use_cache is not supported for generation on Neuron devices, switching to use_cache=False.") - model_kwargs["use_cache"] = False + model_kwargs["use_cache"] = False + else: + model_kwargs["use_cache"] = generation_config.use_cache accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) requires_attention_mask = "encoder_outputs" not in model_kwargs @@ -875,9 +457,6 @@ def generate( else: input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") - if streamer is not None: - streamer.put(input_ids.cpu()) - # 6. Prepare `max_length` depending on other stopping criteria. input_ids_seq_length = input_ids.shape[-1] has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None @@ -974,11 +553,6 @@ def generate( if generation_config.num_beam_groups > generation_config.num_beams: raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`") - if streamer is not None and (generation_config.num_beams > 1): - raise ValueError( - "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." 
- ) - if hasattr(self, "device") and self.device.type != input_ids.device.type: warnings.warn( "You are calling .generate() with the `input_ids` being on a device type different" @@ -1022,7 +596,7 @@ def generate( return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, seq_length=input_ids_seq_length, - streamer=streamer, + is_traced_inference=is_traced_inference, **model_kwargs, ) elif is_beam_gen_mode: @@ -1061,15 +635,332 @@ def generate( return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, seq_length=input_ids_seq_length, + is_traced_inference=is_traced_inference, **model_kwargs, ) else: - raise ValueError("Only greedy search and beam search are supported on Neuron.") + raise ValueError("Only greedy search and beam search are supported on Neuron.") + + def greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + seq_length: Optional[int] = None, + is_traced_inference: bool = False, + **model_kwargs, + ) -> Union[GreedySearchOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be + used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. 
See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+            seq_length (`Optional[int]`, defaults to `None`):
+                Length of current input_ids sequence
+            is_traced_inference (`bool`, defaults to `False`):
+                Whether the decoder is traced or using XLA lazy tensor. If the decoder is traced, next tokens and the beam scores
+                are computed inside the decoder.
+            model_kwargs:
+                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
+                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or
+            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer
+        >>> from optimum.neuron import NeuronModelForSeq2SeqLM
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
+        >>> input_shapes = {"batch_size": 1, "sequence_length": 128, "num_beams": 1}
+        >>> model = NeuronModelForSeq2SeqLM.from_pretrained("t5-small", export=True, dynamic_batch_size=False, **input_shapes)
+
+        >>> input_prompt = "translate English to German: Lets eat good food."
+        >>> inputs = tokenizer(input_prompt, return_tensors="pt")
+
+        >>> outputs = model.greedy_search(inputs.input_ids)
+
+        >>> results = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
+        ```
+        """
+        # init values
+        if logits_processor is not None and is_traced_inference:
+            logger.warning(
+                "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron."
+ ) + elif logits_processor is None: + logits_processor = LogitsProcessorList() + use_cache = model_kwargs.pop("use_cache", False) + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + + this_peer_finished = False # used by synced_gpus only + while True: + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + if use_cache: + # From max_length-sized input_ids, select first + # seq_length - 1 values. 
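+                # Note: `input_ids` is statically padded to `max_length`; the first pass (no KV cache yet) feeds the
+                # prompt `input_ids[:, :seq_length]`, later passes feed only the token written at index `seq_length - 1`.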
+ + if model_kwargs.get("past_key_values") is None: + input_ids_ = input_ids[:, :seq_length] + else: + update_indices = torch.stack( + [torch.arange(input_ids.size(0)), torch.tensor(seq_length - 1).repeat(input_ids.size(0))], + dim=-1, + ) + input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] + + model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) + else: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + if not is_traced_inference: + if not use_cache: + one_hot = ( + torch.cat( + [ + torch.tensor([0]).repeat(1, seq_length - 1), + torch.tensor([1]).repeat(1, 1), + torch.tensor([0]).repeat(1, input_ids.size(1) - seq_length), + ], + dim=1, + ) + .to(device=outputs.logits.device) + .float() + ) + next_token_logits = torch.matmul(one_hot, outputs.logits) + next_token_logits = next_token_logits.squeeze(1) + else: + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + # Move to cpu to handle arbitrary logits_processor + next_tokens_scores = logits_processor(input_ids.to("cpu")[:, :seq_length], next_token_logits.to("cpu")) + next_tokens_scores = next_tokens_scores.to(input_ids.device) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_tokens_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + else: + next_tokens = outputs[0] + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + batch_size, _ = input_ids.shape + update_indices = torch.stack( + [torch.arange(batch_size), torch.tensor(seq_length).repeat(batch_size)], dim=-1 + ) + input_ids[update_indices[:, 0], update_indices[:, 1]] = next_tokens[:] + model_kwargs = self._update_model_kwargs_for_xla_generation( + outputs=outputs, + model_kwargs=model_kwargs, + batch_size=batch_size, + is_encoder_decoder=self.config.is_encoder_decoder, + max_length=stopping_criteria.max_length, + seq_length=seq_length, + use_cache=use_cache, + ) + + seq_length += 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + + if not is_traced_inference: + xm.mark_step() + + # stop when each sentence is finished, or if we exceed the maximum length + stop_criterion_1 = unfinished_sequences.max() == 0 + + if isinstance(stopping_criteria, list): + if 
len(stopping_criteria) == 1: + stopping_criteria = stopping_criteria[0] + + # Cases that can be handled in XLA without requiring + # non-padded input_ids + if isinstance(stopping_criteria, MaxLengthCriteria): + stop_criterion_2 = seq_length >= stopping_criteria.max_length + elif isinstance(stopping_criteria, MaxTimeCriteria): + stop_criterion_2 = stopping_criteria(input_ids, scores) + else: + # Other cases will be handled on CPU + batch_size, _ = input_ids.shape + mask = torch.cat( + [torch.ones(batch_size, seq_length), torch.zeros(batch_size, input_ids.shape[1] - seq_length)], + dim=1, + ).bool() + input_ids_cpu = torch.masked_select(input_ids, mask).reshape((batch_size, seq_length)).to("cpu") + scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores + stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) + + if stop_criterion_1 or stop_criterion_2: + this_peer_finished = True + + if this_peer_finished and not synced_gpus: + break + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return GreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids - def greedy_search( + def beam_search( self, input_ids: torch.LongTensor, + beam_scorer: BeamScorer, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, max_length: Optional[int] = None, @@ -1079,34 +970,35 @@ def greedy_search( output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, + synced_gpus: Optional[bool] = False, seq_length: Optional[int] = None, - streamer: Optional["BaseStreamer"] = None, + is_traced_inference: bool = False, **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: + ) -> Union[BeamSearchOutput, torch.LongTensor]: r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be - used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() instead. For an overview of generation strategies and code examples, check the [following guide](../generation_strategies). - Parameters: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. + beam_scorer (`BeamScorer`): + An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. logits_processor (`LogitsProcessorList`, *optional*): An instance of [`LogitsProcessorList`]. 
List of instances of class derived from [`LogitsProcessor`] used to modify the prediction scores of the language modeling head applied at each generation step. stopping_criteria (`StoppingCriteriaList`, *optional*): An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated tokens. The maximum length of the sequence to be generated. @@ -1126,75 +1018,74 @@ def greedy_search( Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. synced_gpus (`bool`, *optional*, defaults to `False`): Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - seq_length: + seq_length (`Optional[int]`, defaults to `False`): Length of current input_ids sequence - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - Unsupported for XLA devices + is_traced_inference (`bool`, defaults to `False`): + Whether the decoder is traced or using XLA lazy tensor. If the decoder is traced, next tokens and the beam scores + are computed inside the decoder. model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or + [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if + [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. + Examples: ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... 
)
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> from transformers import AutoTokenizer
+        >>> from optimum.neuron import NeuronModelForSeq2SeqLM
 
-        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
-        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
+        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
+        >>> input_shapes = {"batch_size": 1, "sequence_length": 128, "num_beams": 4}
+        >>> model = NeuronModelForSeq2SeqLM.from_pretrained("t5-small", export=True, dynamic_batch_size=False, **input_shapes)
 
-        >>> input_prompt = "It might be possible to"
-        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
-
-        >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList(
-        ...     [
-        ...         MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
-        ...     ]
-        ... )
-        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
+        >>> input_prompt = "translate English to German: Lets eat good food."
+        >>> inputs = tokenizer(input_prompt, return_tensors="pt")
 
-        >>> outputs = model.greedy_search(
-        ...     input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria
+        >>> # add encoder_outputs to model keyword arguments
+        >>> model_kwargs = {
+        ...     "encoder_outputs": model.get_encoder()(
+        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
+        ...     )
+        ... }
+        >>> # instantiate beam scorer
+        >>> beam_scorer = BeamSearchScorer(
+        ...     batch_size=1,
+        ...     num_beams=num_beams,
+        ...     device=model.device,
         ... )
+        >>> outputs = model.beam_search(input_ids, beam_scorer)
 
         >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
-        ```"""
+        ```
+        """
         # init values
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        use_cache = model_kwargs["use_cache"] if "use_cache" in model_kwargs else False
+        if logits_processor is not None and is_traced_inference:
+            logger.warning(
+                "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron."
+ ) + elif logits_processor is None: + logits_processor = LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() if max_length is not None: warnings.warn( "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", UserWarning, ) stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions @@ -1208,8 +1099,24 @@ def greedy_search( else self.generation_config.return_dict_in_generate ) + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + # Overwrite cur_len + cur_len = seq_length + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + ) + # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None + beam_indices = ( + tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + ) decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None @@ -1221,8 +1128,13 @@ def greedy_search( model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ) - # keep track of which sequences are already finished - unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores_device = "cpu" + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) this_peer_finished = False # used by synced_gpus only while True: @@ -1237,113 +1149,153 @@ def greedy_search( break # prepare model inputs - if use_cache: - # From max_length-sized input_ids, select first - # seq_length - 1 values. 
-
-                if model_kwargs.get("past_key_values") is None:
-                    input_ids_ = input_ids[:, :seq_length]
-                else:
-                    update_indices = torch.stack(
-                        [torch.arange(input_ids.size(0)), torch.tensor(seq_length - 1).repeat(input_ids.size(0))],
-                        dim=-1,
-                    )
-                    input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None]
+            if model_kwargs["use_cache"]:
+                # From max_length-sized input_ids, select first
+                # cur_len - 1 values.
+                update_indices = torch.stack(
+                    [torch.arange(input_ids.size(0)), torch.tensor(cur_len - 1).repeat(input_ids.size(0))], dim=-1
+                )
+                input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None]
                 model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs)
             else:
                 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
 
-            # forward pass to get next token
-            outputs = self(
-                **model_inputs,
-                return_dict=True,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-            )
-
-            if synced_gpus and this_peer_finished:
-                continue  # don't waste resources running the code we don't need
+            if is_traced_inference:
+                next_token_scores, next_tokens, next_indices = self(**model_inputs, beam_scores=beam_scores)
+            else:
+                outputs = self(
+                    **model_inputs,
+                    return_dict=True,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                )
 
-            if not use_cache:
-                one_hot = (
-                    torch.cat(
-                        [
-                            torch.tensor([0]).repeat(1, seq_length - 1),
-                            torch.tensor([1]).repeat(1, 1),
-                            torch.tensor([0]).repeat(1, input_ids.size(1) - seq_length),
-                        ],
-                        dim=1,
+                if synced_gpus and this_peer_finished:
+                    cur_len = cur_len + 1
+                    continue  # don't waste resources running the code we don't need
+
+                if not model_kwargs["use_cache"]:
+                    one_hot = (
+                        torch.cat(
+                            [
+                                torch.tensor([0]).repeat(1, cur_len - 1),
+                                torch.tensor([1]).repeat(1, 1),
+                                torch.tensor([0]).repeat(1, input_ids.size(1) - cur_len),
+                            ],
+                            dim=1,
+                        )
+                        .to(device=outputs.logits.device)
+                        .float()
                     )
-                    .to(device=outputs.logits.device)
-                    .float()
+                    next_token_logits = torch.matmul(one_hot, outputs.logits)
+                    next_token_logits = next_token_logits.squeeze(1)
+                else:
+                    next_token_logits = outputs.logits[:, -1, :]
+
+                # Manually compute log softmax
+                # log_softmax(vi) = vi - max(vi) - log(sum(exp(vi - max(vi))))
+                logit_max, _ = torch.max(next_token_logits, dim=-1, keepdim=True)
+                logsumexp = torch.log(torch.exp(next_token_logits - logit_max).sum(dim=-1, keepdim=True))
+                next_token_scores = next_token_logits - logit_max - logsumexp
+                # (batch_size * num_beams, vocab_size)
+
+                xm.mark_step()
+
+                # We don't want to change every single logit processor, so
+                # we perform this processing on CPU.
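+                # Only the first `cur_len` (non-padded) positions of `input_ids` are exposed to the logits processors.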
+ input_ids_ = input_ids.to("cpu")[:, :cur_len] + next_token_scores_ = next_token_scores.to("cpu") + next_token_scores_processed = logits_processor(input_ids_, next_token_scores_) + + next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores_processed,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + next_token_scores = next_token_scores * 1 + + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search) + next_token_scores, next_tokens = torch.topk( + next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True ) - next_token_logits = torch.matmul(one_hot, outputs.logits) - next_token_logits = next_token_logits.squeeze(1) - else: - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - # Move to cpu to handle arbitrary logits_processor - next_tokens_scores = logits_processor(input_ids.to("cpu")[:, :seq_length], next_token_logits.to("cpu")) - next_tokens_scores = next_tokens_scores.to(input_ids.device) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_tokens_scores,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - # argmax - next_tokens = torch.argmax(next_tokens_scores, dim=-1) + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + next_tokens = next_tokens % vocab_size - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + # stateless + beam_outputs = beam_scorer.process( + input_ids.to("cpu")[:, :cur_len], + next_token_scores.to("cpu"), + next_tokens.to("cpu"), + next_indices.to("cpu"), + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + ) + + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] - # update generated ids, model inputs, and length for next step - batch_size, _ = input_ids.shape update_indices = torch.stack( - [torch.arange(batch_size), torch.tensor(seq_length).repeat(batch_size)], dim=-1 + [torch.arange(batch_beam_size), torch.tensor(cur_len - 1).repeat(batch_beam_size)], dim=-1 ) - input_ids[update_indices[:, 0], update_indices[:, 1]] = next_tokens[:] + update_indices_2 = torch.stack( + 
[torch.arange(batch_beam_size), torch.tensor(cur_len).repeat(batch_beam_size)], dim=-1 + ) + # First select beam_indices + device = input_ids.device + beam_idx_device = beam_idx.to(device=input_ids.device) + input_ids[:, :] = input_ids[beam_idx_device.long(), :] + + # Then append new tokens + input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = beam_next_tokens.unsqueeze(-1).to(device) + input_ids = input_ids * 1 # Hack to materialize tensor + + # update generated ids, model inputs, and length for next step model_kwargs = self._update_model_kwargs_for_xla_generation( outputs, model_kwargs, - batch_size=batch_size, + batch_size=batch_beam_size, is_encoder_decoder=self.config.is_encoder_decoder, max_length=stopping_criteria.max_length, - seq_length=seq_length, - use_cache=use_cache, + seq_length=cur_len, + use_cache=model_kwargs["use_cache"], ) + if is_traced_inference: + self._reorder_cache(beam_idx.to(torch.int64)) + elif model_kwargs["past_key_values"] is not None: + model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx) - seq_length += 1 - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) + if return_dict_in_generate and output_scores: + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - xm.mark_step() + # increase cur_len + cur_len = cur_len + 1 # stop when each sentence is finished, or if we exceed the maximum length - stop_criterion_1 = unfinished_sequences.max() == 0 - + stop_criterion_1 = beam_scorer.is_done if isinstance(stopping_criteria, list): if len(stopping_criteria) == 1: stopping_criteria = stopping_criteria[0] @@ -1351,34 +1303,51 @@ def greedy_search( # Cases that can be handled in XLA without requiring # non-padded input_ids if isinstance(stopping_criteria, MaxLengthCriteria): - stop_criterion_2 = seq_length >= stopping_criteria.max_length + stop_criterion_2 = cur_len >= stopping_criteria.max_length elif isinstance(stopping_criteria, MaxTimeCriteria): stop_criterion_2 = stopping_criteria(input_ids, scores) else: # Other cases will be handled on CPU batch_size, _ = input_ids.shape + input_ids_cpu = input_ids.to("cpu") mask = torch.cat( - [torch.ones(batch_size, seq_length), torch.zeros(batch_size, input_ids.shape[1] - seq_length)], - dim=1, + [torch.ones(batch_size, cur_len), torch.zeros(batch_size, input_ids.shape[1] - cur_len)], dim=1 ).bool() - input_ids_cpu = torch.masked_select(input_ids, mask).reshape((batch_size, seq_length)).to("cpu") + input_ids_cpu = torch.masked_select(input_ids_cpu, mask).reshape((batch_size, cur_len)) scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) if stop_criterion_1 or stop_criterion_2: - this_peer_finished = True + if not synced_gpus: + break + else: + this_peer_finished = True - if this_peer_finished and not synced_gpus: - break + sequence_outputs = beam_scorer.finalize( + input_ids.to("cpu"), + beam_scores.to("cpu"), + next_tokens.to("cpu"), + next_indices.to("cpu"), + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + ) - if streamer is not None: - streamer.end() + for k, v in sequence_outputs.items(): + if type(v) == torch.Tensor: + sequence_outputs[k] = 
sequence_outputs[k].to(input_ids.device) if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: - return GreedySearchEncoderDecoderOutput( - sequences=input_ids, + return BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + beam_indices=sequence_outputs["beam_indices"], encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, decoder_attentions=decoder_attentions, @@ -1386,11 +1355,13 @@ def greedy_search( decoder_hidden_states=decoder_hidden_states, ) else: - return GreedySearchDecoderOnlyOutput( - sequences=input_ids, + return BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + beam_indices=sequence_outputs["beam_indices"], attentions=decoder_attentions, hidden_states=decoder_hidden_states, ) else: - return input_ids + return sequence_outputs["sequences"] diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 23396002e..b52e7e863 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -26,19 +26,12 @@ import torch from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig -from transformers.generation.beam_search import BeamScorer from transformers.generation.logits_process import ( LogitsProcessorList, ) from transformers.generation.stopping_criteria import ( - MaxLengthCriteria, - MaxTimeCriteria, StoppingCriteriaList, ) -from transformers.generation.utils import ( - BeamSearchOutput, - GreedySearchOutput, -) from transformers.modeling_outputs import Seq2SeqLMOutput from ..exporters.neuron import ( @@ -59,6 +52,7 @@ if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel + from transformers.utils import ModelOutput if is_neuronx_available(): import torch_neuronx @@ -357,6 +351,10 @@ def forward( decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, beam_scores=None, + # Leave following kwargs for compatibility, will not have any effect. 
+ return_dict: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: hidden_states = encoder_outputs["last_hidden_state"] @@ -418,290 +416,14 @@ def generate( max_length=kwargs.pop("max_length", None) or max_length, num_beams=num_beams, do_sample=kwargs.pop("do_sample", False), - use_cache=kwargs.pop( - "use_cache", False - ), # `use_cache` is supported by default in `optimum-neuron`, set to False to avoid warning + use_cache=True, # pkv is cached by default decoder_attention_mask=decoder_attention_mask, # Pass fake encoder_outputs so the transfomers code will not invoke the encoder encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, + is_traced_inference=True, ) return output - def beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: "BeamScorer", - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - seq_length: Optional[int] = None, - **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: - """ - Overriding beam search to use next_token_scores returned from neuron device instead of logits. - """ - if logits_processor is not None: - logger.warning( - "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." - ) - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - # Overwrite cur_len - cur_len = seq_length - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None - ) - - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - beam_scores_device = "cpu" - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - while True: - # prepare model inputs - # From max_length-sized input_ids, select first - # cur_len - 1 values. 
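# --- Illustrative sketch (assumed toy values, not the library implementation) ---
# The loop around this point keeps `input_ids` pre-allocated at `max_length` so the
# compiled decoder always sees static shapes: the last generated token is gathered
# with an explicit index pair, and new tokens are written in place rather than
# concatenated. A minimal, self-contained version of that pattern:
import torch

batch_size, max_length, cur_len = 2, 8, 3
input_ids = torch.zeros((batch_size, max_length), dtype=torch.long)
input_ids[:, :cur_len] = torch.tensor([[5, 6, 7], [8, 9, 10]])

# Gather only the token at position `cur_len - 1`, keeping a (batch_size, 1) shape
# so the traced decoder input never changes rank or size.
update_indices = torch.stack(
    [torch.arange(batch_size), torch.tensor(cur_len - 1).repeat(batch_size)], dim=-1
)
last_tokens = input_ids[update_indices[:, 0], update_indices[:, 1], None]
assert last_tokens.shape == (batch_size, 1)

# Write the next token at position `cur_len` in place; the buffer never grows.
next_tokens = torch.tensor([11, 12])
input_ids[torch.arange(batch_size), cur_len] = next_tokens
# --- End of sketch ---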
- update_indices = torch.stack( - [torch.arange(input_ids.size(0)), torch.tensor(cur_len - 1).repeat(input_ids.size(0))], dim=-1 - ) - input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] - model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) - - next_token_scores, next_tokens, next_indices = self(**model_inputs, beam_scores=beam_scores) - - # stateless - beam_outputs = beam_scorer.process( - input_ids.to("cpu")[:, :cur_len], - next_token_scores.to("cpu"), - next_tokens.to("cpu"), - next_indices.to("cpu"), - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - ) - - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - update_indices = torch.stack( - [torch.arange(batch_beam_size), torch.tensor(cur_len - 1).repeat(batch_beam_size)], dim=-1 - ) - update_indices_2 = torch.stack( - [torch.arange(batch_beam_size), torch.tensor(cur_len).repeat(batch_beam_size)], dim=-1 - ) - # First select beam_indices - device = input_ids.device - beam_idx_device = beam_idx.to(device=input_ids.device) - input_ids[:, :] = input_ids[beam_idx_device.long(), :] - - # Then append new tokens - input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = ( - beam_next_tokens.unsqueeze(-1).to(device).to(torch.long) - ) - input_ids = input_ids * 1 # Hack to materialize tensor - - # update generated ids, model inputs, and length for next step - model_kwargs = self._update_model_kwargs_for_xla_generation( - model_kwargs, - batch_size=batch_beam_size, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - self._reorder_cache(beam_idx.to(torch.int64)) - - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - - # increase cur_len - cur_len = cur_len + 1 - - # stop when each sentence is finished, or if we exceed the maximum length - stop_criterion_1 = beam_scorer.is_done - if isinstance(stopping_criteria, list): - if len(stopping_criteria) == 1: - stopping_criteria = stopping_criteria[0] - - # Cases that can be handled in XLA without requiring - # non-padded input_ids - if isinstance(stopping_criteria, MaxLengthCriteria): - stop_criterion_2 = cur_len >= stopping_criteria.max_length - elif isinstance(stopping_criteria, MaxTimeCriteria): - stop_criterion_2 = stopping_criteria(input_ids, scores) - else: - # Other cases will be handled on CPU - batch_size, _ = input_ids.shape - input_ids_cpu = input_ids.to("cpu") - mask = torch.cat( - [torch.ones(batch_size, cur_len), torch.zeros(batch_size, input_ids.shape[1] - cur_len)], dim=1 - ).bool() - input_ids_cpu = torch.masked_select(input_ids_cpu, mask).reshape((batch_size, cur_len)) - scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores - stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) - - if stop_criterion_1 or stop_criterion_2: - if not synced_gpus: - break - - sequence_outputs = beam_scorer.finalize( - input_ids.to("cpu"), - beam_scores.to("cpu"), - next_tokens.to("cpu"), - next_indices.to("cpu"), - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=beam_indices, - ) - - for k, v in sequence_outputs.items(): - if type(v) == torch.Tensor: - sequence_outputs[k] = sequence_outputs[k].to(input_ids.device) - - return sequence_outputs["sequences"] - - def greedy_search( - self, - input_ids: torch.LongTensor, - 
logits_processor: Optional["LogitsProcessorList"] = None, - stopping_criteria: Optional["StoppingCriteriaList"] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - seq_length: Optional[int] = int, - **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: - """ - Overriding greedy sampling to use next tokens returned from neuron device instead of logits. - """ - # init values - if logits_processor is not None: - logger.warning( - "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." - ) - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - from transformers.generation.stopping_criteria import validate_stopping_criteria - - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - - # keep track of which sequences are already finished - unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) - - this_peer_finished = False # used by synced_gpus only - while True: - # prepare model inputs - # From max_length-sized input_ids, select first - # seq_length - 1 values. 
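# --- Illustrative sketch (assumed toy values, not the library implementation) ---
# The greedy loop below pads every finished row instead of shrinking the batch:
# `unfinished_sequences` holds 1 while a row is still generating and drops to 0 once
# it has emitted EOS, after which that row keeps producing the pad token. A minimal,
# self-contained version of that masking step:
import torch

pad_token_id, eos_token_id = 0, 2
eos_token_id_tensor = torch.tensor([eos_token_id])
unfinished_sequences = torch.ones(3, dtype=torch.long)

next_tokens = torch.tensor([4, 2, 7])  # the middle row just produced EOS
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

# Mark rows that hit EOS as finished for all subsequent steps.
unfinished_sequences = unfinished_sequences.mul(
    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
)
assert unfinished_sequences.tolist() == [1, 0, 1]
# --- End of sketch ---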
- - if model_kwargs.get("past_key_values") is None: - input_ids_ = input_ids[:, :seq_length] - else: - update_indices = torch.stack( - [torch.arange(input_ids.size(0)), torch.tensor(seq_length - 1).repeat(input_ids.size(0))], - dim=-1, - ) - input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] - - model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) - - # forward pass to get next token - output = self(**model_inputs) - next_tokens = output[0] - - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - - batch_size, _ = input_ids.shape - update_indices = torch.stack( - [torch.arange(batch_size), torch.tensor(seq_length).repeat(batch_size)], dim=-1 - ) - input_ids[update_indices[:, 0], update_indices[:, 1]] = next_tokens[:] - model_kwargs = self._update_model_kwargs_for_xla_generation( - model_kwargs, - batch_size=batch_size, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - - seq_length += 1 - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - - # stop when each sentence is finished, or if we exceed the maximum length - stop_criterion_1 = unfinished_sequences.max() == 0 - - if isinstance(stopping_criteria, list): - if len(stopping_criteria) == 1: - stopping_criteria = stopping_criteria[0] - - # Cases that can be handled in XLA without requiring - # non-padded input_ids - if isinstance(stopping_criteria, MaxLengthCriteria): - stop_criterion_2 = seq_length >= stopping_criteria.max_length - elif isinstance(stopping_criteria, MaxTimeCriteria): - stop_criterion_2 = stopping_criteria(input_ids, scores) - else: - # Other cases will be handled on CPU - batch_size, _ = input_ids.shape - mask = torch.cat( - [torch.ones(batch_size, seq_length), torch.zeros(batch_size, input_ids.shape[1] - seq_length)], - dim=1, - ).bool() - input_ids_cpu = torch.masked_select(input_ids, mask).reshape((batch_size, seq_length)).to("cpu") - scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores - stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) - - if stop_criterion_1 or stop_criterion_2: - this_peer_finished = True - - if this_peer_finished: - break - - return input_ids - def _reorder_cache(self, beam_idx): """ The cache was reordered during the tracing of the decoder so we can skip it here. This is needed for beam search and not greedy sampling. @@ -716,6 +438,12 @@ def _update_model_kwargs_for_xla_generation( model_kwargs: Dict[str, Any], batch_size: int, is_encoder_decoder: bool = False, + # Leave following kwargs for compatibility, will not have any effect. 
+ outputs: "ModelOutput" = None, + standardize_cache_format: bool = False, + max_length: Optional[int] = None, + seq_length: Optional[int] = None, + use_cache: bool = True, ) -> Dict[str, Any]: mask = self._update_attention(model_kwargs, batch_size, is_encoder_decoder) # sets the updated variables (mask and past_key_values) From e8d72c2b5192b59e51620db658229c7dc4c72ddc Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 28 Nov 2023 22:58:01 +0000 Subject: [PATCH 23/30] fix beam --- optimum/neuron/generation/utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 11f64d88e..e56ddd0b1 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -1150,9 +1150,6 @@ def beam_search( # prepare model inputs if model_kwargs["use_cache"]: - import pdb - - pdb.set_trace() # From max_length-sized input_ids, select first # cur_len - 1 values. update_indices = torch.stack( @@ -1164,7 +1161,8 @@ def beam_search( model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) if is_traced_inference: - next_token_scores, next_tokens, next_indices = self(**model_inputs, beam_scores=beam_scores) + outputs = self(**model_inputs, beam_scores=beam_scores) + next_token_scores, next_tokens, next_indices = outputs else: outputs = self( **model_inputs, @@ -1270,13 +1268,21 @@ def beam_search( input_ids[:, :] = input_ids[beam_idx_device.long(), :] # Then append new tokens - input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = beam_next_tokens.unsqueeze(-1).to(device) + if is_traced_inference: + # int64 is not natively supported by inf2 and has been cast down to int32 + input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = ( + beam_next_tokens.unsqueeze(-1).to(device).to(torch.long) + ) + else: + input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = beam_next_tokens.unsqueeze(-1).to( + device + ) input_ids = input_ids * 1 # Hack to materialize tensor # update generated ids, model inputs, and length for next step model_kwargs = self._update_model_kwargs_for_xla_generation( - outputs, - model_kwargs, + outputs=outputs, + model_kwargs=model_kwargs, batch_size=batch_beam_size, is_encoder_decoder=self.config.is_encoder_decoder, max_length=stopping_criteria.max_length, From dd4b1c7fe464a67da6d3a04b1350a690c2a4ef6e Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Nov 2023 00:51:08 +0000 Subject: [PATCH 24/30] fix tests --- optimum/exporters/neuron/convert.py | 8 +++++--- optimum/neuron/modeling_base.py | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 9cace43f7..0c2d5eef3 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -169,9 +169,11 @@ def validate_model_outputs( with torch.no_grad(): reference_model.eval() ref_inputs = config.generate_dummy_inputs(return_tuple=False, **input_shapes) - if reference_model.config.is_encoder_decoder: + if getattr(reference_model.config, "is_encoder_decoder", False): reference_model = config.patch_model_for_export(reference_model, device="cpu", **input_shapes) - if "AutoencoderKL" in getattr(config._config, "_class_name", "") or reference_model.config.is_encoder_decoder: + if "AutoencoderKL" in getattr(config._config, "_class_name", "") or getattr( + reference_model.config, "is_encoder_decoder", False + ): # VAE components for stable 
diffusion or Encoder-Decoder models ref_inputs = tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) @@ -428,7 +430,7 @@ def export_neuronx( dummy_inputs_tuple = tuple(dummy_inputs.values()) aliases = {} - if model.config.is_encoder_decoder: + if getattr(model.config, "is_encoder_decoder", False): checked_model = config.patch_model_for_export(model, **input_shapes) if getattr(config, "is_decoder", False): aliases = config.generate_io_aliases(checked_model) diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 7b3a28ecd..144826740 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -407,7 +407,9 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronConfig": # Neuron config constructuor task = getattr(config, "task") or TasksManager.infer_task_from_model(cls.auto_model_class) task = TasksManager.map_from_synonym(task) - model_type = neuron_configs.get("model_type", None) or config.model_type + model_type = neuron_configs.get("model_type", None) + if not (model_type and model_type != "None"): + model_type = config.model_type neuron_config_constructor = TasksManager.get_exporter_config_constructor( model_type=model_type, exporter="neuron", task=task ) From df7cde79eef301bffb1ef6208c4f4490b694b775 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Nov 2023 18:24:49 +0000 Subject: [PATCH 25/30] support optional outputs for decoder --- optimum/commands/export/neuronx.py | 10 +++++++ optimum/exporters/neuron/__main__.py | 22 ++++++++++++++- optimum/exporters/neuron/base.py | 7 +++++ optimum/exporters/neuron/config.py | 9 +++++++ optimum/exporters/neuron/convert.py | 1 + optimum/exporters/neuron/model_configs.py | 9 ++++++- optimum/exporters/neuron/model_wrappers.py | 31 +++++++++++++++++++--- optimum/exporters/neuron/utils.py | 4 +++ optimum/neuron/generation/utils.py | 15 ++++++----- optimum/neuron/utils/argument_utils.py | 4 +++ 10 files changed, 99 insertions(+), 13 deletions(-) diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py index 1278b604b..5761bac44 100644 --- a/optimum/commands/export/neuronx.py +++ b/optimum/commands/export/neuronx.py @@ -141,6 +141,16 @@ def parse_args_neuronx(parser: "ArgumentParser"): "UNet model ID on huggingface.co or path on disk to load model from. This will replace the unet in the original Stable Diffusion pipeline." ), ) + optional_group.add_argument( + "--output_hidden_states", + action="store_true", + help=("Whether or not for the traced model to return the hidden states of all layers."), + ) + optional_group.add_argument( + "--output_attentions", + action="store_true", + help=("Whether or not for the traced model to return the attentions tensors of all attention layers."), + ) class NeuronxExportCommand(BaseOptimumCLICommand): diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 70abd9619..7d65fb241 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -121,6 +121,18 @@ def normalize_input_shapes(task: str, args: argparse.Namespace) -> Dict[str, int return input_shapes +def customize_optional_outputs(args: argparse.Namespace) -> Dict[str, bool]: + """ + Customize optional outputs of the traced model, eg. if `output_attentions=True`, the attentions tensors will be traced. 
+ """ + possible_outputs = ["output_attentions", "output_hidden_states"] + + customized_outputs = {} + for name in possible_outputs: + customized_outputs[name] = getattr(args, name, False) + return customized_outputs + + def normalize_stable_diffusion_input_shapes( args: argparse.Namespace, ) -> Dict[str, Dict[str, int]]: @@ -190,6 +202,7 @@ def _get_submodels_and_neuron_configs( dynamic_batch_size: bool = False, model_name_or_path: Optional[Union[str, Path]] = None, submodels: Dict[str, Union[Path, str]] = None, + optional_outputs: Dict[str, bool] = None, ): is_stable_diffusion = "stable-diffusion" in task is_encoder_decoder = ( @@ -202,7 +215,7 @@ def _get_submodels_and_neuron_configs( ) elif is_encoder_decoder: models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_encoder_decoder( - model, input_shapes, task, output, dynamic_batch_size, model_name_or_path + model, input_shapes, task, output, dynamic_batch_size, model_name_or_path, optional_outputs ) else: neuron_config_constructor = TasksManager.get_exporter_config_constructor( @@ -273,6 +286,7 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder( output: Path, dynamic_batch_size: bool = False, model_name_or_path: Optional[Union[str, Path]] = None, + optional_outputs: Dict[str, bool] = None, ): if is_neuron_available(): raise RuntimeError( @@ -284,6 +298,7 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder( task=task, dynamic_batch_size=dynamic_batch_size, input_shapes=input_shapes, + optional_outputs=optional_outputs, ) output_model_names = { ENCODER_NAME: os.path.join(ENCODER_NAME, NEURON_FILE_NAME), @@ -310,6 +325,7 @@ def main_export( use_auth_token: Optional[Union[bool, str]] = None, do_validation: bool = True, submodels: Dict[str, Union[Path, str]] = None, + optional_outputs: Dict[str, bool] = None, **input_shapes, ): output = Path(output) @@ -341,6 +357,7 @@ def main_export( dynamic_batch_size=dynamic_batch_size, model_name_or_path=model_name_or_path, submodels=submodels, + optional_outputs=optional_outputs, ) _, neuron_outputs = export_models( @@ -408,6 +425,8 @@ def main(): input_shapes = normalize_input_shapes(task, args) submodels = None + optional_outputs = customize_optional_outputs(args) + main_export( model_name_or_path=args.model, output=args.output, @@ -419,6 +438,7 @@ def main(): trust_remote_code=args.trust_remote_code, do_validation=not args.disable_validation, submodels=submodels, + optional_outputs=optional_outputs, **input_shapes, ) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 6b005869f..c5e3c9cbf 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -120,6 +120,8 @@ def __init__( point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, num_beams: int = 1, + output_attentions: bool = False, + output_hidden_states: bool = False, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", @@ -156,6 +158,11 @@ def __init__( input_shapes[name] = value setattr(self, name, value) setattr(self, "input_shapes", input_shapes) + setattr( + self, + "optional_outputs", + {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states}, + ) setattr(self, "compiler_type", compiler_type) setattr(self, "compiler_version", compiler_version) diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index fccac7e39..597b886c7 100644 --- a/optimum/exporters/neuron/config.py +++ 
b/optimum/exporters/neuron/config.py @@ -118,6 +118,15 @@ def outputs(self) -> List[str]: + [f"past.{idx}.cross.key" for idx in range(self._config.num_decoder_layers)] + [f"past.{idx}.cross.value" for idx in range(self._config.num_decoder_layers)] ) + if self.optional_outputs["output_attentions"]: + # Flatten attentions tensors of all attention layers + common_outputs += [f"decoder_attention.{idx}" for idx in range(self._config.num_decoder_layers)] + if self.optional_outputs["output_hidden_states"]: + # Flatten hidden states of all layers + common_outputs += [ + f"decoder_hidden_state.{idx}" for idx in range(self._config.num_decoder_layers + 1) + ] # +1 for the embedding layer + return common_outputs def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 0c2d5eef3..a3bfa9857 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -340,6 +340,7 @@ def export_models( compiler_version=NEURON_COMPILER_VERSION, model_type=getattr(sub_neuron_config, "MODEL_TYPE", None), task=getattr(sub_neuron_config, "task", None), + optional_outputs=getattr(sub_neuron_config, "optional_outputs", None), ) if isinstance(model_config, PretrainedConfig): model_config = DiffusersPretrainedConfig.from_dict(model_config.__dict__) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index e4dda2fa5..eaf03ba51 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -457,8 +457,15 @@ def patch_model_for_export(self, model, device="xla", **kwargs): batch_size = kwargs.pop("batch_size", 1) sequence_length = kwargs.pop("sequence_length", 1) num_beams = kwargs.pop("num_beams", 1) + return self.CUSTOM_MODEL_WRAPPER( - model, batch_size=batch_size, sequence_length=sequence_length, num_beams=num_beams, device=device + model, + batch_size=batch_size, + sequence_length=sequence_length, + num_beams=num_beams, + output_hidden_states=self.optional_outputs["output_hidden_states"], + output_attentions=self.optional_outputs["output_attentions"], + device=device, ) def generate_io_aliases(self, model): diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index abc63c114..31d6d00ba 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -143,6 +143,8 @@ def __init__( batch_size: int, sequence_length: int, num_beams: int = 1, + output_hidden_states: bool = False, + output_attentions: bool = False, device: str = "xla", tp_degree: Optional[int] = None, ): @@ -152,6 +154,8 @@ def __init__( self.batch_size = batch_size self.sequence_length = sequence_length self.num_beams = num_beams + self.output_hidden_states = output_hidden_states + self.output_attentions = output_attentions self.device = device self.tp_degree = tp_degree @@ -259,12 +263,21 @@ def forward( encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, use_cache=True, - output_attentions=False, - output_hidden_states=False, + output_attentions=self.output_attentions, + output_hidden_states=self.output_hidden_states, ) last_hidden_state = decoder_output["last_hidden_state"] past_key_values = decoder_output["past_key_values"] + if self.output_hidden_states: + decoder_hidden_states = [ + hidden_state for hidden_state in decoder_output["hidden_states"] + ] # flatten `hidden_states` which is a 
tuple of tensors + + if self.output_attentions: + decoder_attentions = [ + attention for attention in decoder_output["attentions"] + ] # flatten `hidden_states` which is a tuple of tensors if self.config.tie_word_embeddings: # Rescale output before projecting on vocab @@ -307,8 +320,18 @@ def forward( next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") next_tokens = next_tokens % vocab_size - return [next_token_scores, next_tokens, next_indices] + past_key_values_sa + past_key_values_ca + neuron_outputs = [next_token_scores, next_tokens, next_indices] + past_key_values_sa + past_key_values_ca + else: # Greedy next_tokens = torch.argmax(next_token_logits, dim=-1) - return [next_tokens] + past_key_values_sa + past_key_values_ca + + neuron_outputs = [next_tokens] + past_key_values_sa + past_key_values_ca + + if self.output_hidden_states: + neuron_outputs += decoder_hidden_states + + if self.output_attentions: + neuron_outputs += decoder_attentions + + return neuron_outputs diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 0e7741ec8..afca52597 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -352,6 +352,7 @@ def get_encoder_decoder_models_for_export( task: str, input_shapes: Dict[str, int], dynamic_batch_size: Optional[bool] = False, + optional_outputs: Dict[str, bool] = None, ) -> Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]: """ Returns the components of an encoder-decoder model and their subsequent neuron configs. @@ -366,6 +367,8 @@ def get_encoder_decoder_models_for_export( Static shapes used for compiling the encoder and the decoder. dynamic_batch_size (`bool`, defaults to `False`): Whether the Neuron compiled model supports dynamic batch size. + optional_outputs (`Dict[str, bool]`, defaults to `None`) + Whether to trace some optional output tensors. Returns: `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and @@ -397,6 +400,7 @@ def get_encoder_decoder_models_for_export( config=model.config, task=task, dynamic_batch_size=dynamic_batch_size, + **optional_outputs, **input_shapes, ) models_for_export[DECODER_NAME] = (model, decoder_neuron_config) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index e56ddd0b1..3476fe0a8 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -17,7 +17,7 @@ import copy import inspect import warnings -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -51,9 +51,6 @@ from transformers.utils import ModelOutput, logging -if TYPE_CHECKING: - pass - logger = logging.get_logger(__name__) @@ -873,6 +870,11 @@ def greedy_search( else: next_tokens = outputs[0] + if return_dict_in_generate and output_scores: + logger.warning( + "`output_scores` will be neglected because currently we do not trace `next_token_scores` for greedy search. If you want us to support the option during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." 
+ ) + # finished sentences should have their next token be a padding token if eos_token_id is not None: if pad_token_id is None: @@ -1068,7 +1070,7 @@ def beam_search( # init values if logits_processor is not None and is_traced_inference: logger.warning( - "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." + "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." ) elif logits_processor is None: logits_processor = LogitsProcessorList() @@ -1130,7 +1132,6 @@ def beam_search( # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens # of the first beam are considered to avoid sampling the exact same tokens across all beams. - # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores_device = "cpu" beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) beam_scores[:, 1:] = -1e9 @@ -1203,7 +1204,7 @@ def beam_search( xm.mark_step() # We don't want to change every single logit processor, so - # we peform this processing on CPU. + # we perform this processing on CPU. input_ids_ = input_ids.to("cpu")[:, :cur_len] next_token_scores_ = next_token_scores.to("cpu") next_token_scores_processed = logits_processor(input_ids_, next_token_scores_) diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index b7e9b4ab0..5400c3065 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -147,6 +147,7 @@ def store_compilation_config( compiler_version: str, model_type: Optional[str] = None, task: str = None, + optional_outputs: Dict[str, bool] = None, **kwargs, ): if isinstance(config, OrderedDict): @@ -182,6 +183,9 @@ def store_compilation_config( elif neuron_model_type != original_model_type: config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. 
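# --- Illustrative sketch (assumed values, not produced by the exporter) ---
# `store_compilation_config` records these arguments under the `neuron` section of
# the exported `config.json`, which is how the runtime can later tell whether the
# optional tensors were traced. A rough picture of that section and of how it could
# be queried; every concrete value below is an assumption for illustration.
neuron_section = {
    "model_type": "t5-decoder",  # Neuron custom model_type, as noted above
    "task": "text2text-generation",
    "optional_outputs": {"output_attentions": True, "output_hidden_states": True},
}

optional_outputs = neuron_section.get("optional_outputs", {})
if optional_outputs.get("output_attentions", False):
    print("Attention tensors were traced and can be returned at generation time.")
# --- End of sketch ---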
+ if optional_outputs is not None: + config_args["optional_outputs"] = optional_outputs + update_func("neuron", config_args) if hasattr(config, "_diffusers_version"): From 92cd6e5b297fcfa465b9fe85dc4a34878fa27378 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 1 Dec 2023 16:07:23 +0000 Subject: [PATCH 26/30] enhance tests --- optimum/exporters/neuron/__main__.py | 36 ++++++-- optimum/exporters/neuron/base.py | 7 +- optimum/exporters/neuron/config.py | 12 ++- optimum/exporters/neuron/convert.py | 3 +- optimum/exporters/neuron/model_configs.py | 4 +- optimum/exporters/neuron/model_wrappers.py | 6 +- optimum/exporters/neuron/utils.py | 12 ++- optimum/neuron/generation/utils.py | 102 ++++++++++++--------- optimum/neuron/modeling_base.py | 17 ++-- optimum/neuron/modeling_seq2seq.py | 55 ++++++++--- optimum/neuron/utils/argument_utils.py | 8 +- tests/cli/test_export_cli.py | 33 ++++++- tests/exporters/test_export.py | 4 +- tests/generation/conftest.py | 46 +++++++++- tests/generation/test_generate.py | 80 +++++++++++++++- 15 files changed, 324 insertions(+), 101 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 7d65fb241..0fcf91a79 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -201,8 +201,9 @@ def _get_submodels_and_neuron_configs( output: Path, dynamic_batch_size: bool = False, model_name_or_path: Optional[Union[str, Path]] = None, - submodels: Dict[str, Union[Path, str]] = None, - optional_outputs: Dict[str, bool] = None, + submodels: Optional[Dict[str, Union[Path, str]]] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, ): is_stable_diffusion = "stable-diffusion" in task is_encoder_decoder = ( @@ -210,14 +211,25 @@ def _get_submodels_and_neuron_configs( ) if is_stable_diffusion: + # TODO: Enable optional outputs for Stable Diffusion + if output_attentions or output_hidden_states: + raise ValueError( + f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet." + ) models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_diffusion( model, input_shapes, task, output, dynamic_batch_size, submodels ) elif is_encoder_decoder: + optional_outputs = {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states} models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_encoder_decoder( - model, input_shapes, task, output, dynamic_batch_size, model_name_or_path, optional_outputs + model, input_shapes, task, output, dynamic_batch_size, model_name_or_path, **optional_outputs ) else: + # TODO: Enable optional outputs for encoders + if output_attentions or output_hidden_states: + raise ValueError( + f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet." 
+ ) neuron_config_constructor = TasksManager.get_exporter_config_constructor( model=model, exporter="neuron", task=task ) @@ -235,7 +247,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( task: str, output: Path, dynamic_batch_size: bool = False, - submodels: Dict[str, Union[Path, str]] = None, + submodels: Optional[Dict[str, Union[Path, str]]] = None, ): model = replace_stable_diffusion_submodels(model, submodels) check_compiler_compatibility_for_stable_diffusion() @@ -286,7 +298,8 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder( output: Path, dynamic_batch_size: bool = False, model_name_or_path: Optional[Union[str, Path]] = None, - optional_outputs: Dict[str, bool] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, ): if is_neuron_available(): raise RuntimeError( @@ -298,7 +311,8 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder( task=task, dynamic_batch_size=dynamic_batch_size, input_shapes=input_shapes, - optional_outputs=optional_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, ) output_model_names = { ENCODER_NAME: os.path.join(ENCODER_NAME, NEURON_FILE_NAME), @@ -324,8 +338,9 @@ def main_export( local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, do_validation: bool = True, - submodels: Dict[str, Union[Path, str]] = None, - optional_outputs: Dict[str, bool] = None, + submodels: Optional[Dict[str, Union[Path, str]]] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, **input_shapes, ): output = Path(output) @@ -357,7 +372,8 @@ def main_export( dynamic_batch_size=dynamic_batch_size, model_name_or_path=model_name_or_path, submodels=submodels, - optional_outputs=optional_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, ) _, neuron_outputs = export_models( @@ -438,7 +454,7 @@ def main(): trust_remote_code=args.trust_remote_code, do_validation=not args.disable_validation, submodels=submodels, - optional_outputs=optional_outputs, + **optional_outputs, **input_shapes, ) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index c5e3c9cbf..5f7277b53 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -158,11 +158,8 @@ def __init__( input_shapes[name] = value setattr(self, name, value) setattr(self, "input_shapes", input_shapes) - setattr( - self, - "optional_outputs", - {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states}, - ) + setattr(self, "output_attentions", output_attentions) + setattr(self, "output_hidden_states", output_hidden_states) setattr(self, "compiler_type", compiler_type) setattr(self, "compiler_version", compiler_version) diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 597b886c7..01a3ae86a 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -118,15 +118,19 @@ def outputs(self) -> List[str]: + [f"past.{idx}.cross.key" for idx in range(self._config.num_decoder_layers)] + [f"past.{idx}.cross.value" for idx in range(self._config.num_decoder_layers)] ) - if self.optional_outputs["output_attentions"]: - # Flatten attentions tensors of all attention layers - common_outputs += [f"decoder_attention.{idx}" for idx in range(self._config.num_decoder_layers)] - if self.optional_outputs["output_hidden_states"]: + + if self.output_hidden_states: # Flatten hidden states of all layers common_outputs += 
[ f"decoder_hidden_state.{idx}" for idx in range(self._config.num_decoder_layers + 1) ] # +1 for the embedding layer + if self.output_attentions: + # Flatten attentions tensors of all attention layers + common_outputs += [f"decoder_attention.{idx}" for idx in range(self._config.num_decoder_layers)] + if getattr(self._config, "is_encoder_decoder", False) is True: + common_outputs += [f"cross_attention.{idx}" for idx in range(self._config.num_decoder_layers)] + return common_outputs def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index a3bfa9857..6f712d4cb 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -340,7 +340,8 @@ def export_models( compiler_version=NEURON_COMPILER_VERSION, model_type=getattr(sub_neuron_config, "MODEL_TYPE", None), task=getattr(sub_neuron_config, "task", None), - optional_outputs=getattr(sub_neuron_config, "optional_outputs", None), + output_attentions=getattr(sub_neuron_config, "output_attentions", False), + output_hidden_states=getattr(sub_neuron_config, "output_hidden_states", False), ) if isinstance(model_config, PretrainedConfig): model_config = DiffusersPretrainedConfig.from_dict(model_config.__dict__) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index eaf03ba51..fe5835198 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -463,8 +463,8 @@ def patch_model_for_export(self, model, device="xla", **kwargs): batch_size=batch_size, sequence_length=sequence_length, num_beams=num_beams, - output_hidden_states=self.optional_outputs["output_hidden_states"], - output_attentions=self.optional_outputs["output_attentions"], + output_hidden_states=self.output_hidden_states, + output_attentions=self.output_attentions, device=device, ) diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 31d6d00ba..c9e3a6e93 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -277,7 +277,10 @@ def forward( if self.output_attentions: decoder_attentions = [ attention for attention in decoder_output["attentions"] - ] # flatten `hidden_states` which is a tuple of tensors + ] # flatten `decoder_attentions` which is a tuple of tensors + cross_attentions = [ + attention for attention in decoder_output["cross_attentions"] + ] # flatten `cross_attentions` which is a tuple of tensors if self.config.tie_word_embeddings: # Rescale output before projecting on vocab @@ -333,5 +336,6 @@ def forward( if self.output_attentions: neuron_outputs += decoder_attentions + neuron_outputs += cross_attentions return neuron_outputs diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index afca52597..b49817f40 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -352,7 +352,8 @@ def get_encoder_decoder_models_for_export( task: str, input_shapes: Dict[str, int], dynamic_batch_size: Optional[bool] = False, - optional_outputs: Dict[str, bool] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, ) -> Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]: """ Returns the components of an encoder-decoder model and their subsequent neuron configs. 
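# --- Illustrative sketch (hypothetical helper, not part of the exporter API) ---
# The traced decoder returns its optional tensors as one flat list appended after the
# next-token tensors and the KV cache, in the order declared above: hidden states
# (one per layer plus the embedding layer), then self-attentions, then cross-attentions.
# A small sketch of regrouping such a flat tail by name; `regroup_optional_outputs`
# and the toy sizes are assumptions for illustration only.
from typing import Dict, List, Sequence

import torch


def regroup_optional_outputs(
    flat_tail: Sequence[torch.Tensor], num_decoder_layers: int
) -> Dict[str, List[torch.Tensor]]:
    n_hidden = num_decoder_layers + 1  # +1 for the embedding layer
    return {
        "decoder_hidden_states": list(flat_tail[:n_hidden]),
        "decoder_attentions": list(flat_tail[n_hidden : n_hidden + num_decoder_layers]),
        "cross_attentions": list(flat_tail[n_hidden + num_decoder_layers :]),
    }


# Toy usage with 2 decoder layers: 3 hidden states + 2 self- + 2 cross-attentions.
grouped = regroup_optional_outputs([torch.zeros(1)] * 7, num_decoder_layers=2)
assert len(grouped["decoder_hidden_states"]) == 3
# --- End of sketch ---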
@@ -367,8 +368,10 @@ def get_encoder_decoder_models_for_export( Static shapes used for compiling the encoder and the decoder. dynamic_batch_size (`bool`, defaults to `False`): Whether the Neuron compiled model supports dynamic batch size. - optional_outputs (`Dict[str, bool]`, defaults to `None`) - Whether to trace some optional output tensors. + output_attentions (`bool`, defaults to `False`): + Whether or not for the traced model to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, defaults to `False`): + Whether or not for the traced model to return the hidden states of all layers. Returns: `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and @@ -400,7 +403,8 @@ def get_encoder_decoder_models_for_export( config=model.config, task=task, dynamic_batch_size=dynamic_batch_size, - **optional_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, **input_shapes, ) models_for_export[DECODER_NAME] = (model, decoder_neuron_config) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 3476fe0a8..51027af4d 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -416,7 +416,7 @@ def generate( model_kwargs["use_cache"] = generation_config.use_cache accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) - requires_attention_mask = "encoder_outputs" not in model_kwargs + requires_attention_mask = "encoder_outputs" not in model_kwargs and not is_traced_inference if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( @@ -434,7 +434,7 @@ def generate( "generation results, please set `padding_side='left'` when initializing the tokenizer." ) - if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: + if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs and not is_traced_inference: # if model is encoder decoder encoder_outputs are created # and added to `model_kwargs` model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( @@ -767,7 +767,14 @@ def greedy_search( ) # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None + scores = None + if return_dict_in_generate and output_scores: + if is_traced_inference: + logger.warning( + "`output_scores` will be neglected because currently we do not trace `next_token_scores` for greedy search (we do only in beam search). If you want us to support the option during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." 
+ ) + else: + scores = () decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None @@ -847,32 +854,28 @@ def greedy_search( next_tokens_scores = logits_processor(input_ids.to("cpu")[:, :seq_length], next_token_logits.to("cpu")) next_tokens_scores = next_tokens_scores.to(input_ids.device) - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_tokens_scores,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - # argmax next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + if return_dict_in_generate and output_scores: + scores += (next_tokens_scores,) else: next_tokens = outputs[0] - if return_dict_in_generate and output_scores: - logger.warning( - "`output_scores` will be neglected because currently we do not trace `next_token_scores` for greedy search. If you want us to support the option during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) ) # finished sentences should have their next token be a padding token @@ -1162,8 +1165,19 @@ def beam_search( model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) if is_traced_inference: - outputs = self(**model_inputs, beam_scores=beam_scores) - next_token_scores, next_tokens, next_indices = outputs + outputs = self( + **model_inputs, + beam_scores=beam_scores, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + next_token_scores = outputs.next_token_scores + next_tokens = outputs.next_tokens + next_indices = outputs.next_indices + + if return_dict_in_generate and output_scores: + scores += (next_token_scores,) else: outputs = self( **model_inputs, @@ -1211,24 +1225,6 @@ def beam_search( next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores) - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores_processed,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - # reshape for beam search vocab_size = next_token_scores.shape[-1] next_token_scores = 
next_token_scores.view(batch_size, num_beams * vocab_size) @@ -1242,6 +1238,24 @@ def beam_search( next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") next_tokens = next_tokens % vocab_size + if return_dict_in_generate and output_scores: + scores += (next_token_scores_processed,) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) # stateless beam_outputs = beam_scorer.process( input_ids.to("cpu")[:, :cur_len], diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 144826740..d9daa46ac 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -297,14 +297,15 @@ def _from_transformers( ) store_compilation_config( - config, - input_shapes, - compiler_kwargs, - input_names, - output_names, - dynamic_batch_size, - compiler_type, - compiler_version, + config=config, + input_shapes=input_shapes, + compiler_kwargs=compiler_kwargs, + input_names=input_names, + output_names=output_names, + dynamic_batch_size=dynamic_batch_size, + compiler_type=compiler_type, + compiler_version=compiler_version, + task=task, ) config.save_pretrained(save_dir_path) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index b52e7e863..3395cc7a6 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -32,7 +32,7 @@ from transformers.generation.stopping_criteria import ( StoppingCriteriaList, ) -from transformers.modeling_outputs import Seq2SeqLMOutput +from transformers.modeling_outputs import ModelOutput from ..exporters.neuron import ( NeuronConfig, @@ -268,6 +268,8 @@ def _from_transformers( disable_fast_relayout: Optional[bool] = False, disable_fallback: bool = False, dynamic_batch_size: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, **kwargs_shapes, ) -> "NeuronModelForConditionalGeneration": if dynamic_batch_size is True: @@ -304,6 +306,8 @@ def _from_transformers( local_files_only=local_files_only, use_auth_token=use_auth_token, do_validation=False, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, **kwargs_shapes, ) @@ -350,12 +354,11 @@ def forward( decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - beam_scores=None, - # Leave following kwargs for compatibility, will not have any effect. 
+ beam_scores: Optional[torch.FloatTensor] = None, return_dict: bool = False, output_attentions: bool = False, output_hidden_states: bool = False, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + ) -> Union[Tuple[torch.FloatTensor], ModelOutput]: hidden_states = encoder_outputs["last_hidden_state"] if not hasattr(self, "beam_idx"): @@ -363,15 +366,40 @@ def forward( num_beams = attention_mask.shape[0] self.beam_idx = torch.arange(0, num_beams, dtype=torch.int64) - decoder_outputs = self.decoder( + outputs = self.decoder( decoder_input_ids, decoder_attention_mask, hidden_states, attention_mask, self.beam_idx, beam_scores ) - next_token_scores = decoder_outputs[0] - next_tokens = decoder_outputs[1] - next_indices = decoder_outputs[2] + # Fetch optional outputs + cur_idx = 0 + cross_attentions = None + decoder_attentions = None + decoder_hidden_states = None + + # Skip pkv which can't be copied from memory to buffer + if output_attentions and self.config.neuron.get("output_attentions"): + if self.config.is_encoder_decoder: + cross_attentions = outputs[-self.config.num_decoder_layers :] + cur_idx += self.config.num_decoder_layers + decoder_attentions = outputs[-(self.config.num_decoder_layers + cur_idx) : -cur_idx] + cur_idx += self.config.num_decoder_layers + + if output_hidden_states and self.config.neuron.get("output_hidden_states"): + decoder_hidden_states = outputs[-(self.config.num_decoder_layers + 1 + cur_idx) : -cur_idx] + + decoder_outputs = ModelOutput( + next_token_scores=outputs[0], + next_tokens=outputs[1], + next_indices=outputs[2], + cross_attentions=cross_attentions, + decoder_attentions=decoder_attentions, + decoder_hidden_states=decoder_hidden_states, + ) - return next_token_scores, next_tokens, next_indices + if return_dict: + return decoder_outputs + else: + return decoder_outputs.to_tuple() def generate( self, @@ -382,7 +410,7 @@ def generate( stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, assistant_model: Optional["PreTrainedModel"] = None, - num_return_sequences: Optional[int] = None, + num_return_sequences: int = 1, **kwargs, ): max_length = self.neuron_configs[ENCODER_NAME].sequence_length @@ -414,9 +442,14 @@ def generate( assistant_model=assistant_model, num_return_sequences=num_return_sequences, max_length=kwargs.pop("max_length", None) or max_length, + max_new_tokens=kwargs.pop("max_new_tokens", None), + output_attentions=kwargs.pop("output_attentions", False), + output_hidden_states=kwargs.pop("output_hidden_states", False), + output_scores=kwargs.pop("output_scores", False), + return_dict_in_generate=kwargs.pop("return_dict_in_generate", False), num_beams=num_beams, do_sample=kwargs.pop("do_sample", False), - use_cache=True, # pkv is cached by default + use_cache=True, # pkv is cached by default in decoder_attention_mask=decoder_attention_mask, # Pass fake encoder_outputs so the transfomers code will not invoke the encoder encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index 5400c3065..4798136e1 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -147,7 +147,8 @@ def store_compilation_config( compiler_version: str, model_type: Optional[str] = None, task: str = None, - optional_outputs: Dict[str, bool] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, 
**kwargs, ): if isinstance(config, OrderedDict): @@ -183,8 +184,9 @@ def store_compilation_config( elif neuron_model_type != original_model_type: config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. - if optional_outputs is not None: - config_args["optional_outputs"] = optional_outputs + # Add args of optional outputs + config_args["output_attentions"] = output_attentions + config_args["output_hidden_states"] = output_hidden_states update_func("neuron", config_args) diff --git a/tests/cli/test_export_cli.py b/tests/cli/test_export_cli.py index 61ed9d5af..a8abf30f4 100644 --- a/tests/cli/test_export_cli.py +++ b/tests/cli/test_export_cli.py @@ -250,7 +250,7 @@ def test_replace_unet(self): ) @requires_neuronx - def test_t5(self): + def test_encoder_decoder(self): model_id = "hf-internal-testing/tiny-random-t5" with tempfile.TemporaryDirectory() as tempdir: subprocess.run( @@ -277,3 +277,34 @@ def test_t5(self): shell=False, check=True, ) + + @requires_neuronx + def test_encoder_decoder_optional_outputs(self): + model_id = "hf-internal-testing/tiny-random-t5" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + [ + "optimum-cli", + "export", + "neuron", + "--model", + model_id, + "--task", + "text2text-generation", + "--batch_size", + "1", + "--sequence_length", + "18", + "--num_beams", + "4", + "--auto_cast", + "matmul", + "--auto_cast_type", + "bf16", + "--output_hidden_states", + "--output_attentions", + tempdir, + ], + shell=False, + check=True, + ) diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index 76b24a560..41507453a 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -225,7 +225,7 @@ class NeuronEncoderDecoderExportTestCase(unittest.TestCase): """ @parameterized.expand(ENCODER_DECODER_MODELS_TINY.items()) - def test_export_for_encoder_decoder_models(self, model_name, model_id): + def test_export_encoder_decoder_models(self, model_name, model_id): set_seed(SEED) # prepare neuron config / models @@ -239,6 +239,8 @@ def test_export_for_encoder_decoder_models(self, model_name, model_id): task="text2text-generation", output=Path(tmpdirname), model_name_or_path=model_id, + output_attentions=True, + output_hidden_states=True, ) _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py index 85f203f85..c39a03b38 100644 --- a/tests/generation/conftest.py +++ b/tests/generation/conftest.py @@ -65,7 +65,7 @@ def neuron_decoder_path(export_decoder_id): @pytest.fixture(scope="module") @requires_neuronx -def neuron_seq2seq_path(export_seq2seq_id): +def neuron_seq2seq_beam_path(export_seq2seq_id): model = NeuronModelForSeq2SeqLM.from_pretrained( export_seq2seq_id, export=True, batch_size=1, sequence_length=64, num_beams=4 ) @@ -79,6 +79,28 @@ def neuron_seq2seq_path(export_seq2seq_id): yield model_path +@pytest.fixture(scope="module") +@requires_neuronx +def neuron_seq2seq_beam_path_with_optional_outputs(export_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, + export=True, + batch_size=1, + sequence_length=64, + num_beams=4, + output_attentions=True, + output_hidden_states=True, + ) + model_dir = TemporaryDirectory() + model_path = model_dir.name + model.save_pretrained(model_path) + del model + # Yield instead of returning to keep a reference to the temporary directory. 
+ # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + yield model_path + + @pytest.fixture(scope="module") @requires_neuronx def neuron_seq2seq_greedy_path(export_seq2seq_id): @@ -95,6 +117,28 @@ def neuron_seq2seq_greedy_path(export_seq2seq_id): yield model_path +@pytest.fixture(scope="module") +@requires_neuronx +def neuron_seq2seq_greedy_path_with_optional_outputs(export_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, + export=True, + batch_size=1, + sequence_length=64, + num_beams=1, + output_attentions=True, + output_hidden_states=True, + ) + model_dir = TemporaryDirectory() + model_path = model_dir.name + model.save_pretrained(model_path) + del model + # Yield instead of returning to keep a reference to the temporary directory. + # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + yield model_path + + @pytest.fixture(scope="module") def neuron_push_decoder_id(export_decoder_id): model_name = export_decoder_id.split("/")[-1] diff --git a/tests/generation/test_generate.py b/tests/generation/test_generate.py index 06cbed335..f50b0fb59 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -63,11 +63,47 @@ def test_model_generation_input_dimensions(neuron_decoder_path): @is_inferentia_test @requires_neuronx -def test_seq2seq_generation_beam(neuron_seq2seq_path): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) - tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_path) +def test_seq2seq_generation_beam(neuron_seq2seq_beam_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_beam_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_beam_path) inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") - output = model.generate(**inputs, num_return_sequences=1) + + # 1. max length + output = model.generate(**inputs, num_return_sequences=2, max_length=5) + assert len(output[0]) <= 5 + + # 2. min length + output = model.generate(**inputs, num_return_sequences=2, min_length=10) + assert len(output[0]) >= 10 + + # 3. 
max new tokens + output = model.generate(**inputs, num_return_sequences=2, max_new_tokens=5) + assert len(output[0].unique()) <= 5 + + return output + + +@is_inferentia_test +@requires_neuronx +def test_seq2seq_generation_beam_with_optional_outputs(neuron_seq2seq_beam_path_with_optional_outputs): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_beam_path_with_optional_outputs) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_beam_path_with_optional_outputs) + inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") + + output = model.generate( + **inputs, + num_return_sequences=1, + max_length=20, + output_scores=True, + output_attentions=True, + output_hidden_states=True, + return_dict_in_generate=True, + ) + assert "scores" in output + assert "decoder_attentions" in output + assert "cross_attentions" in output + assert "decoder_hidden_states" in output + return output @@ -77,5 +113,39 @@ def test_seq2seq_generation_greedy(neuron_seq2seq_greedy_path): model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_greedy_path) inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") - output = model.generate(**inputs, num_return_sequences=1) + + # 1. max length + output = model.generate(**inputs, num_return_sequences=1, max_length=5) + assert len(output[0]) <= 5 + + # 2. min length + output = model.generate(**inputs, num_return_sequences=1, min_length=10) + assert len(output[0]) >= 10 + + # 3. max new tokens + output = model.generate(**inputs, num_return_sequences=1, max_new_tokens=5) + assert len(output[0].unique()) <= 5 + + return output + + +@is_inferentia_test +@requires_neuronx +def test_seq2seq_generation_greedy_with_optional_outputs(neuron_seq2seq_greedy_path_with_optional_outputs): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path_with_optional_outputs) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_greedy_path_with_optional_outputs) + inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") + + output = model.generate( + **inputs, + num_return_sequences=1, + max_length=20, + output_attentions=True, + output_hidden_states=True, + return_dict_in_generate=True, + ) + assert "decoder_attentions" in output + assert "cross_attentions" in output + assert "decoder_hidden_states" in output + return output From 6f69d6d5af63b903c2f929d77e1bea4bc6c04a7d Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 1 Dec 2023 16:29:59 +0000 Subject: [PATCH 27/30] fix style --- optimum/exporters/neuron/model_wrappers.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index c9e3a6e93..0b1ae4504 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -270,17 +270,17 @@ def forward( last_hidden_state = decoder_output["last_hidden_state"] past_key_values = decoder_output["past_key_values"] if self.output_hidden_states: - decoder_hidden_states = [ - hidden_state for hidden_state in decoder_output["hidden_states"] - ] # flatten `hidden_states` which is a tuple of tensors + decoder_hidden_states = list( + decoder_output["hidden_states"] + ) # flatten `hidden_states` which is a tuple of tensors if self.output_attentions: - decoder_attentions = [ - attention for attention in 
decoder_output["attentions"] - ] # flatten `decoder_attentions` which is a tuple of tensors - cross_attentions = [ - attention for attention in decoder_output["cross_attentions"] - ] # flatten `cross_attentions` which is a tuple of tensors + decoder_attentions = list( + decoder_output["attentions"] + ) # flatten `decoder_attentions` which is a tuple of tensors + cross_attentions = list( + decoder_output["cross_attentions"] + ) # flatten `cross_attentions` which is a tuple of tensors if self.config.tie_word_embeddings: # Rescale output before projecting on vocab From 9f461f8f746a0f8fd9020b869765062d21f029ba Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 1 Dec 2023 16:34:10 +0000 Subject: [PATCH 28/30] fix style --- optimum/neuron/modeling_seq2seq.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 3395cc7a6..3e6a4f45d 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -32,7 +32,6 @@ from transformers.generation.stopping_criteria import ( StoppingCriteriaList, ) -from transformers.modeling_outputs import ModelOutput from ..exporters.neuron import ( NeuronConfig, @@ -358,7 +357,7 @@ def forward( return_dict: bool = False, output_attentions: bool = False, output_hidden_states: bool = False, - ) -> Union[Tuple[torch.FloatTensor], ModelOutput]: + ) -> Union[Tuple[torch.FloatTensor], "ModelOutput"]: hidden_states = encoder_outputs["last_hidden_state"] if not hasattr(self, "beam_idx"): From d6a24b63bf9627b50edb96843fa0f552874bad79 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Sat, 2 Dec 2023 00:28:04 +0000 Subject: [PATCH 29/30] apply suggestions --- optimum/exporters/neuron/__main__.py | 12 ++++++------ optimum/exporters/neuron/convert.py | 2 +- optimum/exporters/neuron/model_configs.py | 10 ++-------- optimum/neuron/modeling_base.py | 12 +++++------- optimum/neuron/modeling_seq2seq.py | 17 ++++++++--------- optimum/neuron/utils/argument_utils.py | 8 +++++--- tests/generation/test_generate.py | 12 ++---------- 7 files changed, 29 insertions(+), 44 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 0fcf91a79..8e70ee4d7 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -249,8 +249,8 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( dynamic_batch_size: bool = False, submodels: Optional[Dict[str, Union[Path, str]]] = None, ): - model = replace_stable_diffusion_submodels(model, submodels) check_compiler_compatibility_for_stable_diffusion() + model = replace_stable_diffusion_submodels(model, submodels) if is_neuron_available(): raise RuntimeError( "Stable diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead." @@ -259,11 +259,11 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( # Saving the model config and preprocessor as this is needed sometimes. 
model.scheduler.save_pretrained(output.joinpath("scheduler")) - if hasattr(model, "tokenizer") and model.tokenizer is not None: + if getattr(model, "tokenizer", None) is not None: model.tokenizer.save_pretrained(output.joinpath("tokenizer")) - if hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None: + if getattr(model, "tokenizer_2", None) is not None: model.tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) - if hasattr(model, "feature_extractor") and model.feature_extractor is not None: + if getattr(model, "feature_extractor", None) is not None: model.feature_extractor.save_pretrained(output.joinpath("feature_extractor")) model.save_config(output) @@ -278,11 +278,11 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), } - if hasattr(model, "text_encoder") and model.text_encoder is not None: + if getattr(model, "text_encoder", None) is not None: output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = os.path.join( DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME ) - if hasattr(model, "text_encoder_2") and model.text_encoder_2 is not None: + if getattr(model, "text_encoder_2", None) is not None: output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = os.path.join( DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME ) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 6f712d4cb..d5b826ee6 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -223,7 +223,7 @@ def validate_model_outputs( value_failures = [] for i, (name, output) in enumerate(zip(neuron_output_names_list, neuron_outputs)): if isinstance(output, torch.Tensor): - ref_output = ref_outputs[name].numpy() if isinstance(ref_outputs, Dict) else ref_outputs[i].numpy() + ref_output = ref_outputs[name].numpy() if isinstance(ref_outputs, dict) else ref_outputs[i].numpy() output = output.numpy() elif isinstance(output, tuple): # eg. `hidden_states` of `AutoencoderKL` is a tuple of tensors. 
ref_output = torch.stack(ref_outputs[name]).numpy() diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index fe5835198..aa7d05fa8 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -31,6 +31,7 @@ NormalizedTextAndVisionConfig, is_diffusers_available, ) +from ...utils.normalized_config import T5LikeNormalizedTextConfig from ..tasks import TasksManager from .config import ( TextAndVisionNeuronConfig, @@ -416,14 +417,7 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-decoder" CUSTOM_MODEL_WRAPPER = T5DecoderWrapper - NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( - hidden_size="d_model", - num_attention_heads="num_heads", - encoder_num_layers="num_layers", - decoder_num_layers="num_decoder_layers", - key_value_dim="d_kv", - allow_new=True, - ) + NORMALIZED_CONFIG_CLASS = T5LikeNormalizedTextConfig @property def is_decoder(self) -> bool: diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index d9daa46ac..6cc1cd95c 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -393,10 +393,10 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronConfig": ) return - neuron_configs = config.neuron + neuron_config = config.neuron # Fetch compiler information - compiler_type = neuron_configs.get("compiler_type") - compiler_version = neuron_configs.get("compiler_version") + compiler_type = neuron_config.get("compiler_type") + compiler_version = neuron_config.get("compiler_version") # Fetch mandatory shapes from config compile_shapes = { @@ -408,16 +408,14 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronConfig": # Neuron config constructuor task = getattr(config, "task") or TasksManager.infer_task_from_model(cls.auto_model_class) task = TasksManager.map_from_synonym(task) - model_type = neuron_configs.get("model_type", None) - if not (model_type and model_type != "None"): - model_type = config.model_type + model_type = neuron_config.get("model_type", None) or config.model_type neuron_config_constructor = TasksManager.get_exporter_config_constructor( model_type=model_type, exporter="neuron", task=task ) return neuron_config_constructor( config, - dynamic_batch_size=neuron_configs.get("dynamic_batch_size", False), + dynamic_batch_size=neuron_config.get("dynamic_batch_size", False), compiler_type=compiler_type, compiler_version=compiler_version, **compile_shapes, diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 3e6a4f45d..3d42a7129 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -26,12 +26,9 @@ import torch from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig -from transformers.generation.logits_process import ( - LogitsProcessorList, -) -from transformers.generation.stopping_criteria import ( - StoppingCriteriaList, -) +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.utils import ModelOutput from ..exporters.neuron import ( NeuronConfig, @@ -51,7 +48,6 @@ if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel - from transformers.utils import ModelOutput if is_neuronx_available(): import torch_neuronx @@ -357,7 +353,7 @@ 
def forward( return_dict: bool = False, output_attentions: bool = False, output_hidden_states: bool = False, - ) -> Union[Tuple[torch.FloatTensor], "ModelOutput"]: + ) -> Union[Tuple[torch.FloatTensor], ModelOutput]: hidden_states = encoder_outputs["last_hidden_state"] if not hasattr(self, "beam_idx"): @@ -424,7 +420,10 @@ def generate( past_key_values = self.encoder(**inputs) decoder_attention_mask = torch.cat( - [torch.zeros((batch_size, max_length - 1), dtype=torch.int64), torch.ones((1, 1), dtype=torch.int64)], + [ + torch.zeros((batch_size, max_length - 1), dtype=torch.int64), + torch.ones((batch_size, 1), dtype=torch.int64), + ], axis=1, ) diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index 4798136e1..d910cd074 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -176,13 +176,15 @@ def store_compilation_config( config_args["output_names"] = output_names original_model_type = getattr(config, "model_type", None) - neuron_model_type = str(model_type).replace("_", "-") + neuron_model_type = str(model_type).replace("_", "-") if model_type is not None else model_type if original_model_type is None: update_func( "model_type", neuron_model_type ) # Add model_type to the config if it doesn't exist before, eg. submodel of Stable Diffusion. - elif neuron_model_type != original_model_type: - config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. + else: + config_args["model_type"] = ( + neuron_model_type or original_model_type + ) # Prioritize Neuron custom model_type, eg. `t5-encoder`. # Add args of optional outputs config_args["output_attentions"] = output_attentions diff --git a/tests/generation/test_generate.py b/tests/generation/test_generate.py index f50b0fb59..1f7630b4d 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -78,9 +78,7 @@ def test_seq2seq_generation_beam(neuron_seq2seq_beam_path): # 3. max new tokens output = model.generate(**inputs, num_return_sequences=2, max_new_tokens=5) - assert len(output[0].unique()) <= 5 - - return output + assert len(output[0].unique()) <= 5 + 1 # +1 for `decoder_start_token_id` @is_inferentia_test @@ -104,8 +102,6 @@ def test_seq2seq_generation_beam_with_optional_outputs(neuron_seq2seq_beam_path_ assert "cross_attentions" in output assert "decoder_hidden_states" in output - return output - @is_inferentia_test @requires_neuronx @@ -124,9 +120,7 @@ def test_seq2seq_generation_greedy(neuron_seq2seq_greedy_path): # 3. 
max new tokens output = model.generate(**inputs, num_return_sequences=1, max_new_tokens=5) - assert len(output[0].unique()) <= 5 - - return output + assert len(output[0]) <= 5 + 1 # +1 for `decoder_start_token_id` @is_inferentia_test @@ -147,5 +141,3 @@ def test_seq2seq_generation_greedy_with_optional_outputs(neuron_seq2seq_greedy_p assert "decoder_attentions" in output assert "cross_attentions" in output assert "decoder_hidden_states" in output - - return output From 3b07ba1dd2c827e9e671f7bda35e4d3c775d4e10 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Sat, 2 Dec 2023 08:52:13 +0000 Subject: [PATCH 30/30] fix tests --- tests/generation/test_export.py | 4 ++-- tests/generation/test_hub.py | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/generation/test_export.py b/tests/generation/test_export.py index 32c53c4a4..fb69f2a88 100644 --- a/tests/generation/test_export.py +++ b/tests/generation/test_export.py @@ -71,6 +71,6 @@ def test_seq2seq_export(export_seq2seq_id, batch_size, sequence_length, num_beam @is_inferentia_test @requires_neuronx -def test_seq2seq_model_from_path(neuron_seq2seq_path): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) +def test_seq2seq_model_from_path(neuron_seq2seq_greedy_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) return model diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py index ff8e90615..7e1faa196 100644 --- a/tests/generation/test_hub.py +++ b/tests/generation/test_hub.py @@ -61,17 +61,19 @@ def test_seq2seq_model_from_hub(): @is_inferentia_test @requires_neuronx -def test_push_seq2seq_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) - model.push_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) +def test_push_seq2seq_to_hub(neuron_seq2seq_greedy_path, neuron_push_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) + model.push_to_hub( + neuron_seq2seq_greedy_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING + ) api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) try: hub_files_info = api.list_files_info(neuron_push_seq2seq_id) hub_files_path = [info.rfilename for info in hub_files_info] - for path, _, files in os.walk(neuron_seq2seq_path): + for path, _, files in os.walk(neuron_seq2seq_greedy_path): for name in files: local_file_path = os.path.join(path, name) - hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_path) + hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_greedy_path) assert hub_file_path in hub_files_path finally: api.delete_repo(neuron_push_seq2seq_id)
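
For reference, a minimal end-to-end usage sketch of the optional-outputs feature introduced by this series, distilled from the fixtures and generation tests added above. The checkpoint name, shapes, and beam settings below are illustrative assumptions taken from the tiny test models, not a prescribed configuration.

from transformers import AutoTokenizer
from optimum.neuron import NeuronModelForSeq2SeqLM

# Export a seq2seq model with attentions and hidden states compiled into the decoder
# (mirrors the `neuron_seq2seq_beam_path_with_optional_outputs` fixture; shapes are illustrative).
model = NeuronModelForSeq2SeqLM.from_pretrained(
    "hf-internal-testing/tiny-random-t5",
    export=True,
    batch_size=1,
    sequence_length=64,
    num_beams=4,
    output_attentions=True,
    output_hidden_states=True,
)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

inputs = tokenizer("translate English to German: Let's eat good food.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_length=20,
    output_attentions=True,
    output_hidden_states=True,
    return_dict_in_generate=True,
)

# With `return_dict_in_generate=True`, the returned ModelOutput exposes the optional tensors
# that the compiled decoder now emits, as asserted in the new generation tests.
print(outputs.decoder_attentions)
print(outputs.cross_attentions)
print(outputs.decoder_hidden_states)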