From 64f12e3a066841675269ac831a6dba37cb961e69 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 19 Oct 2023 17:29:48 +0000 Subject: [PATCH 01/30] init --- optimum/exporters/neuron/config.py | 72 +++++++- optimum/exporters/neuron/model_configs.py | 30 ++-- optimum/exporters/neuron/model_wrappers.py | 32 ++++ optimum/neuron/modeling_seq2seq.py | 199 +++++++++++++++++++++ optimum/neuron/utils/__init__.py | 2 + optimum/neuron/utils/constant.py | 2 + 6 files changed, 318 insertions(+), 19 deletions(-) create mode 100644 optimum/exporters/neuron/model_wrappers.py create mode 100644 optimum/neuron/modeling_seq2seq.py diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 0e3d61bc8..1564de82d 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -16,14 +16,17 @@ Common Neuron configuration classes that handle most of the features for building model specific configurations. """ - +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union from ...utils import ( + DummyInputGenerator, DummyBboxInputGenerator, DummyTextInputGenerator, + DummySeq2SeqDecoderTextInputGenerator, + DummySeq2SeqPastKeyValuesGenerator, DummyVisionInputGenerator, logging, ) -from .base import NeuronConfig, NeuronDecoderConfig +from .base import NeuronConfig, NeuronDecoderConfig, NeuronSeq2SeqConfigWithPast logger = logging.get_logger(__name__) @@ -61,3 +64,68 @@ class TextNeuronDecoderConfig(NeuronDecoderConfig): """ pass + + +class TextSeq2SeqNeuronConfig(NeuronConfig): + """ + Handles encoder-decoder-based text architectures. + """ + + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTextInputGenerator, + DummySeq2SeqDecoderTextInputGenerator, + DummySeq2SeqPastKeyValuesGenerator, + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = [] + # encoder + decoder without past + if "encoder" in self.MODEL_TYPE: + common_inputs = ["input_ids", "attention_mask"] + + # decoder with past + if "decoder" in self.MODEL_TYPE: + common_inputs = [ + "decoder_input_ids", + "decoder_attention_mask", + "encoder_hidden_states", + "encoder_attention_mask", + "beam_idx", + "beam_scores", + ] + + return common_inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + # encoder + decoder without past + if "encoder" in self.MODEL_TYPE: + common_outputs = ["past_key_values"] + # decoder with past + if "decoder" in self.MODEL_TYPE: + common_outputs = ["next_tokens", ""] + return common_outputs + + def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: + dummy_text_input_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0]( + self.task, self._normalized_config, **kwargs + ) + dummy_decoder_text_input_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[1]( + self.task, + self._normalized_config, + **kwargs, + ) + dummy_seq2seq_past_key_values_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[2]( + self.task, + self._normalized_config, + encoder_sequence_length=dummy_text_input_generator.sequence_length, + **kwargs, + ) + dummy_inputs_generators = [ + dummy_text_input_generator, + dummy_decoder_text_input_generator, + dummy_seq2seq_past_key_values_generator, + ] + + return dummy_inputs_generators \ No newline at end of file diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index d603d7379..31cbae6f9 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -35,6 +35,9 @@ TextNeuronDecoderConfig, 
VisionNeuronConfig, ) +from .model_wrappers import ( + UnetNeuronWrapper, +) if TYPE_CHECKING: @@ -278,26 +281,10 @@ def generate_dummy_inputs(self, return_tuple: bool = False, **kwargs): else: return dummy_inputs - class ModelWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, sample, timestep, encoder_hidden_states, text_embeds=None, time_ids=None): - out_tuple = self.model( - sample, - timestep.float().expand((sample.shape[0],)), - encoder_hidden_states, - added_cond_kwargs={"text_embeds": text_embeds, "time_ids": time_ids}, - return_dict=False, - ) - - return out_tuple - def check_model_inputs_order(self, model, dummy_inputs): return super().check_model_inputs_order( model=model, - custom_model_wrapper=self.ModelWrapper, + custom_model_wrapper=UnetNeuronWrapper, ) @@ -372,3 +359,12 @@ class GPT2NeuronConfig(TextNeuronDecoderConfig): @register_in_tasks_manager("llama", "text-generation") class LLamaNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "llama.model.LlamaForSampling" + + +@register_in_tasks_manager("t5", "text2text-generation") +class T5EncoderNeuronConfig(TextNeuronDecoderConfig): + ATOL_FOR_VALIDATION = 1e-3 + MANDATORY_AXES = ("batch_size", "sequence_length") + MODEL_TYPE = "t5-encoder" + + diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py new file mode 100644 index 000000000..d7f9d0ade --- /dev/null +++ b/optimum/exporters/neuron/model_wrappers.py @@ -0,0 +1,32 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model wrappers for Neuron export.""" +import torch + +class UnetNeuronWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, sample, timestep, encoder_hidden_states, text_embeds=None, time_ids=None): + out_tuple = self.model( + sample, + timestep.float().expand((sample.shape[0],)), + encoder_hidden_states, + added_cond_kwargs={"text_embeds": text_embeds, "time_ids": time_ids}, + return_dict=False, + ) + + return out_tuple \ No newline at end of file diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py new file mode 100644 index 000000000..555e77f56 --- /dev/null +++ b/optimum/neuron/modeling_seq2seq.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from abc import abstractmethod +from pathlib import Path +from tempfile import TemporaryDirectory +import torch +from transformers import AutoModelForSeq2SeqLM +from .modeling_base import NeuronBaseModel, NeuronConfig +from .generation import NeuronGenerationMixin +from .utils import ( + ENCODER_NAME, + DECODER_NAME, + NEURON_FILE_NAME, + is_neuronx_available, +) + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel + +if is_neuronx_available(): + torch_neuronx + + +class NeuronModelForConditionalGeneration(NeuronBaseModel): + base_model_prefix = "neuron_model" + + def __init__( + self, + encoder: torch.jit._script.ScriptModule, + decoder: torch.jit._script.ScriptModule, + config: "PretrainedConfig", + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + model_file_name: Optional[str] = None, + preprocessors: Optional[List] = None, + neuron_config: Optional["NeuronConfig"] = None, + **kwargs, + ): + pass + + @staticmethod + def load_model( + encoder_path: Union[str, Path], + decoder_path: Optional[Union[str, Path]] = None, + device_ids: Optional[List[int]] = None, + dynamic_batch_size: bool = False, + ): + pass + + def _save_pretrained( + self, + save_directory: Union[str, Path], + encoder_file_name: str = NEURON_FILE_NAME, + decoder_file_name: str = NEURON_FILE_NAME, + ): + """ + Saves the model encoder and decoder as well as their configuration files to a + directory, so that it can be re-loaded using the + [`~optimum.neuron.modeling_seq2seq.NeuronModelForSeq2SeqLM.from_pretrained`] class method. + + Args: + save_directory (`Union[str, Path`]): + The directory where to save the model files. + """ + pass + + @classmethod + def _from_pretrained( + cls, + model_id: Union[str, Path], + config: Dict[str, Any], + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + encoder_file_name: Optional[str] = NEURON_FILE_NAME, + decoder_file_name: Optional[str] = NEURON_FILE_NAME, + subfolder: str = "", + local_files_only: bool = False, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + device_ids: Optional[List[int]] = None, + **kwargs, + ): + pass + + @classmethod + def _from_transformers( + cls, + model_id: str, + config: "PretrainedConfig", + use_auth_token: Optional[Union[bool, str]] = None, + revision: str = "main", + force_download: bool = True, + cache_dir: Optional[str] = None, + subfolder: str = "", + local_files_only: bool = False, + trust_remote_code: bool = False, + task: Optional[str] = None, + auto_cast: Optional[str] = "matmul", + auto_cast_type: Optional[str] = "bf16", + disable_fast_relayout: Optional[bool] = False, + disable_fallback: bool = False, + dynamic_batch_size: bool = False, + device_ids: Optional[List[int]] = None, + ) -> "NeuronModelForConditionalGeneration": + pass + + +class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): + auto_model_class = AutoModelForSeq2SeqLM + main_input_name = "input_ids" + + + +class _NeuronSeq2SeqModelPart: + """ + For Seq2Seq architecture, we usually compile it to multiple neuron models. Each represents a part of the model. 
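+    For T5, for instance, the "encoder" Neuron model actually covers the encoder together with the initialization
+    of the decoder KV cache, while the "decoder" Neuron model covers the decoder with past key values (see
+    `NeuronEncoder` and `NeuronDecoder` below).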
+ """ + + def __init__( + self, + model: torch.jit._script.ScriptModule, + parent_model: NeuronBaseModel, + config: Optional["PretrainedConfig"] = None, + neuron_config: Optional["NeuronConfig"] = None, + model_type: str = "encoder", + device: Optional[int] = None, + ): + self.model = model + self.parent_model = parent_model + self.config = config + self.neuron_config = neuron_config + self.model_type = model_type + self.device = device + + @abstractmethod + def forward(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class NeuronEncoder(_NeuronSeq2SeqModelPart): + """ + Encoder part of the encoder-decoder model for Neuron inference. (Actually it's a monolith of encoder + decoder without past_key_values to workaround the control flow in the decoder). + """ + def __init__( + self, + model: torch.jit._script.ScriptModule, + parent_model: NeuronBaseModel, + config: Optional["PretrainedConfig"] = None, + neuron_config: Optional[Dict[str, str]] = None, + ): + super().__init__(model, parent_model, config, neuron_config, "encoder") + + def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor): + inputs = (input_ids, attention_mask, ) + outputs = self.model(*inputs) + return outputs + +class NeuronDecoder(_NeuronSeq2SeqModelPart): + """ + Decoder part of the encoder-decoder model for Neuron inference. (Actually it's decoder with past_key_values). + """ + def __init__( + self, + model: torch.jit._script.ScriptModule, + parent_model: NeuronBaseModel, + config: Optional["PretrainedConfig"] = None, + neuron_config: Optional[Dict[str, str]] = None, + ): + super().__init__(model, parent_model, config, neuron_config, "decoder") + + def forward( + self, + input_ids: torch.LongTensor, + decoder_attention_mask: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + encoder_attention_mask: torch.FloatTensor, + beam_idx: torch.LongTensor, + beam_scores: torch.FloatTensor, + ): + inputs = (input_ids, decoder_attention_mask, encoder_hidden_states, encoder_attention_mask, beam_idx, beam_scores) + outputs = self.model(*inputs) + return outputs \ No newline at end of file diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 559f501c3..8eee6dbe9 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -21,6 +21,8 @@ DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME, + ENCODER_NAME, + DECODER_NAME, ) from .import_utils import ( is_accelerate_available, diff --git a/optimum/neuron/utils/constant.py b/optimum/neuron/utils/constant.py index 7719ce8a2..edc6eebb8 100644 --- a/optimum/neuron/utils/constant.py +++ b/optimum/neuron/utils/constant.py @@ -15,6 +15,8 @@ """Constants used as default values.""" NEURON_FILE_NAME = "model.neuron" +ENCODER_NAME = "encoder" +DECODER_NAME = "decoder" DIFFUSION_MODEL_TEXT_ENCODER_NAME = "text_encoder" DIFFUSION_MODEL_TEXT_ENCODER_2_NAME = "text_encoder_2" DIFFUSION_MODEL_UNET_NAME = "unet" From aa5a3794f1cacda333449370a733d7cac4a6e56f Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 20 Oct 2023 22:11:01 +0000 Subject: [PATCH 02/30] update wrappers --- optimum/exporters/neuron/base.py | 4 +- optimum/exporters/neuron/convert.py | 4 +- optimum/exporters/neuron/model_configs.py | 28 ++- optimum/exporters/neuron/model_wrappers.py | 215 ++++++++++++++++++++- optimum/exporters/neuron/utils.py | 48 ++++- 5 files changed, 290 insertions(+), 9 deletions(-) diff --git 
a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 57ececa61..8255b1b4e 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -119,6 +119,7 @@ def __init__( audio_sequence_length: Optional[int] = None, point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, + num_beams: Optional[int] = None, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", @@ -147,6 +148,7 @@ def __init__( "audio_sequence_length": audio_sequence_length, "point_batch_size": point_batch_size, "nb_points_per_image": nb_points_per_image, + "num_beams": num_beams, } input_shapes = {} for name, value in axes_values.items(): @@ -290,7 +292,7 @@ def flatten_inputs(cls, inputs: Dict[str, Any]) -> Dict[str, Any]: flatten[name] = value return flatten - def check_model_inputs_order( + def patch_model( self, model: "PreTrainedModel", dummy_inputs: Optional[Dict[str, torch.Tensor]] = None, diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index c19d19530..70322634e 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -424,7 +424,7 @@ def export_neuronx( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs = config.flatten_inputs(dummy_inputs) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.check_model_inputs_order(model, dummy_inputs) + checked_model = config.patch_model(model, dummy_inputs) if auto_cast is not None: logger.info(f"Using Neuron: --auto-cast {auto_cast}") @@ -533,7 +533,7 @@ def export_neuron( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.check_model_inputs_order(model, dummy_inputs) + checked_model = config.patch_model(model, dummy_inputs) compiler_args = convert_neuronx_compiler_args_to_neuron(auto_cast, auto_cast_type, disable_fast_relayout) neuron_model = neuron.trace( diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 31cbae6f9..6c6ad0291 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -37,6 +37,8 @@ ) from .model_wrappers import ( UnetNeuronWrapper, + T5EncoderWrapper, + T5DecoderWrapper, ) @@ -281,8 +283,8 @@ def generate_dummy_inputs(self, return_tuple: bool = False, **kwargs): else: return dummy_inputs - def check_model_inputs_order(self, model, dummy_inputs): - return super().check_model_inputs_order( + def patch_model(self, model, dummy_inputs): + return super().patch_model( model=model, custom_model_wrapper=UnetNeuronWrapper, ) @@ -342,13 +344,13 @@ def inputs(self) -> List[str]: def outputs(self) -> List[str]: return ["sample"] - def check_model_inputs_order( + def patch_model( self, model: "VaeDecoder", dummy_inputs: Dict[str, torch.Tensor], **kwargs, ): - return super().check_model_inputs_order(model=model, dummy_inputs=dummy_inputs, forward_with_tuple=True) + return super().patch_model(model=model, dummy_inputs=dummy_inputs, forward_with_tuple=True) @register_in_tasks_manager("gpt2", "text-generation") @@ -367,4 +369,22 @@ class T5EncoderNeuronConfig(TextNeuronDecoderConfig): MANDATORY_AXES = ("batch_size", "sequence_length") MODEL_TYPE = "t5-encoder" + def patch_model(self, model, num_beams=1): + return super().patch_model( + model=model, + custom_model_wrapper=T5EncoderWrapper, + custom_wrapper_kwargs={"num_beams": num_beams} + ) + 
+@register_in_tasks_manager("t5", "text2text-generation") +class T5DecoderNeuronConfig(TextNeuronDecoderConfig): + ATOL_FOR_VALIDATION = 1e-3 + MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") + MODEL_TYPE = "t5-decoder" + + def patch_model(self, model, dummy_inputs): + return super().patch_model( + model=model, + custom_model_wrapper=T5DecoderWrapper, + ) \ No newline at end of file diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index d7f9d0ade..5eed4891f 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -13,8 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model wrappers for Neuron export.""" +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union import torch +from transformers.models.t5.modeling_t5 import T5Stack, T5LayerCrossAttention + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + + class UnetNeuronWrapper(torch.nn.Module): def __init__(self, model): super().__init__() @@ -29,4 +37,209 @@ def forward(self, sample, timestep, encoder_hidden_states, text_embeds=None, tim return_dict=False, ) - return out_tuple \ No newline at end of file + return out_tuple + + +# Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html +class T5EncoderWrapper(torch.nn.Module): + """Wrapper to trace the encoder and the kv cache initialization in the decoder.""" + def __init__( + self, + model: "PreTrainedModel" , + num_beams: int = 1, # defaults to greedy search + tp_degree=None, + ): + super().__init__() + self.model = model + self.config = model.config + self.num_beams = num_beams + self.device = "xla" + self.tp_degree = tp_degree + + def forward(self, input_ids, attention_mask): + # Infer shapes + batch_size = input_ids.shape[0] + sequence_length = input_ids.shape[1] + + encoder_output = self.model.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=False, + output_hidden_states=False + ) + + last_hidden_state = encoder_output["last_hidden_state"] + encoder_hidden_states = torch.concat([tensor.unsqueeze(0).repeat(self.num_beams, 1, 1) for tensor in last_hidden_state]) + + decoder_blocks = self.model.decoder.block + present_key_value_states_sa = [] + present_key_value_states_ca = [] + + for block in decoder_blocks: + + # Cross attention has to be initialized with the encoder hidden state + cross_attention: T5LayerCrossAttention = block.layer[1] + attention = cross_attention.EncDecAttention + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.config.num_heads, attention.key_value_proj_dim).transpose(1, 2) + + key_states = shape(attention.k(encoder_hidden_states)) + value_states = shape(attention.v(encoder_hidden_states)) + + # cross_attn_kv_state + present_key_value_states_ca.append(key_states) + present_key_value_states_ca.append(value_states) + + # Self attention kv states are initialized to zeros. This is done to keep the size of the kv cache tensor constant. + # The kv cache is padded here to keep a fixed shape. 
+ # [key states] + present_key_value_states_sa.append(torch.zeros(( + batch_size, + self.config.num_heads, + sequence_length-1, + self.config.d_kv), dtype=torch.float32, device=self.device)) + # [value states] + present_key_value_states_sa.append(torch.zeros(( + batch_size, + self.config.num_heads, + sequence_length-1, + self.config.d_kv), dtype=torch.float32, device=self.device)) + + return present_key_value_states_sa + present_key_value_states_ca + + +# Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html +class T5DecoderWrapper(torch.nn.Module): + """Wrapper to trace the decoder with past with a language head.""" + def __init__(self, + model: "PreTrainedModel" , + num_beams: int, + sequence_length: int, + tp_degree=None): + super().__init__() + self.model = model + self.config = model.config + self.num_beams = num_beams + self.device = "xla" + self.tp_degree = tp_degree + + # Initialize KV cache (num_beams, n_heads, seq_length, dim_per_head) + self.past_key_values_sa = torch.nn.ParameterList( + [torch.nn.Parameter( + torch.ones( + (num_beams, self.config.num_heads, sequence_length - 1, self.config.d_kv), + dtype=torch.float32 + ), + requires_grad=False + ) for _ in range(self.config.num_decoder_layers * 2)] + ) + self.past_key_values_ca = torch.nn.ParameterList( + [torch.nn.Parameter( + torch.ones( + (num_beams, self.config.num_heads, sequence_length, self.config.d_kv), + dtype=torch.float32 + ), + requires_grad=False + ) for _ in range(self.config.num_decoder_layers * 2)] + ) + + def update_past(self, past_key_values): + new_past_sa = [] + new_past_ca = [] + for past_layer in past_key_values: + new_past_layer = list(past_layer) + for i in range(len(new_past_layer[:2])): + new_past_layer[i] = past_layer[i][:, :, 1:] + new_past_sa += [new_past_layer[:2],] + new_past_ca += [new_past_layer[2:],] + return new_past_sa, new_past_ca + + def reorder_cache(self, past_key_values, beam_idx): + for i in range(len(past_key_values)): + gather_index = beam_idx.view([beam_idx.shape[0],1,1,1]).expand_as(past_key_values[i]) + past_key_values[i] = torch.gather(past_key_values[i], dim = 0, index=gather_index) + return past_key_values + + def forward( + self, + input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + beam_idx, + beam_scores, + **kwargs + ): + # Infer shapes + batch_size = input_ids.shape[0] or 1 + + if self.num_beams > 1: + # We reorder the cache based on the beams selected in each iteration. Required step for beam search. + past_key_values_sa = self.reorder_cache(self.past_key_values_sa, beam_idx) + past_key_values_ca = self.reorder_cache(self.past_key_values_ca, beam_idx) + else: + # We do not need to reorder for greedy sampling + past_key_values_sa = self.past_key_values_sa + past_key_values_ca = self.past_key_values_ca + + # The cache is stored in a flatten form. We order the cache per layer before passing it to the decoder. + # Each layer has 4 tensors, so we group by 4. 
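+        # For example, with 2 decoder layers the flat caches
+        #     past_key_values_sa = [sa_key_0, sa_value_0, sa_key_1, sa_value_1]
+        #     past_key_values_ca = [ca_key_0, ca_value_0, ca_key_1, ca_value_1]
+        # are regrouped below into
+        #     [[sa_key_0, sa_value_0, ca_key_0, ca_value_0], [sa_key_1, sa_value_1, ca_key_1, ca_value_1]]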
+ past_key_values = [[*past_key_values_sa[i*2:i*2+2], *past_key_values_ca[i*2:i*2+2]] for i in range(0, int(len(past_key_values_ca)/2))] + + decoder_output = self.model.decoder( + input_ids=input_ids, + attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + output_attentions=False, + output_hidden_states=False) + + last_hidden_state = decoder_output['last_hidden_state'] + past_key_values = decoder_output['past_key_values'] + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + last_hidden_state = last_hidden_state * (self.model.config.d_model**-0.5) + + lm_logits = self.model.lm_head(last_hidden_state) + + past_key_values_sa, past_key_values_ca = self.update_past(past_key_values) + + # We flatten the cache to a single array. This is required for the input output aliasing to work + past_key_values_sa = [vec for kv_per_layer in past_key_values_sa for vec in kv_per_layer] + past_key_values_ca = [vec for kv_per_layer in past_key_values_ca for vec in kv_per_layer] + + # We calculate topk inside the wrapper + next_token_logits = lm_logits[:, -1, :] + + if self.num_beams > 1: + # This section of beam search is run outside the decoder in the huggingface t5 implementation. + # To maximize the computation within the neuron device, we move this within the wrapper + logit_max, _ = torch.max(next_token_logits, dim=-1, keepdim=True) + logsumexp = torch.log(torch.exp(next_token_logits - logit_max).sum(dim=-1, keepdim=True)) + next_token_scores = next_token_logits - logit_max - logsumexp + next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, self.num_beams * vocab_size) + next_token_scores = next_token_scores * 1 + + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search) + next_token_scores, next_tokens = torch.topk( + next_token_scores, 2 * self.num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + next_tokens = next_tokens % vocab_size + + return [next_token_scores, next_tokens, next_indices] + past_key_values_sa + past_key_values_ca + else: + # Greedy + next_tokens = torch.argmax(next_token_logits, dim=-1) + return [next_tokens] + past_key_values_sa + past_key_values_ca \ No newline at end of file diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 6b69429cf..ead9ab71f 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -156,7 +156,7 @@ def get_stable_diffusion_models_for_export( Whether the Neuron compiled model supports dynamic batch size. Returns: - `Dict[str, Tuple[Union[`PreTrainedModel`, `ModelMixin`], `NeuronConfig`]: A Dict containing the model and + `Dict[str, Tuple[Union[`PreTrainedModel`, `ModelMixin`], `NeuronConfig`]`: A Dict containing the model and Neuron configs for the different components of the model. 
""" models_for_export = _get_submodels_for_export_stable_diffusion(pipeline=pipeline, task=task) @@ -320,3 +320,49 @@ def override_diffusers_2_0_attn_processors(model): elif isinstance(submodule.processor, AttnAddedKVProcessor2_0): submodule.set_processor(AttnAddedKVProcessor()) return model + + +def get_encoder_decoder_models_for_export( + model: "PreTrainedModel", + encoder_input_shapes: Dict[str, int], + decoder_input_shapes: Dict[str, int], + dynamic_batch_size: Optional[bool] = False, +) -> Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]: + """ + Returns the components of an encoder-decoder model and their subsequent neuron configs. + The encoder includes the compute of encoder hidden states and the initialization of KV + cache. The decoder the autoprogressive process of generating tokens, which takes past + key values as inputs to save the compute. + + Args: + model ("PreTrainedModel"): + The model to export. + encoder_input_shapes (`Dict[str, int]`): + Static shapes used for compiling the encoder. + decoder_input_shapes (`Dict[str, int]`): + Static shapes used for compiling the decoder. + dynamic_batch_size (`bool`, defaults to `False`): + Whether the Neuron compiled model supports dynamic batch size. + + Returns: + `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and + Neuron configs for the different components of the model. + """ + # Encoder + encoder = {"encoder": model.encoder, "decoder": model.decoder} + encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="neuron", task="feature-extraction" + ) + encoder_neuron_config = encoder_config_constructor( + text_encoder.config, + task="feature-extraction", + dynamic_batch_size=dynamic_batch_size, + **encoder_input_shapes, + ) + models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = (text_encoder, encoder_neuron_config) + + # Decoder + decoder = {"decoder": model.decoder, "lm_head": model.lm_head} + decoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="neuron", task="feature-extraction" + ) From 658087518d33fcf29d850fb365f4ec398797217b Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 23 Oct 2023 14:14:50 +0000 Subject: [PATCH 03/30] encoder support --- optimum/exporters/neuron/base.py | 4 +- optimum/exporters/neuron/config.py | 29 +++-- optimum/exporters/neuron/convert.py | 4 +- optimum/exporters/neuron/model_configs.py | 70 +++++++++--- optimum/exporters/neuron/model_wrappers.py | 121 +++++++++++---------- optimum/exporters/neuron/utils.py | 31 ++++-- optimum/neuron/modeling_seq2seq.py | 53 +++++---- optimum/neuron/utils/__init__.py | 4 +- 8 files changed, 190 insertions(+), 126 deletions(-) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 8255b1b4e..6414996f8 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -119,7 +119,6 @@ def __init__( audio_sequence_length: Optional[int] = None, point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, - num_beams: Optional[int] = None, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", @@ -148,7 +147,6 @@ def __init__( "audio_sequence_length": audio_sequence_length, "point_batch_size": point_batch_size, "nb_points_per_image": nb_points_per_image, - "num_beams": num_beams, } input_shapes = {} for name, value in axes_values.items(): @@ -292,7 +290,7 @@ def flatten_inputs(cls, inputs: Dict[str, Any]) -> 
Dict[str, Any]: flatten[name] = value return flatten - def patch_model( + def patch_model_for_export( self, model: "PreTrainedModel", dummy_inputs: Optional[Dict[str, torch.Tensor]] = None, diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 1564de82d..e9cbc37fe 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -16,17 +16,18 @@ Common Neuron configuration classes that handle most of the features for building model specific configurations. """ -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, List + from ...utils import ( - DummyInputGenerator, DummyBboxInputGenerator, - DummyTextInputGenerator, + DummyInputGenerator, DummySeq2SeqDecoderTextInputGenerator, DummySeq2SeqPastKeyValuesGenerator, + DummyTextInputGenerator, DummyVisionInputGenerator, logging, ) -from .base import NeuronConfig, NeuronDecoderConfig, NeuronSeq2SeqConfigWithPast +from .base import NeuronConfig, NeuronDecoderConfig logger = logging.get_logger(__name__) @@ -70,7 +71,7 @@ class TextSeq2SeqNeuronConfig(NeuronConfig): """ Handles encoder-decoder-based text architectures. """ - + DUMMY_INPUT_GENERATOR_CLASSES = ( DummyTextInputGenerator, DummySeq2SeqDecoderTextInputGenerator, @@ -87,26 +88,24 @@ def inputs(self) -> Dict[str, Dict[int, str]]: # decoder with past if "decoder" in self.MODEL_TYPE: common_inputs = [ - "decoder_input_ids", - "decoder_attention_mask", - "encoder_hidden_states", + "decoder_input_ids", + "decoder_attention_mask", + "encoder_hidden_states", "encoder_attention_mask", - "beam_idx", - "beam_scores", ] return common_inputs - + @property def outputs(self) -> Dict[str, Dict[int, str]]: # encoder + decoder without past if "encoder" in self.MODEL_TYPE: - common_outputs = ["past_key_values"] + common_outputs = ["present_key_values_self_attn", "past_key_values_cross_attn"] # decoder with past if "decoder" in self.MODEL_TYPE: - common_outputs = ["next_tokens", ""] + common_outputs = ["next_tokens", "past_key_values_self_attn", "past_key_values_cross_attn"] return common_outputs - + def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: dummy_text_input_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0]( self.task, self._normalized_config, **kwargs @@ -128,4 +127,4 @@ def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGen dummy_seq2seq_past_key_values_generator, ] - return dummy_inputs_generators \ No newline at end of file + return dummy_inputs_generators diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 70322634e..bd7a894bc 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -424,7 +424,7 @@ def export_neuronx( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs = config.flatten_inputs(dummy_inputs) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.patch_model(model, dummy_inputs) + checked_model = config.patch_model_for_export(model, dummy_inputs) if auto_cast is not None: logger.info(f"Using Neuron: --auto-cast {auto_cast}") @@ -533,7 +533,7 @@ def export_neuron( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.patch_model(model, dummy_inputs) + checked_model = config.patch_model_for_export(model, dummy_inputs) compiler_args = convert_neuronx_compiler_args_to_neuron(auto_cast, auto_cast_type, 
disable_fast_relayout) neuron_model = neuron.trace( diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 6c6ad0291..8edabd177 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -25,6 +25,7 @@ DummyVisionInputGenerator, NormalizedConfig, NormalizedConfigManager, + NormalizedSeq2SeqConfig, NormalizedTextAndVisionConfig, is_diffusers_available, ) @@ -33,12 +34,13 @@ TextAndVisionNeuronConfig, TextEncoderNeuronConfig, TextNeuronDecoderConfig, + TextSeq2SeqNeuronConfig, VisionNeuronConfig, ) from .model_wrappers import ( - UnetNeuronWrapper, - T5EncoderWrapper, T5DecoderWrapper, + T5EncoderWrapper, + UnetNeuronWrapper, ) @@ -344,13 +346,13 @@ def inputs(self) -> List[str]: def outputs(self) -> List[str]: return ["sample"] - def patch_model( + def patch_model_for_export( self, model: "VaeDecoder", dummy_inputs: Dict[str, torch.Tensor], **kwargs, ): - return super().patch_model(model=model, dummy_inputs=dummy_inputs, forward_with_tuple=True) + return super().patch_model_for_export(model=model, dummy_inputs=dummy_inputs, forward_with_tuple=True) @register_in_tasks_manager("gpt2", "text-generation") @@ -363,28 +365,60 @@ class LLamaNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "llama.model.LlamaForSampling" -@register_in_tasks_manager("t5", "text2text-generation") -class T5EncoderNeuronConfig(TextNeuronDecoderConfig): +@register_in_tasks_manager("t5-encoder", "text2text-generation") +class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 MANDATORY_AXES = ("batch_size", "sequence_length") MODEL_TYPE = "t5-encoder" - - def patch_model(self, model, num_beams=1): - return super().patch_model( + NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( + hidden_size="d_model", + num_attention_heads="num_heads", + encoder_num_layers="num_layers", + decoder_num_layers="num_decoder_layers", + key_value_dim="d_kv", + allow_new=True, + ) + + def generate_dummy_inputs(self, **kwargs): + dummy_inputs = super().generate_dummy_inputs(**kwargs) + + return dummy_inputs + + def patch_model_for_export(self, model, num_beams=1): + return super().patch_model_for_export( model=model, custom_model_wrapper=T5EncoderWrapper, - custom_wrapper_kwargs={"num_beams": num_beams} ) - - -@register_in_tasks_manager("t5", "text2text-generation") -class T5DecoderNeuronConfig(TextNeuronDecoderConfig): + + +@register_in_tasks_manager("t5-decoder", "text2text-generation") +class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 - MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") + MANDATORY_AXES = ("batch_size", "sequence_length") MODEL_TYPE = "t5-decoder" - - def patch_model(self, model, dummy_inputs): + NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( + hidden_size="d_model", + num_attention_heads="num_heads", + encoder_num_layers="num_layers", + decoder_num_layers="num_decoder_layers", + key_value_dim="d_kv", + allow_new=True, + ) + + @property + def inputs(self) -> List[str]: + common_inputs = super().inputs() + ["beam_idx", "beam_scores"] + + return common_inputs + + def patch_model_for_export(self, model, dummy_inputs): return super().patch_model( model=model, custom_model_wrapper=T5DecoderWrapper, - ) \ No newline at end of file + ) + + def generate_io_aliases(self, model, dummy_inputs): + return super().patch_model( + model=model, + custom_model_wrapper=T5DecoderWrapper, + ) diff --git a/optimum/exporters/neuron/model_wrappers.py 
b/optimum/exporters/neuron/model_wrappers.py index 5eed4891f..3c51668e2 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model wrappers for Neuron export.""" -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union -import torch +from typing import TYPE_CHECKING -from transformers.models.t5.modeling_t5 import T5Stack, T5LayerCrossAttention +import torch +from transformers.models.t5.modeling_t5 import T5LayerCrossAttention if TYPE_CHECKING: @@ -43,16 +43,15 @@ def forward(self, sample, timestep, encoder_hidden_states, text_embeds=None, tim # Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html class T5EncoderWrapper(torch.nn.Module): """Wrapper to trace the encoder and the kv cache initialization in the decoder.""" + def __init__( - self, - model: "PreTrainedModel" , - num_beams: int = 1, # defaults to greedy search + self, + model: "PreTrainedModel", tp_degree=None, ): super().__init__() self.model = model self.config = model.config - self.num_beams = num_beams self.device = "xla" self.tp_degree = tp_degree @@ -60,23 +59,18 @@ def forward(self, input_ids, attention_mask): # Infer shapes batch_size = input_ids.shape[0] sequence_length = input_ids.shape[1] - + encoder_output = self.model.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - output_attentions=False, - output_hidden_states=False + input_ids=input_ids, attention_mask=attention_mask, output_attentions=False, output_hidden_states=False ) last_hidden_state = encoder_output["last_hidden_state"] - encoder_hidden_states = torch.concat([tensor.unsqueeze(0).repeat(self.num_beams, 1, 1) for tensor in last_hidden_state]) decoder_blocks = self.model.decoder.block present_key_value_states_sa = [] present_key_value_states_ca = [] for block in decoder_blocks: - # Cross attention has to be initialized with the encoder hidden state cross_attention: T5LayerCrossAttention = block.layer[1] attention = cross_attention.EncDecAttention @@ -85,8 +79,8 @@ def shape(states): """projection""" return states.view(batch_size, -1, self.config.num_heads, attention.key_value_proj_dim).transpose(1, 2) - key_states = shape(attention.k(encoder_hidden_states)) - value_states = shape(attention.v(encoder_hidden_states)) + key_states = shape(attention.k(last_hidden_state)) + value_states = shape(attention.v(last_hidden_state)) # cross_attn_kv_state present_key_value_states_ca.append(key_states) @@ -95,17 +89,21 @@ def shape(states): # Self attention kv states are initialized to zeros. This is done to keep the size of the kv cache tensor constant. # The kv cache is padded here to keep a fixed shape. 
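+            # For example, with sequence_length=128 the self-attention cache below is allocated with 127
+            # positions: at each decoding step the decoder appends one freshly computed position and
+            # `T5DecoderWrapper.update_past` drops the oldest one, so the cache keeps a constant shape.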
# [key states] - present_key_value_states_sa.append(torch.zeros(( - batch_size, - self.config.num_heads, - sequence_length-1, - self.config.d_kv), dtype=torch.float32, device=self.device)) + present_key_value_states_sa.append( + torch.zeros( + (batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), + dtype=torch.float32, + device=self.device, + ) + ) # [value states] - present_key_value_states_sa.append(torch.zeros(( - batch_size, - self.config.num_heads, - sequence_length-1, - self.config.d_kv), dtype=torch.float32, device=self.device)) + present_key_value_states_sa.append( + torch.zeros( + (batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), + dtype=torch.float32, + device=self.device, + ) + ) return present_key_value_states_sa + present_key_value_states_ca @@ -113,36 +111,37 @@ def shape(states): # Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html class T5DecoderWrapper(torch.nn.Module): """Wrapper to trace the decoder with past with a language head.""" - def __init__(self, - model: "PreTrainedModel" , - num_beams: int, - sequence_length: int, - tp_degree=None): + + def __init__(self, model: "PreTrainedModel", num_beams: int, sequence_length: int, tp_degree=None): super().__init__() self.model = model self.config = model.config self.num_beams = num_beams self.device = "xla" self.tp_degree = tp_degree - + # Initialize KV cache (num_beams, n_heads, seq_length, dim_per_head) self.past_key_values_sa = torch.nn.ParameterList( - [torch.nn.Parameter( - torch.ones( - (num_beams, self.config.num_heads, sequence_length - 1, self.config.d_kv), - dtype=torch.float32 - ), - requires_grad=False - ) for _ in range(self.config.num_decoder_layers * 2)] + [ + torch.nn.Parameter( + torch.ones( + (num_beams, self.config.num_heads, sequence_length - 1, self.config.d_kv), dtype=torch.float32 + ), + requires_grad=False, + ) + for _ in range(self.config.num_decoder_layers * 2) + ] ) self.past_key_values_ca = torch.nn.ParameterList( - [torch.nn.Parameter( - torch.ones( - (num_beams, self.config.num_heads, sequence_length, self.config.d_kv), - dtype=torch.float32 - ), - requires_grad=False - ) for _ in range(self.config.num_decoder_layers * 2)] + [ + torch.nn.Parameter( + torch.ones( + (num_beams, self.config.num_heads, sequence_length, self.config.d_kv), dtype=torch.float32 + ), + requires_grad=False, + ) + for _ in range(self.config.num_decoder_layers * 2) + ] ) def update_past(self, past_key_values): @@ -152,14 +151,18 @@ def update_past(self, past_key_values): new_past_layer = list(past_layer) for i in range(len(new_past_layer[:2])): new_past_layer[i] = past_layer[i][:, :, 1:] - new_past_sa += [new_past_layer[:2],] - new_past_ca += [new_past_layer[2:],] + new_past_sa += [ + new_past_layer[:2], + ] + new_past_ca += [ + new_past_layer[2:], + ] return new_past_sa, new_past_ca def reorder_cache(self, past_key_values, beam_idx): for i in range(len(past_key_values)): - gather_index = beam_idx.view([beam_idx.shape[0],1,1,1]).expand_as(past_key_values[i]) - past_key_values[i] = torch.gather(past_key_values[i], dim = 0, index=gather_index) + gather_index = beam_idx.view([beam_idx.shape[0], 1, 1, 1]).expand_as(past_key_values[i]) + past_key_values[i] = torch.gather(past_key_values[i], dim=0, index=gather_index) return past_key_values def forward( @@ -170,11 +173,11 @@ def forward( encoder_attention_mask, beam_idx, beam_scores, - **kwargs + **kwargs, ): # Infer shapes batch_size = 
input_ids.shape[0] or 1 - + if self.num_beams > 1: # We reorder the cache based on the beams selected in each iteration. Required step for beam search. past_key_values_sa = self.reorder_cache(self.past_key_values_sa, beam_idx) @@ -186,7 +189,10 @@ def forward( # The cache is stored in a flatten form. We order the cache per layer before passing it to the decoder. # Each layer has 4 tensors, so we group by 4. - past_key_values = [[*past_key_values_sa[i*2:i*2+2], *past_key_values_ca[i*2:i*2+2]] for i in range(0, int(len(past_key_values_ca)/2))] + past_key_values = [ + [*past_key_values_sa[i * 2 : i * 2 + 2], *past_key_values_ca[i * 2 : i * 2 + 2]] + for i in range(0, int(len(past_key_values_ca) / 2)) + ] decoder_output = self.model.decoder( input_ids=input_ids, @@ -196,10 +202,11 @@ def forward( encoder_attention_mask=encoder_attention_mask, use_cache=True, output_attentions=False, - output_hidden_states=False) + output_hidden_states=False, + ) - last_hidden_state = decoder_output['last_hidden_state'] - past_key_values = decoder_output['past_key_values'] + last_hidden_state = decoder_output["last_hidden_state"] + past_key_values = decoder_output["past_key_values"] if self.config.tie_word_embeddings: # Rescale output before projecting on vocab @@ -242,4 +249,4 @@ def forward( else: # Greedy next_tokens = torch.argmax(next_token_logits, dim=-1) - return [next_tokens] + past_key_values_sa + past_key_values_ca \ No newline at end of file + return [next_tokens] + past_key_values_sa + past_key_values_ca diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index ead9ab71f..d4c9a85bf 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -23,11 +23,13 @@ from transformers import PretrainedConfig from ...neuron.utils import ( + DECODER_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, DIFFUSION_MODEL_TEXT_ENCODER_NAME, DIFFUSION_MODEL_UNET_NAME, DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, + ENCODER_NAME, get_attention_scores_sd, get_attention_scores_sdxl, ) @@ -331,7 +333,7 @@ def get_encoder_decoder_models_for_export( """ Returns the components of an encoder-decoder model and their subsequent neuron configs. The encoder includes the compute of encoder hidden states and the initialization of KV - cache. The decoder the autoprogressive process of generating tokens, which takes past + cache. The decoder the autoprogressive process of generating tokens, which takes past key values as inputs to save the compute. Args: @@ -348,21 +350,32 @@ def get_encoder_decoder_models_for_export( `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and Neuron configs for the different components of the model. 
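+        For T5, for instance, the returned dict maps `"encoder"` and `"decoder"` (i.e. `ENCODER_NAME` and
+        `DECODER_NAME`) to the model and its corresponding Neuron config.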
""" + models_for_export = [] + # Encoder - encoder = {"encoder": model.encoder, "decoder": model.decoder} + model_type = getattr(model.config, "model_type") + "-encoder" encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="neuron", task="feature-extraction" + exporter="neuron", model_type=model_type, task="text2text-generation" ) encoder_neuron_config = encoder_config_constructor( - text_encoder.config, - task="feature-extraction", + config=model.config, + task="text2text-generation", dynamic_batch_size=dynamic_batch_size, **encoder_input_shapes, ) - models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = (text_encoder, encoder_neuron_config) - + models_for_export[ENCODER_NAME] = (model, encoder_neuron_config) + # Decoder - decoder = {"decoder": model.decoder, "lm_head": model.lm_head} + model_type = getattr(model.config, "model_type") + "-decoder" decoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="neuron", task="feature-extraction" + exporter="neuron", model_type=model_type, task="text2text-generation" ) + decoder_neuron_config = decoder_config_constructor( + config=model.config, + task="text2text-generation", + dynamic_batch_size=dynamic_batch_size, + **decoder_input_shapes, + ) + models_for_export[DECODER_NAME] = (model, decoder_neuron_config) + + return models_for_export diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 555e77f56..f39505cba 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -13,31 +13,32 @@ # See the License for the specific language governing permissions and # limitations under the License. """NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + import torch from transformers import AutoModelForSeq2SeqLM -from .modeling_base import NeuronBaseModel, NeuronConfig + from .generation import NeuronGenerationMixin +from .modeling_base import NeuronBaseModel, NeuronConfig from .utils import ( - ENCODER_NAME, - DECODER_NAME, NEURON_FILE_NAME, is_neuronx_available, ) + if TYPE_CHECKING: - from transformers import PretrainedConfig, PreTrainedModel + from transformers import PretrainedConfig if is_neuronx_available(): - torch_neuronx + pass class NeuronModelForConditionalGeneration(NeuronBaseModel): base_model_prefix = "neuron_model" - + def __init__( self, encoder: torch.jit._script.ScriptModule, @@ -50,7 +51,7 @@ def __init__( **kwargs, ): pass - + @staticmethod def load_model( encoder_path: Union[str, Path], @@ -59,7 +60,7 @@ def load_model( dynamic_batch_size: bool = False, ): pass - + def _save_pretrained( self, save_directory: Union[str, Path], @@ -76,7 +77,7 @@ def _save_pretrained( The directory where to save the model files. 
""" pass - + @classmethod def _from_pretrained( cls, @@ -95,7 +96,7 @@ def _from_pretrained( **kwargs, ): pass - + @classmethod def _from_transformers( cls, @@ -116,13 +117,12 @@ def _from_transformers( dynamic_batch_size: bool = False, device_ids: Optional[List[int]] = None, ) -> "NeuronModelForConditionalGeneration": - pass + pass class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" - class _NeuronSeq2SeqModelPart: @@ -158,6 +158,7 @@ class NeuronEncoder(_NeuronSeq2SeqModelPart): """ Encoder part of the encoder-decoder model for Neuron inference. (Actually it's a monolith of encoder + decoder without past_key_values to workaround the control flow in the decoder). """ + def __init__( self, model: torch.jit._script.ScriptModule, @@ -166,16 +167,21 @@ def __init__( neuron_config: Optional[Dict[str, str]] = None, ): super().__init__(model, parent_model, config, neuron_config, "encoder") - + def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor): - inputs = (input_ids, attention_mask, ) + inputs = ( + input_ids, + attention_mask, + ) outputs = self.model(*inputs) return outputs + class NeuronDecoder(_NeuronSeq2SeqModelPart): """ Decoder part of the encoder-decoder model for Neuron inference. (Actually it's decoder with past_key_values). """ + def __init__( self, model: torch.jit._script.ScriptModule, @@ -184,16 +190,23 @@ def __init__( neuron_config: Optional[Dict[str, str]] = None, ): super().__init__(model, parent_model, config, neuron_config, "decoder") - + def forward( - self, - input_ids: torch.LongTensor, + self, + input_ids: torch.LongTensor, decoder_attention_mask: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, encoder_attention_mask: torch.FloatTensor, beam_idx: torch.LongTensor, beam_scores: torch.FloatTensor, ): - inputs = (input_ids, decoder_attention_mask, encoder_hidden_states, encoder_attention_mask, beam_idx, beam_scores) + inputs = ( + input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + beam_idx, + beam_scores, + ) outputs = self.model(*inputs) - return outputs \ No newline at end of file + return outputs diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 8eee6dbe9..96af3e158 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -15,14 +15,14 @@ from .argument_utils import convert_neuronx_compiler_args_to_neuron, store_compilation_config from .constant import ( + DECODER_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, DIFFUSION_MODEL_TEXT_ENCODER_NAME, DIFFUSION_MODEL_UNET_NAME, DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, - NEURON_FILE_NAME, ENCODER_NAME, - DECODER_NAME, + NEURON_FILE_NAME, ) from .import_utils import ( is_accelerate_available, From e997f5fcbae3eceeaa9f5f6e96be6528eb4af41b Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 24 Oct 2023 23:12:27 +0000 Subject: [PATCH 04/30] decoder export --- optimum/exporters/neuron/base.py | 2 + optimum/exporters/neuron/config.py | 2 +- optimum/exporters/neuron/convert.py | 15 +++++- optimum/exporters/neuron/model_configs.py | 58 ++++++++++++++++------ optimum/exporters/neuron/model_wrappers.py | 40 ++++++++++----- optimum/neuron/utils/__init__.py | 1 + optimum/neuron/utils/input_generators.py | 45 +++++++++++++++++ 7 files changed, 132 insertions(+), 31 deletions(-) create mode 100644 optimum/neuron/utils/input_generators.py diff --git 
a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 6414996f8..a2e2eb520 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -119,6 +119,7 @@ def __init__( audio_sequence_length: Optional[int] = None, point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, + num_beams: Optional[int] = None, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", @@ -147,6 +148,7 @@ def __init__( "audio_sequence_length": audio_sequence_length, "point_batch_size": point_batch_size, "nb_points_per_image": nb_points_per_image, + "num_beams": num_beams, } input_shapes = {} for name, value in axes_values.items(): diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index e9cbc37fe..3f18d67d6 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -91,7 +91,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: "decoder_input_ids", "decoder_attention_mask", "encoder_hidden_states", - "encoder_attention_mask", + "attention_mask", # TODO: replace with `encoder_attention_mask` after optimum 1.14 release ] return common_inputs diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index bd7a894bc..2466cf9c5 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -424,7 +424,13 @@ def export_neuronx( dummy_inputs = config.generate_dummy_inputs(**input_shapes) dummy_inputs = config.flatten_inputs(dummy_inputs) dummy_inputs_tuple = tuple(dummy_inputs.values()) - checked_model = config.patch_model_for_export(model, dummy_inputs) + + aliases = {} + if model.config.is_encoder_decoder: + checked_model = config.patch_model_for_export(model, **input_shapes) + aliases = config.generate_io_aliases(checked_model) + else: + checked_model = config.patch_model_for_export(model, dummy_inputs) if auto_cast is not None: logger.info(f"Using Neuron: --auto-cast {auto_cast}") @@ -440,7 +446,12 @@ def export_neuronx( # diffusers specific compiler_args = add_stable_diffusion_compiler_args(config, compiler_args) - neuron_model = neuronx.trace(checked_model, dummy_inputs_tuple, compiler_args=compiler_args) + neuron_model = neuronx.trace( + checked_model, + dummy_inputs_tuple, + compiler_args=compiler_args, + input_output_aliases=aliases, + ) if config.dynamic_batch_size is True: neuron_model = neuronx.dynamic_batch(neuron_model) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 8edabd177..92d566323 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -19,7 +19,9 @@ import torch +from ...neuron.utils import DummyBeamValuesGenerator from ...utils import ( + DummyInputGenerator, DummySeq2SeqDecoderTextInputGenerator, DummyTimestepInputGenerator, DummyVisionInputGenerator, @@ -368,7 +370,7 @@ class LLamaNeuronConfig(TextNeuronDecoderConfig): @register_in_tasks_manager("t5-encoder", "text2text-generation") class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 - MANDATORY_AXES = ("batch_size", "sequence_length") + MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-encoder" NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="d_model", @@ -379,22 +381,20 @@ class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): allow_new=True, ) - def generate_dummy_inputs(self, **kwargs): - dummy_inputs = 
super().generate_dummy_inputs(**kwargs) - - return dummy_inputs - - def patch_model_for_export(self, model, num_beams=1): + def patch_model_for_export(self, model, **kwargs): + num_beams = kwargs.pop("num_beams", 1) return super().patch_model_for_export( model=model, custom_model_wrapper=T5EncoderWrapper, + custom_wrapper_kwargs={"num_beams": num_beams}, ) @register_in_tasks_manager("t5-decoder", "text2text-generation") class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 - MANDATORY_AXES = ("batch_size", "sequence_length") + DUMMY_INPUT_GENERATOR_CLASSES = TextSeq2SeqNeuronConfig.DUMMY_INPUT_GENERATOR_CLASSES + (DummyBeamValuesGenerator,) + MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-decoder" NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="d_model", @@ -407,18 +407,46 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): @property def inputs(self) -> List[str]: - common_inputs = super().inputs() + ["beam_idx", "beam_scores"] + common_inputs = super().inputs + ["beam_idx", "beam_scores"] return common_inputs - def patch_model_for_export(self, model, dummy_inputs): - return super().patch_model( - model=model, - custom_model_wrapper=T5DecoderWrapper, + def generate_dummy_inputs(self, **kwargs): + batch_size = kwargs.pop("batch_size") * kwargs.get("num_beams") + dummy_inputs = super().generate_dummy_inputs(batch_size=batch_size, **kwargs) + dummy_inputs["decoder_input_ids"] = dummy_inputs["decoder_input_ids"][:, :1] # sequence_length = 1 + dummy_inputs["encoder_hidden_states"] = dummy_inputs["encoder_hidden_states"][0] + + return dummy_inputs + + def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: + dummy_inputs_generators = super()._create_dummy_input_generator_classes(**kwargs) + dummy_beam_values_generator = self.DUMMY_INPUT_GENERATOR_CLASSES[-1]( + self.task, + self._normalized_config, + num_beams=kwargs.pop("num_beams", 1), + **kwargs, ) + dummy_inputs_generators.append(dummy_beam_values_generator) + return dummy_inputs_generators - def generate_io_aliases(self, model, dummy_inputs): - return super().patch_model( + def patch_model_for_export(self, model, **kwargs): + return super().patch_model_for_export( model=model, custom_model_wrapper=T5DecoderWrapper, + custom_wrapper_kwargs={ + "batch_size": kwargs.pop("batch_size", 1), + "sequence_length": kwargs.pop("sequence_length", 1), + "num_beams": kwargs.pop("num_beams", 1), + }, ) + + def generate_io_aliases(self, model): + num_outputs_from_trace = 3 if model.num_beams > 1 else 1 + aliases = {} + for i in range(len(model.past_key_values_sa)): + aliases[model.past_key_values_sa[i]] = i + num_outputs_from_trace + for i in range(len(model.past_key_values_ca)): + aliases[model.past_key_values_ca[i]] = len(model.past_key_values_sa) + i + num_outputs_from_trace + + return aliases diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 3c51668e2..8477fdd07 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -47,11 +47,13 @@ class T5EncoderWrapper(torch.nn.Module): def __init__( self, model: "PreTrainedModel", + num_beams: int = 1, tp_degree=None, ): super().__init__() self.model = model self.config = model.config + self.num_beams = num_beams self.device = "xla" self.tp_degree = tp_degree @@ -65,6 +67,9 @@ def forward(self, input_ids, attention_mask): ) last_hidden_state = encoder_output["last_hidden_state"] + 
encoder_hidden_states = torch.concat( + [tensor.unsqueeze(0).repeat(self.num_beams, 1, 1) for tensor in last_hidden_state] + ) decoder_blocks = self.model.decoder.block present_key_value_states_sa = [] @@ -77,10 +82,12 @@ def forward(self, input_ids, attention_mask): def shape(states): """projection""" - return states.view(batch_size, -1, self.config.num_heads, attention.key_value_proj_dim).transpose(1, 2) + return states.view( + self.num_beams * batch_size, -1, self.config.num_heads, attention.key_value_proj_dim + ).transpose(1, 2) - key_states = shape(attention.k(last_hidden_state)) - value_states = shape(attention.v(last_hidden_state)) + key_states = shape(attention.k(encoder_hidden_states)) + value_states = shape(attention.v(encoder_hidden_states)) # cross_attn_kv_state present_key_value_states_ca.append(key_states) @@ -91,7 +98,7 @@ def shape(states): # [key states] present_key_value_states_sa.append( torch.zeros( - (batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), + (self.num_beams * batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), dtype=torch.float32, device=self.device, ) @@ -99,7 +106,7 @@ def shape(states): # [value states] present_key_value_states_sa.append( torch.zeros( - (batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), + (self.num_beams * batch_size, self.config.num_heads, sequence_length - 1, self.config.d_kv), dtype=torch.float32, device=self.device, ) @@ -110,12 +117,15 @@ def shape(states): # Adapted from https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/torch-neuronx/t5-inference-tutorial.html class T5DecoderWrapper(torch.nn.Module): - """Wrapper to trace the decoder with past with a language head.""" + """Wrapper to trace the decoder with past keys values with a language head.""" - def __init__(self, model: "PreTrainedModel", num_beams: int, sequence_length: int, tp_degree=None): + def __init__( + self, model: "PreTrainedModel", batch_size: int, sequence_length: int, num_beams: int = 1, tp_degree=None + ): super().__init__() self.model = model self.config = model.config + self.batch_size = batch_size self.num_beams = num_beams self.device = "xla" self.tp_degree = tp_degree @@ -125,7 +135,13 @@ def __init__(self, model: "PreTrainedModel", num_beams: int, sequence_length: in [ torch.nn.Parameter( torch.ones( - (num_beams, self.config.num_heads, sequence_length - 1, self.config.d_kv), dtype=torch.float32 + ( + self.batch_size * self.num_beams, + self.config.num_heads, + sequence_length - 1, + self.config.d_kv, + ), + dtype=torch.float32, ), requires_grad=False, ) @@ -136,7 +152,8 @@ def __init__(self, model: "PreTrainedModel", num_beams: int, sequence_length: in [ torch.nn.Parameter( torch.ones( - (num_beams, self.config.num_heads, sequence_length, self.config.d_kv), dtype=torch.float32 + (self.batch_size * self.num_beams, self.config.num_heads, sequence_length, self.config.d_kv), + dtype=torch.float32, ), requires_grad=False, ) @@ -175,9 +192,6 @@ def forward( beam_scores, **kwargs, ): - # Infer shapes - batch_size = input_ids.shape[0] or 1 - if self.num_beams > 1: # We reorder the cache based on the beams selected in each iteration. Required step for beam search. 
past_key_values_sa = self.reorder_cache(self.past_key_values_sa, beam_idx) @@ -234,7 +248,7 @@ def forward( # reshape for beam search vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, self.num_beams * vocab_size) + next_token_scores = next_token_scores.view(self.batch_size, self.num_beams * vocab_size) next_token_scores = next_token_scores * 1 # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search) diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 96af3e158..c859ba71b 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -33,6 +33,7 @@ is_torch_xla_available, is_transformers_neuronx_available, ) +from .input_generators import DummyBeamValuesGenerator from .optimization_utils import get_attention_scores_sd, get_attention_scores_sdxl from .patching import DynamicPatch, ModelPatcher, Patcher, patch_everywhere, patch_within_function from .training_utils import ( diff --git a/optimum/neuron/utils/input_generators.py b/optimum/neuron/utils/input_generators.py new file mode 100644 index 000000000..1616123a9 --- /dev/null +++ b/optimum/neuron/utils/input_generators.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Dummy input generation classes.""" +import torch + +from ...utils import DTYPE_MAPPER, DummyInputGenerator, NormalizedTextConfig + + +class DummyBeamValuesGenerator(DummyInputGenerator): + """ + Generates dummy beam search inputs. 
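+
+    For example, with `num_beams=4` the generator below returns `beam_idx = tensor([0, 1, 2, 3])` and
+    `beam_scores = tensor([0., 0., 0., 0.])`; only the shapes and dtypes matter for tracing.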
+ """ + + SUPPORTED_INPUT_NAMES = ( + "beam_idx", + "beam_scores", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + num_beams: int = 1, + **kwargs, + ): + self.task = task + self.num_beams = num_beams + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "beam_idx": + return torch.arange(0, self.num_beams, dtype=DTYPE_MAPPER.pt(int_dtype)) + elif input_name == "beam_scores": + return torch.zeros((self.num_beams,), dtype=DTYPE_MAPPER.pt(float_dtype)) From 7621e39ea652910aded31a28da17ec2618738f15 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 25 Oct 2023 10:53:20 +0000 Subject: [PATCH 05/30] CLI support --- optimum/commands/export/neuronx.py | 5 + optimum/exporters/neuron/__main__.py | 178 +++++++++++++++------- optimum/exporters/neuron/config.py | 4 + optimum/exporters/neuron/convert.py | 3 +- optimum/exporters/neuron/model_configs.py | 8 + optimum/exporters/neuron/utils.py | 24 ++- 6 files changed, 156 insertions(+), 66 deletions(-) diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py index 72673b8a4..616cee693 100644 --- a/optimum/commands/export/neuronx.py +++ b/optimum/commands/export/neuronx.py @@ -102,6 +102,11 @@ def parse_args_neuronx(parser: "ArgumentParser"): type=int, help=f"Sequence length {doc_input}", ) + input_group.add_argument( + "--num_beams", + type=int, + help=f"Number of beams for beam search {doc_input}", + ) input_group.add_argument( "--num_choices", type=int, diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index bac4906da..f0402552d 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -25,11 +25,13 @@ from transformers import AutoConfig from ...neuron.utils import ( + DECODER_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, DIFFUSION_MODEL_TEXT_ENCODER_NAME, DIFFUSION_MODEL_UNET_NAME, DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, + ENCODER_NAME, NEURON_FILE_NAME, is_neuron_available, is_neuronx_available, @@ -43,6 +45,7 @@ from .model_configs import * # noqa: F403 from .utils import ( build_stable_diffusion_components_mandatory_shapes, + get_encoder_decoder_models_for_export, get_stable_diffusion_models_for_export, ) @@ -63,8 +66,10 @@ if TYPE_CHECKING: + from transformers import PreTrainedModel + if is_diffusers_available(): - from diffusers import StableDiffusionPipeline + from diffusers import DiffusionPipeline, StableDiffusionPipeline logger = logging.get_logger() @@ -102,7 +107,11 @@ def infer_task(task: str, model_name_or_path: str) -> str: def normalize_input_shapes(task: str, args: argparse.Namespace) -> Dict[str, int]: config = AutoConfig.from_pretrained(args.model) + model_type = config.model_type.replace("_", "-") + if config.is_encoder_decoder: + model_type = model_type + "-encoder" + neuron_config_constructor = TasksManager.get_exporter_config_constructor( model_type=model_type, exporter="neuron", task=task ) @@ -172,6 +181,113 @@ def infer_stable_diffusion_shapes_from_diffusers( return input_shapes +def _get_submodels_and_neuron_configs( + model: Union["PreTrainedModel", "DiffusionPipeline"], + input_shapes: Dict[str, int], + task: str, + output: Path, + dynamic_batch_size: bool = False, + model_name_or_path: Optional[Union[str, Path]] = None, +): + is_stable_diffusion = "stable-diffusion" in task + is_encoder_decoder = model.config.is_encoder_decoder + + if is_stable_diffusion: + return 
_get_submodels_and_neuron_configs_for_stable_diffusion( + model, input_shapes, task, output, dynamic_batch_size + ) + elif is_encoder_decoder: + return _get_submodels_and_neuron_configs_for_encoder_decoder( + model, input_shapes, task, output, dynamic_batch_size, model_name_or_path + ) + else: + neuron_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="neuron", task=task + ) + neuron_config = neuron_config_constructor(model.config, dynamic_batch_size=dynamic_batch_size, **input_shapes) + model_name = model.name_or_path.split("/")[-1] + output_model_names = {model_name: "model.neuron"} + models_and_neuron_configs = {model_name: (model, neuron_config)} + maybe_save_preprocessors(model_name_or_path, output) + return models_and_neuron_configs, output_model_names + + +def _get_submodels_and_neuron_configs_for_stable_diffusion( + model: Union["PreTrainedModel", "DiffusionPipeline"], + input_shapes: Dict[str, int], + task: str, + output: Path, + dynamic_batch_size: bool = False, +): + check_compiler_compatibility_for_stable_diffusion() + if is_neuron_available(): + raise RuntimeError( + "Stable diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead." + ) + input_shapes = infer_stable_diffusion_shapes_from_diffusers(input_shapes, model) + + # Saving the model config and preprocessor as this is needed sometimes. + model.scheduler.save_pretrained(output.joinpath("scheduler")) + if hasattr(model, "tokenizer") and model.tokenizer is not None: + model.tokenizer.save_pretrained(output.joinpath("tokenizer")) + if hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None: + model.tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + if hasattr(model, "feature_extractor"): + model.feature_extractor.save_pretrained(output.joinpath("feature_extractor")) + model.save_config(output) + + models_and_neuron_configs = get_stable_diffusion_models_for_export( + pipeline=model, + task=task, + dynamic_batch_size=dynamic_batch_size, + **input_shapes, + ) + output_model_names = { + DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME), + DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), + DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), + } + if hasattr(model, "text_encoder") and model.text_encoder is not None: + output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = os.path.join( + DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME + ) + if hasattr(model, "text_encoder_2") and model.text_encoder_2 is not None: + output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = os.path.join( + DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME + ) + del model + + return models_and_neuron_configs, output_model_names + + +def _get_submodels_and_neuron_configs_for_encoder_decoder( + model: Union["PreTrainedModel", "DiffusionPipeline"], + input_shapes: Dict[str, int], + task: str, + output: Path, + dynamic_batch_size: bool = False, + model_name_or_path: Optional[Union[str, Path]] = None, +): + if is_neuron_available(): + raise RuntimeError( + "Encoder-decoder models export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead." 
+ ) + + models_and_neuron_configs = get_encoder_decoder_models_for_export( + model=model, + task=task, + dynamic_batch_size=dynamic_batch_size, + input_shapes=input_shapes, + ) + output_model_names = { + ENCODER_NAME: os.path.join(ENCODER_NAME, NEURON_FILE_NAME), + DECODER_NAME: os.path.join(DECODER_NAME, NEURON_FILE_NAME), + } + maybe_save_preprocessors(model_name_or_path, output) + + return models_and_neuron_configs, output_model_names + + def main_export( model_name_or_path: str, output: Union[str, Path], @@ -194,6 +310,7 @@ def main_export( output.parent.mkdir(parents=True) task = TasksManager.map_from_synonym(task) + is_stable_diffusion = "stable-diffusion" in task model_kwargs = { "task": task, @@ -209,57 +326,14 @@ def main_export( } model = TasksManager.get_model_from_task(**model_kwargs) - is_stable_diffusion = "stable-diffusion" in task - if not is_stable_diffusion: - neuron_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="neuron", task=task - ) - neuron_config = neuron_config_constructor(model.config, dynamic_batch_size=dynamic_batch_size, **input_shapes) - if atol is None: - atol = neuron_config.ATOL_FOR_VALIDATION - model_name = model.name_or_path.split("/")[-1] - output_model_names = {model_name: "model.neuron"} - models_and_neuron_configs = {model_name: (model, neuron_config)} - maybe_save_preprocessors(model, output.parent) - - if is_stable_diffusion: - check_compiler_compatibility_for_stable_diffusion() - if is_neuron_available(): - raise RuntimeError( - "Stable diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead." - ) - input_shapes = infer_stable_diffusion_shapes_from_diffusers(input_shapes, model) - - # Saving the model config and preprocessor as this is needed sometimes. 
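A minimal sketch of how the new encoder/decoder export path can be driven programmatically, assuming a T5 checkpoint and placeholder static shapes (passing an empty `compiler_kwargs` is an assumption here, meant to defer to the compiler defaults):

    from pathlib import Path

    from optimum.exporters.neuron import main_export

    # Compiles the encoder and the decoder of the seq2seq model into two separate Neuron artifacts.
    main_export(
        model_name_or_path="t5-small",      # placeholder checkpoint
        output=Path("t5_small_neuron"),
        compiler_kwargs={},                 # assumption: compiler defaults are acceptable
        task="text2text-generation",
        batch_size=1,
        sequence_length=64,
        num_beams=4,                        # mandatory axis for t5-encoder / t5-decoder
    )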
- model.scheduler.save_pretrained(output.joinpath("scheduler")) - if hasattr(model, "tokenizer") and model.tokenizer is not None: - model.tokenizer.save_pretrained(output.joinpath("tokenizer")) - if hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None: - model.tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) - if hasattr(model, "feature_extractor"): - model.feature_extractor.save_pretrained(output.joinpath("feature_extractor")) - model.save_config(output) - - models_and_neuron_configs = get_stable_diffusion_models_for_export( - pipeline=model, - task=task, - dynamic_batch_size=dynamic_batch_size, - **input_shapes, - ) - output_model_names = { - DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), - } - if hasattr(model, "text_encoder") and model.text_encoder is not None: - output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = os.path.join( - DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME - ) - if hasattr(model, "text_encoder_2") and model.text_encoder_2 is not None: - output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = os.path.join( - DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME - ) - del model + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs( + model=model, + input_shapes=input_shapes, + task=task, + output=output, + dynamic_batch_size=dynamic_batch_size, + model_name_or_path=model_name_or_path, + ) _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 3f18d67d6..42fc7d593 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -78,6 +78,10 @@ class TextSeq2SeqNeuronConfig(NeuronConfig): DummySeq2SeqPastKeyValuesGenerator, ) + @property + def is_decoder(self) -> bool: + raise NotImplementedError() + @property def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = [] diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 2466cf9c5..185bbf1d3 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -428,7 +428,8 @@ def export_neuronx( aliases = {} if model.config.is_encoder_decoder: checked_model = config.patch_model_for_export(model, **input_shapes) - aliases = config.generate_io_aliases(checked_model) + if getattr(config, "is_decoder", False): + aliases = config.generate_io_aliases(checked_model) else: checked_model = config.patch_model_for_export(model, dummy_inputs) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 92d566323..3d3bd3395 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -381,6 +381,10 @@ class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): allow_new=True, ) + @property + def is_decoder(self) -> bool: + return False + def patch_model_for_export(self, model, **kwargs): num_beams = kwargs.pop("num_beams", 1) return super().patch_model_for_export( @@ -405,6 +409,10 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): allow_new=True, ) + @property + def is_decoder(self) -> bool: + return True + @property def inputs(self) -> List[str]: common_inputs = super().inputs + ["beam_idx", "beam_scores"] diff --git 
a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index d4c9a85bf..04cec9208 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -326,8 +326,8 @@ def override_diffusers_2_0_attn_processors(model): def get_encoder_decoder_models_for_export( model: "PreTrainedModel", - encoder_input_shapes: Dict[str, int], - decoder_input_shapes: Dict[str, int], + task: str, + input_shapes: Dict[str, int], dynamic_batch_size: Optional[bool] = False, ) -> Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]: """ @@ -339,10 +339,8 @@ def get_encoder_decoder_models_for_export( Args: model ("PreTrainedModel"): The model to export. - encoder_input_shapes (`Dict[str, int]`): - Static shapes used for compiling the encoder. - decoder_input_shapes (`Dict[str, int]`): - Static shapes used for compiling the decoder. + input_shapes (`Dict[str, int]`): + Static shapes used for compiling the encoder and the decoder. dynamic_batch_size (`bool`, defaults to `False`): Whether the Neuron compiled model supports dynamic batch size. @@ -350,31 +348,31 @@ def get_encoder_decoder_models_for_export( `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and Neuron configs for the different components of the model. """ - models_for_export = [] + models_for_export = {} # Encoder model_type = getattr(model.config, "model_type") + "-encoder" encoder_config_constructor = TasksManager.get_exporter_config_constructor( - exporter="neuron", model_type=model_type, task="text2text-generation" + exporter="neuron", model_type=model_type, task=task ) encoder_neuron_config = encoder_config_constructor( config=model.config, - task="text2text-generation", + task=task, dynamic_batch_size=dynamic_batch_size, - **encoder_input_shapes, + **input_shapes, ) models_for_export[ENCODER_NAME] = (model, encoder_neuron_config) # Decoder model_type = getattr(model.config, "model_type") + "-decoder" decoder_config_constructor = TasksManager.get_exporter_config_constructor( - exporter="neuron", model_type=model_type, task="text2text-generation" + exporter="neuron", model_type=model_type, task=task ) decoder_neuron_config = decoder_config_constructor( config=model.config, - task="text2text-generation", + task=task, dynamic_batch_size=dynamic_batch_size, - **decoder_input_shapes, + **input_shapes, ) models_for_export[DECODER_NAME] = (model, decoder_neuron_config) From 1eaa54adccc87e73a3421eb5d6840393b09827c1 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 25 Oct 2023 22:55:44 +0000 Subject: [PATCH 06/30] validation --- optimum/exporters/neuron/config.py | 21 +++++- optimum/exporters/neuron/convert.py | 12 ++-- optimum/exporters/neuron/model_configs.py | 13 ++-- optimum/exporters/neuron/model_wrappers.py | 83 +++++++++++++++------- 4 files changed, 91 insertions(+), 38 deletions(-) diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 42fc7d593..82cbf4450 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -88,7 +88,6 @@ def inputs(self) -> Dict[str, Dict[int, str]]: # encoder + decoder without past if "encoder" in self.MODEL_TYPE: common_inputs = ["input_ids", "attention_mask"] - # decoder with past if "decoder" in self.MODEL_TYPE: common_inputs = [ @@ -102,12 +101,28 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: + common_outputs = [] # encoder + decoder without past if "encoder" in self.MODEL_TYPE: - common_outputs = 
["present_key_values_self_attn", "past_key_values_cross_attn"] + common_outputs = ( + [f"present.{idx}.self.key" for idx in range(self._config.num_decoder_layers)] + + [f"present.{idx}.self.value" for idx in range(self._config.num_decoder_layers)] + + [f"present.{idx}.cross.key" for idx in range(self._config.num_decoder_layers)] + + [f"present.{idx}.cross.value" for idx in range(self._config.num_decoder_layers)] + ) # decoder with past if "decoder" in self.MODEL_TYPE: - common_outputs = ["next_tokens", "past_key_values_self_attn", "past_key_values_cross_attn"] + beam_outputs = ( + ["next_token_scores", "next_tokens", "next_indices"] if self.num_beams > 1 else ["next_tokens"] + ) + # for i in range(self._config.num_decoder_layers): + common_outputs = ( + beam_outputs + + [f"past.{idx}.self.key" for idx in range(self._config.num_decoder_layers)] + + [f"past.{idx}.self.value" for idx in range(self._config.num_decoder_layers)] + + [f"past.{idx}.cross.key" for idx in range(self._config.num_decoder_layers)] + + [f"past.{idx}.cross.value" for idx in range(self._config.num_decoder_layers)] + ) return common_outputs def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 185bbf1d3..c4072cc2a 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -169,8 +169,12 @@ def validate_model_outputs( with torch.no_grad(): reference_model.eval() ref_inputs = config.generate_dummy_inputs(return_tuple=False, **input_shapes) - if hasattr(config._config, "_class_name") and "AutoencoderKL" in config._config._class_name: - # VAE components for stable diffusion + if reference_model.config.is_encoder_decoder: + reference_model = config.patch_model_for_export(reference_model, device="cpu", **input_shapes) + if ( + hasattr(config._config, "_class_name") and "AutoencoderKL" in config._config._class_name + ) or reference_model.config.is_encoder_decoder: + # VAE components for stable diffusion or Encoder-Decoder models ref_inputs = tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) neuron_inputs = ref_inputs @@ -217,9 +221,9 @@ def validate_model_outputs( # Check the shape and values match shape_failures = [] value_failures = [] - for name, output in zip(neuron_output_names_list, neuron_outputs): + for i, (name, output) in enumerate(zip(neuron_output_names_list, neuron_outputs)): if isinstance(output, torch.Tensor): - ref_output = ref_outputs[name].numpy() + ref_output = ref_outputs[name].numpy() if isinstance(ref_outputs, Dict) else ref_outputs[i].numpy() output = output.numpy() elif isinstance(output, tuple): # eg. `hidden_states` of `AutoencoderKL` is a tuple of tensors. 
ref_output = torch.stack(ref_outputs[name]).numpy() diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 3d3bd3395..0b251cc27 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -372,6 +372,7 @@ class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): ATOL_FOR_VALIDATION = 1e-3 MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-encoder" + CUSTOM_MODEL_WRAPPER = T5EncoderWrapper NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="d_model", num_attention_heads="num_heads", @@ -385,12 +386,12 @@ class T5EncoderNeuronConfig(TextSeq2SeqNeuronConfig): def is_decoder(self) -> bool: return False - def patch_model_for_export(self, model, **kwargs): + def patch_model_for_export(self, model, device="xla", **kwargs): num_beams = kwargs.pop("num_beams", 1) return super().patch_model_for_export( model=model, - custom_model_wrapper=T5EncoderWrapper, - custom_wrapper_kwargs={"num_beams": num_beams}, + custom_model_wrapper=self.CUSTOM_MODEL_WRAPPER, + custom_wrapper_kwargs={"num_beams": num_beams, "device": device}, ) @@ -400,6 +401,7 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): DUMMY_INPUT_GENERATOR_CLASSES = TextSeq2SeqNeuronConfig.DUMMY_INPUT_GENERATOR_CLASSES + (DummyBeamValuesGenerator,) MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-decoder" + CUSTOM_MODEL_WRAPPER = T5DecoderWrapper NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="d_model", num_attention_heads="num_heads", @@ -438,11 +440,12 @@ def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGen dummy_inputs_generators.append(dummy_beam_values_generator) return dummy_inputs_generators - def patch_model_for_export(self, model, **kwargs): + def patch_model_for_export(self, model, device="xla", **kwargs): return super().patch_model_for_export( model=model, - custom_model_wrapper=T5DecoderWrapper, + custom_model_wrapper=self.CUSTOM_MODEL_WRAPPER, custom_wrapper_kwargs={ + "device": device, "batch_size": kwargs.pop("batch_size", 1), "sequence_length": kwargs.pop("sequence_length", 1), "num_beams": kwargs.pop("num_beams", 1), diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 8477fdd07..6bc869203 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -48,13 +48,14 @@ def __init__( self, model: "PreTrainedModel", num_beams: int = 1, + device: str = "xla", tp_degree=None, ): super().__init__() self.model = model self.config = model.config self.num_beams = num_beams - self.device = "xla" + self.device = device self.tp_degree = tp_degree def forward(self, input_ids, attention_mask): @@ -120,46 +121,72 @@ class T5DecoderWrapper(torch.nn.Module): """Wrapper to trace the decoder with past keys values with a language head.""" def __init__( - self, model: "PreTrainedModel", batch_size: int, sequence_length: int, num_beams: int = 1, tp_degree=None + self, + model: "PreTrainedModel", + batch_size: int, + sequence_length: int, + num_beams: int = 1, + device: str = "xla", + tp_degree=None, ): super().__init__() self.model = model self.config = model.config self.batch_size = batch_size + self.sequence_length = sequence_length self.num_beams = num_beams - self.device = "xla" + self.device = device self.tp_degree = tp_degree # Initialize KV cache (num_beams, n_heads, seq_length, dim_per_head) - 
self.past_key_values_sa = torch.nn.ParameterList( - [ - torch.nn.Parameter( - torch.ones( - ( - self.batch_size * self.num_beams, - self.config.num_heads, - sequence_length - 1, - self.config.d_kv, - ), - dtype=torch.float32, - ), - requires_grad=False, + if device == "cpu": + self.past_key_values_sa = [ + torch.ones( + (num_beams, self.config.num_heads, self.sequence_length - 1, self.config.d_kv), dtype=torch.float32 ) for _ in range(self.config.num_decoder_layers * 2) ] - ) - self.past_key_values_ca = torch.nn.ParameterList( - [ - torch.nn.Parameter( - torch.ones( - (self.batch_size * self.num_beams, self.config.num_heads, sequence_length, self.config.d_kv), - dtype=torch.float32, - ), - requires_grad=False, + self.past_key_values_ca = [ + torch.ones( + (num_beams, self.config.num_heads, self.sequence_length, self.config.d_kv), dtype=torch.float32 ) for _ in range(self.config.num_decoder_layers * 2) ] - ) + elif device == "xla": + self.past_key_values_sa = torch.nn.ParameterList( + [ + torch.nn.Parameter( + torch.ones( + ( + self.batch_size * self.num_beams, + self.config.num_heads, + sequence_length - 1, + self.config.d_kv, + ), + dtype=torch.float32, + ), + requires_grad=False, + ) + for _ in range(self.config.num_decoder_layers * 2) + ] + ) + self.past_key_values_ca = torch.nn.ParameterList( + [ + torch.nn.Parameter( + torch.ones( + ( + self.batch_size * self.num_beams, + self.config.num_heads, + sequence_length, + self.config.d_kv, + ), + dtype=torch.float32, + ), + requires_grad=False, + ) + for _ in range(self.config.num_decoder_layers * 2) + ] + ) def update_past(self, past_key_values): new_past_sa = [] @@ -235,6 +262,10 @@ def forward( past_key_values_sa = [vec for kv_per_layer in past_key_values_sa for vec in kv_per_layer] past_key_values_ca = [vec for kv_per_layer in past_key_values_ca for vec in kv_per_layer] + if self.device == "cpu": + self.past_key_values_sa = past_key_values_sa + self.past_key_values_ca = past_key_values_ca + # We calculate topk inside the wrapper next_token_logits = lm_logits[:, -1, :] From 2231afbdd9db0dd9b07ffd357947bed7cfec3b35 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 26 Oct 2023 17:24:28 +0000 Subject: [PATCH 07/30] add seq2seq base model --- optimum/exporters/neuron/utils.py | 11 ++ optimum/neuron/__init__.py | 2 + optimum/neuron/modeling_diffusion.py | 2 + optimum/neuron/modeling_seq2seq.py | 156 +++++++++++++++++++++++---- 4 files changed, 149 insertions(+), 22 deletions(-) diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 04cec9208..29e898f2d 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -324,6 +324,15 @@ def override_diffusers_2_0_attn_processors(model): return model +def check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes): + mandatory_shapes = neuron_config_constructor.func.get_mandatory_axes_for_task(task) + for name in mandatory_shapes: + if input_shapes.get(name, None) is None: + raise AttributeError( + f"Cannot find the value of `{name}` which is mandatory for exporting the model to the neuron format, please set the value explicitly." 
+ ) + + def get_encoder_decoder_models_for_export( model: "PreTrainedModel", task: str, @@ -355,6 +364,7 @@ def get_encoder_decoder_models_for_export( encoder_config_constructor = TasksManager.get_exporter_config_constructor( exporter="neuron", model_type=model_type, task=task ) + check_mandatory_input_shapes(encoder_config_constructor, task, input_shapes) encoder_neuron_config = encoder_config_constructor( config=model.config, task=task, @@ -368,6 +378,7 @@ def get_encoder_decoder_models_for_export( decoder_config_constructor = TasksManager.get_exporter_config_constructor( exporter="neuron", model_type=model_type, task=task ) + check_mandatory_input_shapes(encoder_config_constructor, task, input_shapes) decoder_neuron_config = decoder_config_constructor( config=model.config, task=task, diff --git a/optimum/neuron/__init__.py b/optimum/neuron/__init__.py index 12dcb93a9..1398d3f8a 100644 --- a/optimum/neuron/__init__.py +++ b/optimum/neuron/__init__.py @@ -41,6 +41,7 @@ "NeuronStableDiffusionXLInpaintPipeline", ], "modeling_decoder": ["NeuronDecoderModel"], + "modeling_seq2seq": ["NeuronModelForSeq2SeqLM"], "accelerate": [ "NeuronAccelerator", "NeuronAcceleratorState", @@ -71,6 +72,7 @@ NeuronStableDiffusionXLInpaintPipeline, NeuronStableDiffusionXLPipeline, ) + from .modeling_seq2seq import NeuronModelForSeq2SeqLM from .pipelines import pipeline from .trainers import NeuronTrainer, Seq2SeqNeuronTrainer from .training_args import NeuronTrainingArguments, Seq2SeqNeuronTrainingArguments diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index d2e947d47..5f0befdd1 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -360,6 +360,7 @@ def _from_pretrained( config: Dict[str, Any], use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, + force_download: bool = False, cache_dir: Optional[str] = None, text_encoder_file_name: Optional[str] = NEURON_FILE_NAME, text_encoder_2_file_name: Optional[str] = NEURON_FILE_NAME, @@ -400,6 +401,7 @@ def _from_pretrained( local_files_only=local_files_only, use_auth_token=use_auth_token, revision=revision, + force_download=force_download, allow_patterns=allow_patterns, ignore_patterns=["*.msgpack", "*.safetensors", "*.bin"], ) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index f39505cba..39346fd3d 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -13,17 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
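Note on the expected artifact layout: the module added below loads the two compiled graphs from the same directory structure that the export path above writes, i.e. — assuming `ENCODER_NAME`/`DECODER_NAME` resolve to `encoder`/`decoder` and `NEURON_FILE_NAME` to `model.neuron` (their definitions live in constant.py and are not shown in this series) — an `encoder/` folder and a `decoder/` folder, each holding a `config.json` next to the compiled `model.neuron`, with an optional generation config looked up under the decoder subfolder.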
"""NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +import os from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union import torch -from transformers import AutoModelForSeq2SeqLM +from huggingface_hub import snapshot_download +from transformers import AutoConfig, AutoModelForSeq2SeqLM +from ..exporters.neuron import ( + NeuronConfig, + main_export, +) +from ..exporters.neuron.model_configs import * # noqa: F403 +from ..exporters.tasks import TasksManager +from ..utils.save_utils import maybe_load_preprocessors from .generation import NeuronGenerationMixin -from .modeling_base import NeuronBaseModel, NeuronConfig +from .modeling_base import NeuronBaseModel from .utils import ( + DECODER_NAME, + ENCODER_NAME, NEURON_FILE_NAME, is_neuronx_available, ) @@ -43,23 +54,34 @@ def __init__( self, encoder: torch.jit._script.ScriptModule, decoder: torch.jit._script.ScriptModule, - config: "PretrainedConfig", + configs: Optional[Dict[str, "PretrainedConfig"]] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - model_file_name: Optional[str] = None, + encoder_file_name: Optional[str] = NEURON_FILE_NAME, + decoder_file_name: Optional[str] = NEURON_FILE_NAME, preprocessors: Optional[List] = None, - neuron_config: Optional["NeuronConfig"] = None, + neuron_configs: Optional[Dict[str, "NeuronConfig"]] = None, **kwargs, ): - pass - - @staticmethod - def load_model( - encoder_path: Union[str, Path], - decoder_path: Optional[Union[str, Path]] = None, - device_ids: Optional[List[int]] = None, - dynamic_batch_size: bool = False, - ): - pass + self.encoder = NeuronEncoder( + encoder, + self, + self.configs[ENCODER_NAME], + self.neuron_configs[ENCODER_NAME], + ) + self.decoder = NeuronEncoder( + decoder, + self, + self.configs[DECODER_NAME], + self.neuron_configs[DECODER_NAME], + ) + self.configs = configs + self.neuron_configs = neuron_configs + self.dynamic_batch_size = all( + neuron_config._config.neuron["dynamic_batch_size"] for neuron_config in self.neuron_configs.values() + ) + self._attributes_init(model_save_dir, preprocessors, **kwargs) + self.encoder_file_name = encoder_file_name + self.decoder_file_name = decoder_file_name def _save_pretrained( self, @@ -76,13 +98,14 @@ def _save_pretrained( save_directory (`Union[str, Path`]): The directory where to save the model files. 
""" - pass + save_directory = Path(save_directory) + # TODO @classmethod def _from_pretrained( cls, model_id: Union[str, Path], - config: Dict[str, Any], + config: "PretrainedConfig", use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -92,10 +115,63 @@ def _from_pretrained( subfolder: str = "", local_files_only: bool = False, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - device_ids: Optional[List[int]] = None, **kwargs, ): - pass + import pdb + + pdb.set_trace() + patterns = {ENCODER_NAME, DECODER_NAME} + + if not os.path.isdir(model_id): + allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} + # Downloads all repo's files matching the allowed patterns + model_id = snapshot_download( + model_id, + cache_dir=cache_dir, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + allow_patterns=allow_patterns, + ignore_patterns=["*.msgpack", "*.safetensors", "*.bin"], # only download *.neuron artifacts + ) + + preprocessors = maybe_load_preprocessors(model_id, subfolder=subfolder) + + new_model_save_dir = Path(model_id) + + model_and_config_save_paths = { + "encoder": ( + new_model_save_dir / ENCODER_NAME / encoder_file_name, + new_model_save_dir / ENCODER_NAME / cls.config_name, + ), + "decoder": ( + new_model_save_dir / DECODER_NAME / decoder_file_name, + new_model_save_dir / DECODER_NAME / cls.config_name, + ), + } + + # Re-build pretrained configs and neuron configs + configs, neuron_configs = {}, {} + for name, file_paths in model_and_config_save_paths.items(): + if file_paths[1].is_file(): + model_config = AutoConfig.from_json_file(file_paths[1]) + configs[name] = model_config + neuron_configs[name] = cls._neuron_config_init(model_config) + + encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) + decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) + + return cls( + encoder=encoder, + decoder=decoder, + configs=configs, + model_save_dir=model_save_dir, + encoder_file_name=encoder_file_name, + decoder_file_name=decoder_file_name, + preprocessors=preprocessors, + neuron_configs=neuron_configs, + ) @classmethod def _from_transformers( @@ -115,9 +191,45 @@ def _from_transformers( disable_fast_relayout: Optional[bool] = False, disable_fallback: bool = False, dynamic_batch_size: bool = False, - device_ids: Optional[List[int]] = None, + **kwargs_shapes, ) -> "NeuronModelForConditionalGeneration": - pass + if task is None: + task = TasksManager.infer_task_from_model(cls.auto_model_class) + + # Get compilation arguments + auto_cast_type = None if auto_cast is None else auto_cast_type + compiler_kwargs = { + "auto_cast": auto_cast, + "auto_cast_type": auto_cast_type, + "disable_fast_relayout": disable_fast_relayout, + "disable_fallback": disable_fallback, + } + + save_dir = TemporaryDirectory() + save_dir_path = Path(save_dir.name) + + main_export( + model_name_or_path=model_id, + output=save_dir_path, + compiler_kwargs=compiler_kwargs, + task=task, + dynamic_batch_size=dynamic_batch_size, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + subfolder=subfolder, + revision=revision, + force_download=force_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + do_validation=False, + **kwargs_shapes, + ) + + return cls._from_pretrained( + model_id=save_dir_path, + config=config, + model_save_dir=save_dir, + ) class 
NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): From 72ed695cceb4b247eb9106c30b7195032ee14d5a Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 30 Oct 2023 18:04:38 +0000 Subject: [PATCH 08/30] modeling export and loading --- optimum/commands/export/neuronx.py | 1 + optimum/exporters/neuron/__main__.py | 2 +- optimum/exporters/neuron/convert.py | 4 +- optimum/neuron/modeling_base.py | 3 +- optimum/neuron/modeling_diffusion.py | 14 ++--- optimum/neuron/modeling_seq2seq.py | 86 +++++++++++++++++++++++--- optimum/neuron/utils/argument_utils.py | 12 ++-- 7 files changed, 99 insertions(+), 23 deletions(-) diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py index 616cee693..d73f252bb 100644 --- a/optimum/commands/export/neuronx.py +++ b/optimum/commands/export/neuronx.py @@ -105,6 +105,7 @@ def parse_args_neuronx(parser: "ArgumentParser"): input_group.add_argument( "--num_beams", type=int, + default=1, help=f"Number of beams for beam search {doc_input}", ) input_group.add_argument( diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index f0402552d..c671f9cb0 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -261,7 +261,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( def _get_submodels_and_neuron_configs_for_encoder_decoder( - model: Union["PreTrainedModel", "DiffusionPipeline"], + model: "PreTrainedModel", input_shapes: Dict[str, int], task: str, output: Path, diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index c4072cc2a..da4667fb0 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -328,7 +328,7 @@ def export_models( if is_diffusers_available() and isinstance(model_config, FrozenDict): model_config = OrderedDict(model_config) model_config = DiffusersPretrainedConfig.from_dict(model_config) - + model_config = store_compilation_config( config=model_config, input_shapes=sub_neuron_config.input_shapes, @@ -343,6 +343,8 @@ def export_models( ) if isinstance(model_config, PretrainedConfig): model_config = DiffusersPretrainedConfig.from_dict(model_config.__dict__) + import pdb + pdb.set_trace() model_config.save_pretrained(output_path.parent) except Exception as e: failed_models.append((i, model_name)) diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 7e6e3da3f..fa21a16c1 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -408,8 +408,9 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronConfig": # Neuron config constructuor task = getattr(config, "task") or TasksManager.infer_task_from_model(cls.auto_model_class) task = TasksManager.map_from_synonym(task) + model_type = neuron_configs.get("model_type", None) or config.model_type neuron_config_constructor = TasksManager.get_exporter_config_constructor( - model_type=config.model_type, exporter="neuron", task=task + model_type=model_type, exporter="neuron", task=task ) return neuron_config_constructor( diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 5f0befdd1..d7ee37ab1 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -294,6 +294,12 @@ def _save_pretrained( """ Saves the model to the serialized format optimized for Neuron devices. 
""" + if self.model_and_config_save_paths is None: + logger.warning( + "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." + ) + return + save_directory = Path(save_directory) if not self.model_and_config_save_paths.get(DIFFUSION_MODEL_VAE_ENCODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DIFFUSION_MODEL_VAE_ENCODER_NAME) @@ -304,13 +310,7 @@ def _save_pretrained( if not self.model_and_config_save_paths.get(DIFFUSION_MODEL_TEXT_ENCODER_2_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DIFFUSION_MODEL_TEXT_ENCODER_2_NAME) - if self.model_and_config_save_paths is None: - logger.warning( - "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." - ) - return - else: - logger.info(f"Saving the {tuple(self.model_and_config_save_paths.keys())}...") + logger.info(f"Saving the {tuple(self.model_and_config_save_paths.keys())}...") dst_paths = { DIFFUSION_MODEL_TEXT_ENCODER_NAME: save_directory diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 39346fd3d..9bb3df27c 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -14,6 +14,8 @@ # limitations under the License. """NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" import os +import shutil +import logging from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -21,7 +23,7 @@ import torch from huggingface_hub import snapshot_download -from transformers import AutoConfig, AutoModelForSeq2SeqLM +from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig from ..exporters.neuron import ( NeuronConfig, @@ -46,9 +48,12 @@ if is_neuronx_available(): pass +logger = logging.getLogger(__name__) + class NeuronModelForConditionalGeneration(NeuronBaseModel): base_model_prefix = "neuron_model" + config_name = "config.json" def __init__( self, @@ -60,6 +65,7 @@ def __init__( decoder_file_name: Optional[str] = NEURON_FILE_NAME, preprocessors: Optional[List] = None, neuron_configs: Optional[Dict[str, "NeuronConfig"]] = None, + generation_config: Optional[GenerationConfig] = None, **kwargs, ): self.encoder = NeuronEncoder( @@ -82,6 +88,10 @@ def __init__( self._attributes_init(model_save_dir, preprocessors, **kwargs) self.encoder_file_name = encoder_file_name self.decoder_file_name = decoder_file_name + + if generation_config is None: + generation_config = GenerationConfig.from_model_config(self.configs[DECODER_NAME]) + self.generation_config = generation_config def _save_pretrained( self, @@ -98,8 +108,49 @@ def _save_pretrained( save_directory (`Union[str, Path`]): The directory where to save the model files. """ + if self.model_and_config_save_paths is None: + logger.warning( + "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." 
+ ) + return + save_directory = Path(save_directory) - # TODO + if not self.model_and_config_save_paths.get(ENCODER_NAME)[0].is_file(): + self.model_and_config_save_paths.pop(ENCODER_NAME) + + if not self.model_and_config_save_paths.get(DECODER_NAME)[0].is_file(): + self.model_and_config_save_paths.pop(DECODER_NAME) + + dst_paths = { + ENCODER_NAME: save_directory / ENCODER_NAME / encoder_file_name, + DECODER_NAME: save_directory / DECODER_NAME / decoder_file_name, + } + + model_src_to_dst_path = { + self.model_and_config_save_paths[model_name][0]: dst_paths[model_name] + for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) + } + # save + config_src_to_dst_path = { + self.model_and_config_save_paths[model_name][1]: dst_paths[model_name].parent / self.config_name + for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) + } + + src_paths = list(model_src_to_dst_path.keys()) + list(config_src_to_dst_path.keys()) + dst_paths = list(model_src_to_dst_path.values()) + list(config_src_to_dst_path.values()) + + for src_path, dst_path in zip(src_paths, dst_paths): + dst_path.parent.mkdir(parents=True, exist_ok=True) + if src_path.is_file(): + shutil.copyfile(src_path, dst_path) + + src_paths = [Path(path) for path in self.onnx_paths] + dst_paths = [save_directory / path.name for path in src_paths] + + if self.tokenizer is not None: + self.tokenizer.save_pretrained(save_directory) + + self.generation_config.save_pretrained(save_directory) @classmethod def _from_pretrained( @@ -117,13 +168,9 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - import pdb - - pdb.set_trace() - patterns = {ENCODER_NAME, DECODER_NAME} + model_id = str(model_id) if not os.path.isdir(model_id): - allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} # Downloads all repo's files matching the allowed patterns model_id = snapshot_download( model_id, @@ -132,7 +179,6 @@ def _from_pretrained( use_auth_token=use_auth_token, revision=revision, force_download=force_download, - allow_patterns=allow_patterns, ignore_patterns=["*.msgpack", "*.safetensors", "*.bin"], # only download *.neuron artifacts ) @@ -155,12 +201,33 @@ def _from_pretrained( configs, neuron_configs = {}, {} for name, file_paths in model_and_config_save_paths.items(): if file_paths[1].is_file(): - model_config = AutoConfig.from_json_file(file_paths[1]) + model_config = AutoConfig.from_pretrained(file_paths[1]) configs[name] = model_config neuron_configs[name] = cls._neuron_config_init(model_config) encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) + + # TODO: Debug num_beams unmatched issue + import pdb + pdb.set_trace() + + if model_save_dir is None: + model_save_dir = new_model_save_dir + + generation_config = None + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=os.path.join(subfolder, DECODER_NAME), + ) + except OSError: + logger.info("Generation config file not found, using a generation config created from the model config.") return cls( encoder=encoder, @@ -171,6 +238,7 @@ def _from_pretrained( decoder_file_name=decoder_file_name, preprocessors=preprocessors, neuron_configs=neuron_configs, + generation_config=generation_config, 
) @classmethod diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index 68c79b684..eb24d169f 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -172,6 +172,13 @@ def store_compilation_config( config_args["input_names"] = input_names config_args["output_names"] = output_names + + original_model_type = getattr(config, "model_type", None) + neuron_model_type = str(model_type).replace("_", "-") + if original_model_type is None: + update_func("model_type", neuron_model_type) # Add model_type to the config if it doesn't exist before, eg. submodel of Stable Diffusion. + elif neuron_model_type != original_model_type: + config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. update_func("neuron", config_args) @@ -179,10 +186,7 @@ def store_compilation_config( import diffusers update_func("_diffusers_version", diffusers.__version__) - - model_type = getattr(config, "model_type", None) or model_type - model_type = str(model_type).replace("_", "-") - update_func("model_type", model_type) + update_func("task", task) return config From 16ddeeb4aa83430ec9b6771503841d7c38f01086 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Sun, 5 Nov 2023 16:43:41 +0000 Subject: [PATCH 09/30] fix style --- optimum/exporters/neuron/convert.py | 7 +++--- optimum/exporters/neuron/model_configs.py | 7 ++++++ optimum/neuron/modeling_diffusion.py | 2 +- optimum/neuron/modeling_seq2seq.py | 27 ++++++++++++++--------- optimum/neuron/utils/argument_utils.py | 8 ++++--- 5 files changed, 33 insertions(+), 18 deletions(-) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index da4667fb0..6d2c20071 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -328,7 +328,7 @@ def export_models( if is_diffusers_available() and isinstance(model_config, FrozenDict): model_config = OrderedDict(model_config) model_config = DiffusersPretrainedConfig.from_dict(model_config) - + model_config = store_compilation_config( config=model_config, input_shapes=sub_neuron_config.input_shapes, @@ -343,8 +343,6 @@ def export_models( ) if isinstance(model_config, PretrainedConfig): model_config = DiffusersPretrainedConfig.from_dict(model_config.__dict__) - import pdb - pdb.set_trace() model_config.save_pretrained(output_path.parent) except Exception as e: failed_models.append((i, model_name)) @@ -453,6 +451,9 @@ def export_neuronx( # diffusers specific compiler_args = add_stable_diffusion_compiler_args(config, compiler_args) + import pdb + + pdb.set_trace() neuron_model = neuronx.trace( checked_model, dummy_inputs_tuple, diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index a92681b43..4ea1beff6 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -394,6 +394,13 @@ def patch_model_for_export(self, model, device="xla", **kwargs): custom_wrapper_kwargs={"num_beams": num_beams, "device": device}, ) + # def generate_dummy_inputs(self, **kwargs): + # batch_size = kwargs.pop("batch_size") * kwargs.get("num_beams") + # dummy_inputs = super().generate_dummy_inputs(batch_size=batch_size, **kwargs) + + # return dummy_inputs + + @register_in_tasks_manager("opt", "text-generation") class OPTNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "opt.model.OPTForSampling" diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 
d7ee37ab1..569191387 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -299,7 +299,7 @@ def _save_pretrained( "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." ) return - + save_directory = Path(save_directory) if not self.model_and_config_save_paths.get(DIFFUSION_MODEL_VAE_ENCODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DIFFUSION_MODEL_VAE_ENCODER_NAME) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 9bb3df27c..89e9c86c5 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +import logging import os import shutil -import logging from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -88,7 +88,7 @@ def __init__( self._attributes_init(model_save_dir, preprocessors, **kwargs) self.encoder_file_name = encoder_file_name self.decoder_file_name = decoder_file_name - + if generation_config is None: generation_config = GenerationConfig.from_model_config(self.configs[DECODER_NAME]) self.generation_config = generation_config @@ -113,19 +113,19 @@ def _save_pretrained( "`model_save_paths` is None which means that no path of Neuron model is defined. Nothing will be saved." ) return - + save_directory = Path(save_directory) if not self.model_and_config_save_paths.get(ENCODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(ENCODER_NAME) if not self.model_and_config_save_paths.get(DECODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DECODER_NAME) - + dst_paths = { ENCODER_NAME: save_directory / ENCODER_NAME / encoder_file_name, DECODER_NAME: save_directory / DECODER_NAME / decoder_file_name, } - + model_src_to_dst_path = { self.model_and_config_save_paths[model_name][0]: dst_paths[model_name] for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) @@ -135,7 +135,7 @@ def _save_pretrained( self.model_and_config_save_paths[model_name][1]: dst_paths[model_name].parent / self.config_name for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) } - + src_paths = list(model_src_to_dst_path.keys()) + list(config_src_to_dst_path.keys()) dst_paths = list(model_src_to_dst_path.values()) + list(config_src_to_dst_path.values()) @@ -143,10 +143,10 @@ def _save_pretrained( dst_path.parent.mkdir(parents=True, exist_ok=True) if src_path.is_file(): shutil.copyfile(src_path, dst_path) - + src_paths = [Path(path) for path in self.onnx_paths] dst_paths = [save_directory / path.name for path in src_paths] - + if self.tokenizer is not None: self.tokenizer.save_pretrained(save_directory) @@ -205,16 +205,21 @@ def _from_pretrained( configs[name] = model_config neuron_configs[name] = cls._neuron_config_init(model_config) + # TODO: Debug num_beams unmatched issue + import pdb + + pdb.set_trace() encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) - + # TODO: Debug num_beams unmatched issue import pdb + pdb.set_trace() - + if model_save_dir is None: model_save_dir = new_model_save_dir - + generation_config = None try: generation_config = GenerationConfig.from_pretrained( diff --git a/optimum/neuron/utils/argument_utils.py 
b/optimum/neuron/utils/argument_utils.py index eb24d169f..b7e9b4ab0 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -172,11 +172,13 @@ def store_compilation_config( config_args["input_names"] = input_names config_args["output_names"] = output_names - + original_model_type = getattr(config, "model_type", None) neuron_model_type = str(model_type).replace("_", "-") if original_model_type is None: - update_func("model_type", neuron_model_type) # Add model_type to the config if it doesn't exist before, eg. submodel of Stable Diffusion. + update_func( + "model_type", neuron_model_type + ) # Add model_type to the config if it doesn't exist before, eg. submodel of Stable Diffusion. elif neuron_model_type != original_model_type: config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. @@ -186,7 +188,7 @@ def store_compilation_config( import diffusers update_func("_diffusers_version", diffusers.__version__) - + update_func("task", task) return config From 3efdbc860f2621f57b8a5a5bf7d6e6d9af9a3f6e Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Sun, 5 Nov 2023 22:25:40 +0000 Subject: [PATCH 10/30] finish base modeling funcs --- optimum/exporters/neuron/convert.py | 3 -- optimum/neuron/modeling_base.py | 3 -- optimum/neuron/modeling_seq2seq.py | 64 ++++++++++++----------------- 3 files changed, 27 insertions(+), 43 deletions(-) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 6d2c20071..c4072cc2a 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -451,9 +451,6 @@ def export_neuronx( # diffusers specific compiler_args = add_stable_diffusion_compiler_args(config, compiler_args) - import pdb - - pdb.set_trace() neuron_model = neuronx.trace( checked_model, dummy_inputs_tuple, diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index fa21a16c1..05790c084 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -373,9 +373,6 @@ def _attributes_init( self.preprocessors = preprocessors if preprocessors is not None else [] - self.input_names = getattr(self.config, "input_names", []) - self.output_names = getattr(self.config, "output_names", []) - # Registers the NeuronModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating # a pipeline https://github.com/huggingface/transformers/blob/3d3204c025b6b5de013e07dd364208e28b4d9589/src/transformers/pipelines/base.py#L940 AutoConfig.register(self.model_type, AutoConfig) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 89e9c86c5..5c614073f 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -19,7 +19,7 @@ from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch from huggingface_hub import snapshot_download @@ -46,7 +46,7 @@ from transformers import PretrainedConfig if is_neuronx_available(): - pass + import torch_neuronx logger = logging.getLogger(__name__) @@ -66,8 +66,13 @@ def __init__( preprocessors: Optional[List] = None, neuron_configs: Optional[Dict[str, "NeuronConfig"]] = None, generation_config: Optional[GenerationConfig] = None, + model_and_config_save_paths: Optional[Dict[str, Tuple[str, Path]]] = None, **kwargs, ): + self.configs = 
configs + self.neuron_configs = neuron_configs + self._attributes_init(model_save_dir, preprocessors, **kwargs) + self.model_and_config_save_paths = model_and_config_save_paths if model_and_config_save_paths else None self.encoder = NeuronEncoder( encoder, self, @@ -80,12 +85,9 @@ def __init__( self.configs[DECODER_NAME], self.neuron_configs[DECODER_NAME], ) - self.configs = configs - self.neuron_configs = neuron_configs self.dynamic_batch_size = all( neuron_config._config.neuron["dynamic_batch_size"] for neuron_config in self.neuron_configs.values() ) - self._attributes_init(model_save_dir, preprocessors, **kwargs) self.encoder_file_name = encoder_file_name self.decoder_file_name = decoder_file_name @@ -121,35 +123,20 @@ def _save_pretrained( if not self.model_and_config_save_paths.get(DECODER_NAME)[0].is_file(): self.model_and_config_save_paths.pop(DECODER_NAME) - dst_paths = { - ENCODER_NAME: save_directory / ENCODER_NAME / encoder_file_name, - DECODER_NAME: save_directory / DECODER_NAME / decoder_file_name, - } - - model_src_to_dst_path = { - self.model_and_config_save_paths[model_name][0]: dst_paths[model_name] - for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) - } - # save - config_src_to_dst_path = { - self.model_and_config_save_paths[model_name][1]: dst_paths[model_name].parent / self.config_name - for model_name in set(self.model_and_config_save_paths.keys()).intersection(dst_paths.keys()) - } - - src_paths = list(model_src_to_dst_path.keys()) + list(config_src_to_dst_path.keys()) - dst_paths = list(model_src_to_dst_path.values()) + list(config_src_to_dst_path.values()) + dst_paths = [ + save_directory / ENCODER_NAME / encoder_file_name, + save_directory / DECODER_NAME / decoder_file_name, + ] + src_paths = [ + Path(self.model_and_config_save_paths[model_name][0]) + for model_name in set(self.model_and_config_save_paths.keys()).intersection([ENCODER_NAME, DECODER_NAME]) + ] for src_path, dst_path in zip(src_paths, dst_paths): dst_path.parent.mkdir(parents=True, exist_ok=True) if src_path.is_file(): shutil.copyfile(src_path, dst_path) - src_paths = [Path(path) for path in self.onnx_paths] - dst_paths = [save_directory / path.name for path in src_paths] - - if self.tokenizer is not None: - self.tokenizer.save_pretrained(save_directory) - self.generation_config.save_pretrained(save_directory) @classmethod @@ -205,17 +192,9 @@ def _from_pretrained( configs[name] = model_config neuron_configs[name] = cls._neuron_config_init(model_config) - # TODO: Debug num_beams unmatched issue - import pdb - - pdb.set_trace() encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) - - # TODO: Debug num_beams unmatched issue - import pdb - - pdb.set_trace() + torch_neuronx.move_trace_to_device(decoder, 0) if model_save_dir is None: model_save_dir = new_model_save_dir @@ -244,6 +223,7 @@ def _from_pretrained( preprocessors=preprocessors, neuron_configs=neuron_configs, generation_config=generation_config, + model_and_config_save_paths=model_and_config_save_paths, ) @classmethod @@ -266,6 +246,11 @@ def _from_transformers( dynamic_batch_size: bool = False, **kwargs_shapes, ) -> "NeuronModelForConditionalGeneration": + if dynamic_batch_size is True: + logger.warning( + "Sequence-to-sequence models don't support dynamic batch size yet, `dynamic_batch_size` will be set to False." 
+ ) + if task is None: task = TasksManager.infer_task_from_model(cls.auto_model_class) @@ -304,6 +289,11 @@ def _from_transformers( model_save_dir=save_dir, ) + def _save_config(self, save_directory): + save_directory = Path(save_directory) + self.configs[ENCODER_NAME].save_pretrained(save_directory / ENCODER_NAME) + self.configs[DECODER_NAME].save_pretrained(save_directory / DECODER_NAME) + class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM From cdc885eba6ac0b6ce6340dc4209420201d2ff39f Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 6 Nov 2023 09:58:48 +0000 Subject: [PATCH 11/30] quick test inference --- optimum/neuron/modeling_seq2seq.py | 450 ++++++++++++++++++++++++++++- 1 file changed, 446 insertions(+), 4 deletions(-) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 5c614073f..2332b3a3b 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -19,11 +19,25 @@ from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch from huggingface_hub import snapshot_download -from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig +from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig, PreTrainedTokenizerBase +from transformers.generation.beam_search import BeamScorer +from transformers.generation.logits_process import ( + LogitsProcessorList, +) +from transformers.generation.stopping_criteria import ( + MaxLengthCriteria, + MaxTimeCriteria, + StoppingCriteriaList, +) +from transformers.generation.utils import ( + BeamSearchOutput, + GreedySearchOutput, +) +from transformers.modeling_outputs import ModelOutput, Seq2SeqLMOutput from ..exporters.neuron import ( NeuronConfig, @@ -291,14 +305,442 @@ def _from_transformers( def _save_config(self, save_directory): save_directory = Path(save_directory) - self.configs[ENCODER_NAME].save_pretrained(save_directory / ENCODER_NAME) - self.configs[DECODER_NAME].save_pretrained(save_directory / DECODER_NAME) + config = self.configs[ENCODER_NAME].copy() + encoder_neuron_config = self.configs[ENCODER_NAME].neuron + decoder_neuron_config = self.configs[DECODER_NAME].neuron + # TODO: Combine encoder decoder config and save in root + combined_config_args = {} + config.__setattr__("neuron", combined_config_args) + config.save_pretrained(save_directory / ENCODER_NAME) class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" + def _prepare_encoder_decoder_kwargs_for_generation( + self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None + ) -> Dict[str, Any]: + encoder = self.get_encoder() + model_kwargs["encoder_outputs"]: ModelOutput = encoder(inputs_tensor, model_kwargs["attention_mask"]) + return model_kwargs + + def _update_model_kwargs_for_xla_generation( + self, + model_kwargs: Dict[str, Any], + batch_size: int, + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + max_length: Optional[int] = None, + seq_length: Optional[int] = None, + use_cache: bool = True, + ) -> Dict[str, Any]: + def _update_attention(model_kwargs, is_encoder_decoder): + """Updates the appropriate attention mask -- encoder-decoder 
models use `decoder_attention_mask`""" + + attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" + attention_mask = model_kwargs.pop(attention_mask_name) + attention_mask_update_slice = torch.ones( + (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device + ) + attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) + mask = {attention_mask_name: attention_mask} + return mask + + mask = _update_attention(model_kwargs, is_encoder_decoder) + # sets the updated variables (mask and past_key_values) + model_kwargs.update(mask) + + # Set a mock cache tensor + model_kwargs["past_key_values"] = torch.tensor([]) + + return model_kwargs + + def _reorder_cache(self, past_key_values, beam_idx): + """ + This is needed for beam search and not greedy sampling + We reorder the cache within the trace so we can skip it in modelling_t5.py. So we override the _reorder_cache + """ + self.beam_idx = beam_idx + return past_key_values + + def generate( + self, + tokenizer: "PreTrainedTokenizerBase", + prompt: str, + max_length: int, + num_beams: int, + num_return_sequences: int, + device: str, + ): + batch_encoding = tokenizer( + prompt, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt" + ) + + past_key_values = self.encoder(batch_encoding["input_ids"], batch_encoding["attention_mask"]) + + decoder_attention_mask = torch.cat( + [torch.zeros((1, max_length - 1), dtype=torch.int32), torch.ones((1, 1), dtype=torch.int32)], axis=1 + ) + + # copy the new cache state to the decoder + if device == "xla": + for state, tensor in zip(self.decoder.parameters(), past_key_values): + state.copy_(tensor) + else: + # First half of the cache is self attention and the rest is cross attention + self.decoder.past_key_values_sa = past_key_values[: len(past_key_values) // 2] + self.decoder.past_key_values_ca = past_key_values[len(past_key_values) // 2 :] + + output = super().generate( + **batch_encoding, + max_length=max_length, + num_beams=num_beams, + num_return_sequences=num_return_sequences, + do_sample=False, + use_cache=True, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs={"last_hidden_state": torch.ones((1, 128, 1))}, + ) # Pass fake encoder_outputs so the transfomers code will not invoke the encoder + return output + + def forward( + self, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + beam_scores=None, + **kwargs, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + hidden_states = encoder_outputs["last_hidden_state"] + + if not hasattr(self, "beam_idx"): + # Infering the number of beams from the attention mask + num_beams = attention_mask.shape[0] + self.beam_idx = torch.arange(0, num_beams, dtype=torch.int64) + + decoder_outputs = self.decoder( + decoder_input_ids, decoder_attention_mask, hidden_states, attention_mask, self.beam_idx, beam_scores + ) + + # lm_logits = decoder_outputs[0] + next_token_scores = decoder_outputs[0] + next_tokens = decoder_outputs[1] + next_indices = decoder_outputs[2] + + return next_token_scores, next_tokens, next_indices + + def beam_search( + self, + input_ids: torch.LongTensor, + beam_scorer: "BeamScorer", + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + 
pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = False, + seq_length: Optional[int] = None, + **model_kwargs, + ) -> Union[BeamSearchOutput, torch.LongTensor]: + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + # Overwrite cur_len + cur_len = seq_length + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + beam_indices = ( + tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + ) + + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores_device = "cpu" + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + while True: + # prepare model inputs + # From max_length-sized input_ids, select first + # cur_len - 1 values. 
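+                # `input_ids` stays padded to `max_length`, so instead of a dynamic slice we
+                # build (row, column) index pairs and gather the token at column `cur_len - 1`
+                # for every beam; this avoids data-dependent shapes in the traced graph.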
+ update_indices = torch.stack( + [torch.arange(input_ids.size(0)), torch.tensor(cur_len - 1).repeat(input_ids.size(0))], dim=-1 + ) + input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] + model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) + + next_token_scores, next_tokens, next_indices = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + beam_scores=beam_scores, + ) + + # stateless + beam_outputs = beam_scorer.process( + input_ids.to("cpu")[:, :cur_len], + next_token_scores.to("cpu"), + next_tokens.to("cpu"), + next_indices.to("cpu"), + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + ) + + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + update_indices = torch.stack( + [torch.arange(batch_beam_size), torch.tensor(cur_len - 1).repeat(batch_beam_size)], dim=-1 + ) + update_indices_2 = torch.stack( + [torch.arange(batch_beam_size), torch.tensor(cur_len).repeat(batch_beam_size)], dim=-1 + ) + # First select beam_indices + device = input_ids.device + beam_idx_device = beam_idx.to(device=input_ids.device) + input_ids[:, :] = input_ids[beam_idx_device.long(), :] + + # Then append new tokens + input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = ( + beam_next_tokens.unsqueeze(-1).to(device).to(torch.long) + ) + input_ids = input_ids * 1 # Hack to materialize tensor + + # update generated ids, model inputs, and length for next step + model_kwargs = self._update_model_kwargs_for_xla_generation( + model_kwargs, + batch_size=batch_beam_size, + is_encoder_decoder=self.config.is_encoder_decoder, + max_length=stopping_criteria.max_length, + seq_length=cur_len, + use_cache=model_kwargs["use_cache"], + ) + if model_kwargs["past_key_values"] is not None: + model_kwargs["past_key_values"] = self._reorder_cache( + model_kwargs["past_key_values"], beam_idx.to(torch.int64) + ) + + if return_dict_in_generate and output_scores: + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) + + # increase cur_len + cur_len = cur_len + 1 + + # stop when each sentence is finished, or if we exceed the maximum length + stop_criterion_1 = beam_scorer.is_done + if isinstance(stopping_criteria, list): + if len(stopping_criteria) == 1: + stopping_criteria = stopping_criteria[0] + + # Cases that can be handled in XLA without requiring + # non-padded input_ids + if isinstance(stopping_criteria, MaxLengthCriteria): + stop_criterion_2 = cur_len >= stopping_criteria.max_length + elif isinstance(stopping_criteria, MaxTimeCriteria): + stop_criterion_2 = stopping_criteria(input_ids, scores) + else: + # Other cases will be handled on CPU + batch_size, _ = input_ids.shape + input_ids_cpu = input_ids.to("cpu") + mask = torch.cat( + [torch.ones(batch_size, cur_len), torch.zeros(batch_size, input_ids.shape[1] - cur_len)], dim=1 + ).bool() + input_ids_cpu = torch.masked_select(input_ids_cpu, mask).reshape((batch_size, cur_len)) + scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores + stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) + + if stop_criterion_1 or stop_criterion_2: + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids.to("cpu"), + beam_scores.to("cpu"), + next_tokens.to("cpu"), + next_indices.to("cpu"), + 
pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + ) + + for k, v in sequence_outputs.items(): + if type(v) == torch.Tensor: + sequence_outputs[k] = sequence_outputs[k].to(input_ids.device) + + return sequence_outputs["sequences"] + + def greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional["LogitsProcessorList"] = None, + stopping_criteria: Optional["StoppingCriteriaList"] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + seq_length: Optional[int] = int, + streamer: Optional["BaseStreamer"] = None, + **model_kwargs, + ) -> Union[GreedySearchOutput, torch.LongTensor]: + """ + Overriding greedy sampling to use next tokens returned from neuron device instead of logits. + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + use_cache = model_kwargs["use_cache"] if "use_cache" in model_kwargs else False + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # keep track of which sequences are already finished + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + + this_peer_finished = False # used by synced_gpus only + while True: + # prepare model inputs + # From max_length-sized input_ids, select first + # seq_length - 1 values. 
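+                # On the first step there is no cache yet, so the whole prefix up to
+                # `seq_length` is selected; on later steps only the token at column
+                # `seq_length - 1` is gathered with static index pairs, keeping the padded
+                # `input_ids` at a fixed shape.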
+ + if model_kwargs.get("past_key_values") is None: + input_ids_ = input_ids[:, :seq_length] + else: + update_indices = torch.stack( + [torch.arange(input_ids.size(0)), torch.tensor(seq_length - 1).repeat(input_ids.size(0))], + dim=-1, + ) + input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] + + model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) + + # forward pass to get next token + output = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + next_tokens = output[0] + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + + batch_size, _ = input_ids.shape + update_indices = torch.stack( + [torch.arange(batch_size), torch.tensor(seq_length).repeat(batch_size)], dim=-1 + ) + input_ids[update_indices[:, 0], update_indices[:, 1]] = next_tokens[:] + model_kwargs = self._update_model_kwargs_for_xla_generation( + model_kwargs, + batch_size=batch_size, + is_encoder_decoder=self.config.is_encoder_decoder, + max_length=stopping_criteria.max_length, + seq_length=seq_length, + use_cache=use_cache, + ) + + seq_length += 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + + # stop when each sentence is finished, or if we exceed the maximum length + stop_criterion_1 = unfinished_sequences.max() == 0 + + if isinstance(stopping_criteria, list): + if len(stopping_criteria) == 1: + stopping_criteria = stopping_criteria[0] + + # Cases that can be handled in XLA without requiring + # non-padded input_ids + if isinstance(stopping_criteria, MaxLengthCriteria): + stop_criterion_2 = seq_length >= stopping_criteria.max_length + elif isinstance(stopping_criteria, MaxTimeCriteria): + stop_criterion_2 = stopping_criteria(input_ids, scores) + else: + # Other cases will be handled on CPU + batch_size, _ = input_ids.shape + mask = torch.cat( + [torch.ones(batch_size, seq_length), torch.zeros(batch_size, input_ids.shape[1] - seq_length)], + dim=1, + ).bool() + input_ids_cpu = torch.masked_select(input_ids, mask).reshape((batch_size, seq_length)).to("cpu") + scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores + stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) + + if stop_criterion_1 or stop_criterion_2: + this_peer_finished = True + + if this_peer_finished: + break + + if streamer is not None: + streamer.end() + + return input_ids + class _NeuronSeq2SeqModelPart: """ From 2384e522c8e118ea46a380e40691a2059f0c0ab6 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 7 Nov 2023 23:50:10 +0000 Subject: [PATCH 12/30] fix config loding --- optimum/neuron/modeling_seq2seq.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 2332b3a3b..344d6dad5 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -305,13 +305,27 @@ def _from_transformers( def _save_config(self, 
save_directory): save_directory = Path(save_directory) - config = self.configs[ENCODER_NAME].copy() - encoder_neuron_config = self.configs[ENCODER_NAME].neuron - decoder_neuron_config = self.configs[DECODER_NAME].neuron - # TODO: Combine encoder decoder config and save in root - combined_config_args = {} - config.__setattr__("neuron", combined_config_args) - config.save_pretrained(save_directory / ENCODER_NAME) + self.configs[ENCODER_NAME].save_pretrained(save_directory / ENCODER_NAME) + self.configs[DECODER_NAME].save_pretrained(save_directory / DECODER_NAME) + combined_config = self._combine_encoder_decoder_config( + encoder_config=self.configs[ENCODER_NAME], + decoder_config=self.configs[DECODER_NAME], + ) + combined_config.save_pretrained(save_directory) + + def _combine_encoder_decoder_config(self, encoder_config: "PretrainedConfig", decoder_config: "PretrainedConfig"): + encoder_neuron_config = encoder_config.neuron + decoder_neuron_config = decoder_config.neuron + + encoder_neuron_config["encoder_input_names"] = encoder_neuron_config.pop("input_names") + encoder_neuron_config["encoder_output_names"] = encoder_neuron_config.pop("output_names") + decoder_neuron_config["decoder_input_names"] = decoder_neuron_config.pop("input_names") + decoder_neuron_config["decoder_output_names"] = decoder_neuron_config.pop("output_names") + + neuron_config = encoder_neuron_config.update(decoder_neuron_config) + encoder_config.__setattr__("neuron", neuron_config) + + return encoder_config class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): From ae9df1add5e98573d6f98250f7b600c45939a8bf Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 9 Nov 2023 00:26:55 +0000 Subject: [PATCH 13/30] finish modeling, works --- optimum/exporters/neuron/__main__.py | 6 +- optimum/neuron/generation/utils.py | 2 +- optimum/neuron/modeling_base.py | 9 +- optimum/neuron/modeling_seq2seq.py | 269 +++++++++++++++++---------- 4 files changed, 179 insertions(+), 107 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index c671f9cb0..ecdf76ba5 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -193,11 +193,11 @@ def _get_submodels_and_neuron_configs( is_encoder_decoder = model.config.is_encoder_decoder if is_stable_diffusion: - return _get_submodels_and_neuron_configs_for_stable_diffusion( + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_diffusion( model, input_shapes, task, output, dynamic_batch_size ) elif is_encoder_decoder: - return _get_submodels_and_neuron_configs_for_encoder_decoder( + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_encoder_decoder( model, input_shapes, task, output, dynamic_batch_size, model_name_or_path ) else: @@ -209,7 +209,7 @@ def _get_submodels_and_neuron_configs( output_model_names = {model_name: "model.neuron"} models_and_neuron_configs = {model_name: (model, neuron_config)} maybe_save_preprocessors(model_name_or_path, output) - return models_and_neuron_configs, output_model_names + return models_and_neuron_configs, output_model_names def _get_submodels_and_neuron_configs_for_stable_diffusion( diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index ce6f93e8b..81a5c3fa2 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -967,7 +967,7 @@ def generate( "`streamer` cannot be used with beam search (yet!). 
Make sure that `num_beams` is set to 1." ) - if self.device.type != input_ids.device.type: + if hasattr(self, "device") and self.device.type != input_ids.device.type: warnings.warn( "You are calling .generate() with the `input_ids` being on a device type different" f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 05790c084..0d26adaee 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -480,10 +480,15 @@ def _pad_to_compiled_shape(self, inputs: Dict[str, "torch.Tensor"]): # Pad to batch size: dimension 0 (pad_token_id can't be 0) padding = (0,) * len(padding) - if self.neuron_config.dynamic_batch_size is True and input_tensor.size(0) % target_shapes[0] == 0: + is_encoder_decoder = getattr(self.config, "is_encoder_decoder", False) + if ( + not is_encoder_decoder + and self.neuron_config.dynamic_batch_size is True + and input_tensor.size(0) % target_shapes[0] == 0 + ): inputs[input_name] = input_tensor continue - elif self.neuron_config.dynamic_batch_size is True: + elif not is_encoder_decoder and self.neuron_config.dynamic_batch_size is True: target_shape = (input_tensor.size(0) // target_shapes[0] + 1) * target_shapes[0] to_pad = target_shape - input_tensor.size(0) else: diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 344d6dad5..1ae49e721 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. """NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +import copy import logging import os import shutil from abc import abstractmethod from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch from huggingface_hub import snapshot_download -from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig, PreTrainedTokenizerBase +from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig from transformers.generation.beam_search import BeamScorer from transformers.generation.logits_process import ( LogitsProcessorList, @@ -37,7 +38,7 @@ BeamSearchOutput, GreedySearchOutput, ) -from transformers.modeling_outputs import ModelOutput, Seq2SeqLMOutput +from transformers.modeling_outputs import Seq2SeqLMOutput from ..exporters.neuron import ( NeuronConfig, @@ -57,7 +58,8 @@ if TYPE_CHECKING: - from transformers import PretrainedConfig + from transformers import PretrainedConfig, PreTrainedModel + from transformers.generation.streamers import BaseStreamer if is_neuronx_available(): import torch_neuronx @@ -73,18 +75,23 @@ def __init__( self, encoder: torch.jit._script.ScriptModule, decoder: torch.jit._script.ScriptModule, - configs: Optional[Dict[str, "PretrainedConfig"]] = None, + config: "PretrainedConfig", model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, encoder_file_name: Optional[str] = NEURON_FILE_NAME, decoder_file_name: Optional[str] = NEURON_FILE_NAME, preprocessors: Optional[List] = None, neuron_configs: Optional[Dict[str, "NeuronConfig"]] = None, + configs: Optional[Dict[str, "PretrainedConfig"]] = None, generation_config: Optional[GenerationConfig] = None, model_and_config_save_paths: 
Optional[Dict[str, Tuple[str, Path]]] = None, **kwargs, ): + self.config = config self.configs = configs self.neuron_configs = neuron_configs + self.input_static_shapes = NeuronModelForConditionalGeneration.get_input_static_shapes( + self.neuron_configs[ENCODER_NAME] + ) # only for the encoder self._attributes_init(model_save_dir, preprocessors, **kwargs) self.model_and_config_save_paths = model_and_config_save_paths if model_and_config_save_paths else None self.encoder = NeuronEncoder( @@ -93,7 +100,7 @@ def __init__( self.configs[ENCODER_NAME], self.neuron_configs[ENCODER_NAME], ) - self.decoder = NeuronEncoder( + self.decoder = NeuronDecoder( decoder, self, self.configs[DECODER_NAME], @@ -142,8 +149,8 @@ def _save_pretrained( save_directory / DECODER_NAME / decoder_file_name, ] src_paths = [ - Path(self.model_and_config_save_paths[model_name][0]) - for model_name in set(self.model_and_config_save_paths.keys()).intersection([ENCODER_NAME, DECODER_NAME]) + Path(self.model_and_config_save_paths[ENCODER_NAME][0]), + Path(self.model_and_config_save_paths[DECODER_NAME][0]), ] for src_path, dst_path in zip(src_paths, dst_paths): @@ -206,8 +213,13 @@ def _from_pretrained( configs[name] = model_config neuron_configs[name] = cls._neuron_config_init(model_config) - encoder = cls.load_model(model_and_config_save_paths["encoder"][0]) - decoder = cls.load_model(model_and_config_save_paths["decoder"][0]) + # Initialize Neuron Runtime before loading models + runtime = torch.classes.neuron.Runtime() + runtime.initialize() + runtime.set_default_neuron_cores(0, 1) + + encoder = cls.load_model(model_and_config_save_paths[ENCODER_NAME][0]) + decoder = cls.load_model(model_and_config_save_paths[DECODER_NAME][0]) torch_neuronx.move_trace_to_device(decoder, 0) if model_save_dir is None: @@ -230,12 +242,13 @@ def _from_pretrained( return cls( encoder=encoder, decoder=decoder, - configs=configs, + config=config, model_save_dir=model_save_dir, encoder_file_name=encoder_file_name, decoder_file_name=decoder_file_name, preprocessors=preprocessors, neuron_configs=neuron_configs, + configs=configs, generation_config=generation_config, model_and_config_save_paths=model_and_config_save_paths, ) @@ -316,108 +329,30 @@ def _save_config(self, save_directory): def _combine_encoder_decoder_config(self, encoder_config: "PretrainedConfig", decoder_config: "PretrainedConfig"): encoder_neuron_config = encoder_config.neuron decoder_neuron_config = decoder_config.neuron + combined_config = copy.deepcopy(encoder_config) encoder_neuron_config["encoder_input_names"] = encoder_neuron_config.pop("input_names") encoder_neuron_config["encoder_output_names"] = encoder_neuron_config.pop("output_names") decoder_neuron_config["decoder_input_names"] = decoder_neuron_config.pop("input_names") decoder_neuron_config["decoder_output_names"] = decoder_neuron_config.pop("output_names") - neuron_config = encoder_neuron_config.update(decoder_neuron_config) - encoder_config.__setattr__("neuron", neuron_config) + encoder_neuron_config.update(decoder_neuron_config) + encoder_neuron_config.pop("model_type") + combined_config.__setattr__("neuron", encoder_neuron_config) - return encoder_config + return combined_config + + def can_generate(self): + logger.warning( + "NeuronModelForConditionalGeneration is an abstract class and is not meant to be used for generation. Please use NeuronModelForSeq2SeqLM instead." 
+ ) + return False class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" - def _prepare_encoder_decoder_kwargs_for_generation( - self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None - ) -> Dict[str, Any]: - encoder = self.get_encoder() - model_kwargs["encoder_outputs"]: ModelOutput = encoder(inputs_tensor, model_kwargs["attention_mask"]) - return model_kwargs - - def _update_model_kwargs_for_xla_generation( - self, - model_kwargs: Dict[str, Any], - batch_size: int, - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - max_length: Optional[int] = None, - seq_length: Optional[int] = None, - use_cache: bool = True, - ) -> Dict[str, Any]: - def _update_attention(model_kwargs, is_encoder_decoder): - """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" - - attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" - attention_mask = model_kwargs.pop(attention_mask_name) - attention_mask_update_slice = torch.ones( - (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device - ) - attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) - mask = {attention_mask_name: attention_mask} - return mask - - mask = _update_attention(model_kwargs, is_encoder_decoder) - # sets the updated variables (mask and past_key_values) - model_kwargs.update(mask) - - # Set a mock cache tensor - model_kwargs["past_key_values"] = torch.tensor([]) - - return model_kwargs - - def _reorder_cache(self, past_key_values, beam_idx): - """ - This is needed for beam search and not greedy sampling - We reorder the cache within the trace so we can skip it in modelling_t5.py. 
So we override the _reorder_cache - """ - self.beam_idx = beam_idx - return past_key_values - - def generate( - self, - tokenizer: "PreTrainedTokenizerBase", - prompt: str, - max_length: int, - num_beams: int, - num_return_sequences: int, - device: str, - ): - batch_encoding = tokenizer( - prompt, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt" - ) - - past_key_values = self.encoder(batch_encoding["input_ids"], batch_encoding["attention_mask"]) - - decoder_attention_mask = torch.cat( - [torch.zeros((1, max_length - 1), dtype=torch.int32), torch.ones((1, 1), dtype=torch.int32)], axis=1 - ) - - # copy the new cache state to the decoder - if device == "xla": - for state, tensor in zip(self.decoder.parameters(), past_key_values): - state.copy_(tensor) - else: - # First half of the cache is self attention and the rest is cross attention - self.decoder.past_key_values_sa = past_key_values[: len(past_key_values) // 2] - self.decoder.past_key_values_ca = past_key_values[len(past_key_values) // 2 :] - - output = super().generate( - **batch_encoding, - max_length=max_length, - num_beams=num_beams, - num_return_sequences=num_return_sequences, - do_sample=False, - use_cache=True, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs={"last_hidden_state": torch.ones((1, 128, 1))}, - ) # Pass fake encoder_outputs so the transfomers code will not invoke the encoder - return output - def forward( self, attention_mask: Optional[torch.FloatTensor] = None, @@ -438,13 +373,59 @@ def forward( decoder_input_ids, decoder_attention_mask, hidden_states, attention_mask, self.beam_idx, beam_scores ) - # lm_logits = decoder_outputs[0] next_token_scores = decoder_outputs[0] next_tokens = decoder_outputs[1] next_indices = decoder_outputs[2] return next_token_scores, next_tokens, next_indices + def generate( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + num_return_sequences: Optional[int] = None, + device: str = "xla", + **kwargs, + ): + max_length = self.neuron_configs[ENCODER_NAME].sequence_length + num_beams = self.neuron_configs[ENCODER_NAME].num_beams + batch_size = self.neuron_configs[ENCODER_NAME].batch_size + + inputs = {"input_ids": input_ids} + if attention_mask is not None: + inputs["attention_mask"] = attention_mask + inputs = self._pad_to_compiled_shape(inputs) + + past_key_values = self.encoder(**inputs) + + decoder_attention_mask = torch.cat( + [torch.zeros((batch_size, max_length - 1), dtype=torch.int64), torch.ones((1, 1), dtype=torch.int64)], + axis=1, + ) + + # copy the new cache state to the decoder + for state, tensor in zip(self.decoder.model.parameters(), past_key_values): + state.copy_(tensor) + + output = super().generate( + **inputs, + max_length=max_length, + num_beams=num_beams, + num_return_sequences=num_return_sequences, + do_sample=False, + use_cache=True, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, + ) # Pass fake encoder_outputs so the transfomers code will not invoke the encoder + return output + def beam_search( self, 
input_ids: torch.LongTensor, @@ -642,6 +623,10 @@ def greedy_search( logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() use_cache = model_kwargs["use_cache"] if "use_cache" in model_kwargs else False stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + from transformers.generation.stopping_criteria import validate_stopping_criteria + + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id if isinstance(eos_token_id, int): @@ -755,6 +740,86 @@ def greedy_search( return input_ids + def _reorder_cache(self, past_key_values, beam_idx): + """ + The cache was reordered during the tracing of the decoder so we can skip it here. This is needed for beam search and not greedy sampling. + """ + self.beam_idx = beam_idx + return past_key_values + + def get_encoder(self) -> "NeuronEncoder": + return self.encoder + + def _update_model_kwargs_for_xla_generation( + self, + model_kwargs: Dict[str, Any], + batch_size: int, + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + max_length: Optional[int] = None, + seq_length: Optional[int] = None, + use_cache: bool = True, + ) -> Dict[str, Any]: + def _update_attention(model_kwargs, is_encoder_decoder): + """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" + + attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" + attention_mask = model_kwargs.pop(attention_mask_name) + attention_mask_update_slice = torch.ones( + (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device + ) + attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) + mask = {attention_mask_name: attention_mask} + return mask + + mask = _update_attention(model_kwargs, is_encoder_decoder) + # sets the updated variables (mask and past_key_values) + model_kwargs.update(mask) + + # Set a mock cache tensor + model_kwargs["past_key_values"] = torch.tensor([]) + + return model_kwargs + + # Override to cut the input_ids to just last token + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + decoder_attention_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids as past is cached + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "decoder_attention_mask": decoder_attention_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + def _validate_static_shape(self, input_shapes: List[int], target_shapes: List[int]) -> bool: + """ + Checks if a input needs to be padded. 
+ """ + return input_shapes == target_shapes + + def can_generate(self): + """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" + return True + class _NeuronSeq2SeqModelPart: """ @@ -790,6 +855,8 @@ class NeuronEncoder(_NeuronSeq2SeqModelPart): Encoder part of the encoder-decoder model for Neuron inference. (Actually it's a monolith of encoder + decoder without past_key_values to workaround the control flow in the decoder). """ + main_input_name = "input_ids" + def __init__( self, model: torch.jit._script.ScriptModule, From a3784cf9e5805962e05f8851b3689d52ceb7f262 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 9 Nov 2023 14:47:47 +0000 Subject: [PATCH 14/30] add part of tests --- optimum/exporters/neuron/__main__.py | 6 +- optimum/neuron/modeling_seq2seq.py | 26 +++---- tests/cli/test_export_cli.py | 29 ++++++++ tests/exporters/exporters_utils.py | 4 ++ tests/exporters/test_export.py | 102 +++++++++++++++------------ tests/generation/conftest.py | 45 ++++++++++-- tests/generation/test_export.py | 75 +++++++++++++------- tests/generation/test_generate.py | 12 ++-- tests/generation/test_hub.py | 78 +++++++++++++------- 9 files changed, 255 insertions(+), 122 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index ecdf76ba5..324c678ac 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Union from requests.exceptions import ConnectionError as RequestsConnectionError -from transformers import AutoConfig +from transformers import AutoConfig, PretrainedConfig from ...neuron.utils import ( DECODER_NAME, @@ -190,7 +190,9 @@ def _get_submodels_and_neuron_configs( model_name_or_path: Optional[Union[str, Path]] = None, ): is_stable_diffusion = "stable-diffusion" in task - is_encoder_decoder = model.config.is_encoder_decoder + is_encoder_decoder = ( + getattr(model.config, "is_encoder_decoder", False) if isinstance(model.config, PretrainedConfig) else False + ) if is_stable_diffusion: models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_diffusion( diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 1ae49e721..6a10ea7a5 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -387,11 +387,9 @@ def generate( logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - synced_gpus: Optional[bool] = None, assistant_model: Optional["PreTrainedModel"] = None, streamer: Optional["BaseStreamer"] = None, num_return_sequences: Optional[int] = None, - device: str = "xla", **kwargs, ): max_length = self.neuron_configs[ENCODER_NAME].sequence_length @@ -416,14 +414,21 @@ def generate( output = super().generate( **inputs, + generation_config=generation_config, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + assistant_model=assistant_model, + streamer=streamer, + num_return_sequences=num_return_sequences, max_length=max_length, num_beams=num_beams, - num_return_sequences=num_return_sequences, - do_sample=False, - use_cache=True, + do_sample=kwargs.pop("do_sample", False), + use_cache=kwargs.pop("use_cache", True), decoder_attention_mask=decoder_attention_mask, 
+ # Pass fake encoder_outputs so the transfomers code will not invoke the encoder encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, - ) # Pass fake encoder_outputs so the transfomers code will not invoke the encoder + ) return output def beam_search( @@ -432,7 +437,6 @@ def beam_search( beam_scorer: "BeamScorer", logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[Union[int, List[int]]] = None, output_attentions: Optional[bool] = None, @@ -443,6 +447,9 @@ def beam_search( seq_length: Optional[int] = None, **model_kwargs, ) -> Union[BeamSearchOutput, torch.LongTensor]: + """ + Overriding beam search to use next_token_scores returned from neuron device instead of logits. + """ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id @@ -580,8 +587,6 @@ def beam_search( if stop_criterion_1 or stop_criterion_2: if not synced_gpus: break - else: - this_peer_finished = True sequence_outputs = beam_scorer.finalize( input_ids.to("cpu"), @@ -642,9 +647,6 @@ def greedy_search( # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None # keep track of which sequences are already finished unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) diff --git a/tests/cli/test_export_cli.py b/tests/cli/test_export_cli.py index b0ed83121..d6c5d38f1 100644 --- a/tests/cli/test_export_cli.py +++ b/tests/cli/test_export_cli.py @@ -213,3 +213,32 @@ def test_stable_diffusion_xl(self): shell=False, check=True, ) + + @requires_neuronx + def test_t5(self): + model_id = "hf-internal-testing/tiny-random-t5" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + [ + "optimum-cli", + "export", + "neuron", + "--model", + model_id, + "--task", + "text2text-generation", + "--batch_size", + "1", + "--sequence_length", + "18", + "--num_beams", + "4", + "--auto_cast", + "matmul", + "--auto_cast_type", + "bf16", + tempdir, + ], + shell=False, + check=True, + ) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 55ed9fed5..7ad87ae36 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -32,6 +32,10 @@ "xlm-roberta": "hf-internal-testing/tiny-xlm-roberta", } +ENCODER_DECODER_MODELS_TINY = { + "t5": "hf-internal-testing/tiny-random-t5", +} + STABLE_DIFFUSION_MODELS_TINY = { "stable-diffusion": ["hf-internal-testing/tiny-stable-diffusion-torch"], "stable-diffusion-xl": ["echarlaix/tiny-random-stable-diffusion-xl"], diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index de1fa0dc0..2bd1981ee 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -14,7 +14,6 @@ # limitations under the License. 
import copy -import os import random import unittest from pathlib import Path @@ -22,7 +21,7 @@ from typing import Dict, Optional from parameterized import parameterized -from transformers import AutoConfig, set_seed +from transformers import AutoConfig, AutoModelForSeq2SeqLM, set_seed from transformers.testing_utils import require_vision from optimum.exporters.neuron import ( @@ -30,25 +29,17 @@ build_stable_diffusion_components_mandatory_shapes, export, export_models, - get_stable_diffusion_models_for_export, validate_model_outputs, validate_models_outputs, ) +from optimum.exporters.neuron.__main__ import _get_submodels_and_neuron_configs from optimum.exporters.neuron.model_configs import * # noqa: F403 from optimum.exporters.tasks import TasksManager -from optimum.neuron.utils import ( - DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, - DIFFUSION_MODEL_TEXT_ENCODER_NAME, - DIFFUSION_MODEL_UNET_NAME, - DIFFUSION_MODEL_VAE_DECODER_NAME, - DIFFUSION_MODEL_VAE_ENCODER_NAME, - NEURON_FILE_NAME, -) from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available, logging from optimum.utils.testing_utils import require_diffusers -from .exporters_utils import EXPORT_MODELS_TINY, STABLE_DIFFUSION_MODELS_TINY +from .exporters_utils import ENCODER_DECODER_MODELS_TINY, EXPORT_MODELS_TINY, STABLE_DIFFUSION_MODELS_TINY if is_diffusers_available(): @@ -164,29 +155,23 @@ class NeuronStableDiffusionExportTestCase(unittest.TestCase): """ @parameterized.expand(STABLE_DIFFUSION_MODELS_TINY["stable-diffusion"]) - def test_export_for_stable_diffusion_models(self, model_name): + def test_export_for_stable_diffusion_models(self, model_id): set_seed(SEED) # prepare neuron config / models - pipe = StableDiffusionPipeline.from_pretrained(model_name) + model = StableDiffusionPipeline.from_pretrained(model_id) input_shapes = build_stable_diffusion_components_mandatory_shapes( - **{"batch_size": 1, "height": 64, "width": 64} - ) - models_and_neuron_configs = get_stable_diffusion_models_for_export( - pipeline=pipe, - task="stable-diffusion", - dynamic_batch_size=False, - **input_shapes, + **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 4} ) - output_model_names = { - DIFFUSION_MODEL_TEXT_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), - } - with TemporaryDirectory() as tmpdirname: + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs( + model=model, + input_shapes=input_shapes, + task="stable-diffusion", + output=Path(tmpdirname), + model_name_or_path=model_id, + ) _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, output_dir=Path(tmpdirname), @@ -200,30 +185,59 @@ def test_export_for_stable_diffusion_models(self, model_name): ) @parameterized.expand(STABLE_DIFFUSION_MODELS_TINY["stable-diffusion-xl"]) - def test_export_for_stable_diffusion_xl_models(self, model_name): + def test_export_for_stable_diffusion_xl_models(self, model_id): set_seed(SEED) # prepare neuron config / models - pipe = StableDiffusionXLPipeline.from_pretrained(model_name) + model = StableDiffusionXLPipeline.from_pretrained(model_id) input_shapes = 
build_stable_diffusion_components_mandatory_shapes( - **{"batch_size": 1, "height": 64, "width": 64} - ) - models_and_neuron_configs = get_stable_diffusion_models_for_export( - pipeline=pipe, - task="stable-diffusion-xl", - dynamic_batch_size=False, - **input_shapes, + **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 4} ) - output_model_names = { - DIFFUSION_MODEL_TEXT_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_TEXT_ENCODER_2_NAME: os.path.join(DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), - DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), - } + with TemporaryDirectory() as tmpdirname: + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs( + model=model, + input_shapes=input_shapes, + task="stable-diffusion-xl", + output=Path(tmpdirname), + model_name_or_path=model_id, + ) + _, neuron_outputs = export_models( + models_and_neuron_configs=models_and_neuron_configs, + output_dir=Path(tmpdirname), + output_file_names=output_model_names, + ) + validate_models_outputs( + models_and_neuron_configs=models_and_neuron_configs, + neuron_named_outputs=neuron_outputs, + output_dir=Path(tmpdirname), + neuron_files_subpaths=output_model_names, + ) + + +@is_inferentia_test +@requires_neuronx +class NeuronEncoderDecoderExportTestCase(unittest.TestCase): + """ + Integration tests ensuring encoder-decoder models are correctly exported. + """ + + @parameterized.expand(ENCODER_DECODER_MODELS_TINY.items()) + def test_export_for_encoder_decoder_models(self, model_name, model_id): + set_seed(SEED) + + # prepare neuron config / models + model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + input_shapes = {"batch_size": 1, "sequence_length": 18, "num_beams": 4} with TemporaryDirectory() as tmpdirname: + models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs( + model=model, + input_shapes=input_shapes, + task="text2text-generation", + output=Path(tmpdirname), + model_name_or_path=model_id, + ) _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, output_dir=Path(tmpdirname), diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py index 3997bc9a6..ccf40d151 100644 --- a/tests/generation/conftest.py +++ b/tests/generation/conftest.py @@ -17,7 +17,7 @@ import pytest from transformers import AutoTokenizer -from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import requires_neuronx from optimum.utils.testing_utils import USER @@ -29,24 +29,32 @@ "llama": "dacorvo/tiny-random-llama", "opt": "hf-internal-testing/tiny-random-OPTForCausalLM", } +SEQ2SEQ_MODEL_NAMES = { + "t5": "hf-internal-testing/tiny-random-t5", +} @pytest.fixture(scope="module", params=[DECODER_MODEL_NAMES[model_arch] for model_arch in DECODER_MODEL_ARCHITECTURES]) -def export_model_id(request): +def export_decoder_id(request): + return request.param + + +@pytest.fixture(scope="module", params=[SEQ2SEQ_MODEL_NAMES[model_arch] for model_arch in SEQ2SEQ_MODEL_NAMES]) +def export_seq2seq_id(request): return request.param @pytest.fixture(scope="module") @requires_neuronx -def neuron_model_path(export_model_id): 
+def neuron_decoder_path(export_decoder_id): model = NeuronModelForCausalLM.from_pretrained( - export_model_id, export=True, batch_size=1, sequence_length=100, num_cores=2 + export_decoder_id, export=True, batch_size=1, sequence_length=100, num_cores=2 ) model_dir = TemporaryDirectory() model_path = model_dir.name model.save_pretrained(model_path) del model - tokenizer = AutoTokenizer.from_pretrained(export_model_id) + tokenizer = AutoTokenizer.from_pretrained(export_decoder_id) tokenizer.save_pretrained(model_path) del tokenizer # Yield instead of returning to keep a reference to the temporary directory. @@ -56,8 +64,31 @@ def neuron_model_path(export_model_id): @pytest.fixture(scope="module") -def neuron_push_id(export_model_id): - model_name = export_model_id.split("/")[-1] +@requires_neuronx +def neuron_seq2seq_path(export_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, export=True, batch_size=1, sequence_length=32, num_beams=4 + ) + model_dir = TemporaryDirectory() + model_path = model_dir.name + model.save_pretrained(model_path) + del model + # Yield instead of returning to keep a reference to the temporary directory. + # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + yield model_path + + +@pytest.fixture(scope="module") +def neuron_push_decoder_id(export_decoder_id): + model_name = export_decoder_id.split("/")[-1] + repo_id = f"{USER}/{model_name}-neuronx" + return repo_id + + +@pytest.fixture(scope="module") +def neuron_push_seq2seq_id(export_seq2seq_id): + model_name = export_seq2seq_id.split("/")[-1] repo_id = f"{USER}/{model_name}-neuronx" return repo_id diff --git a/tests/generation/test_export.py b/tests/generation/test_export.py index e4eaef935..9b88b1515 100644 --- a/tests/generation/test_export.py +++ b/tests/generation/test_export.py @@ -16,34 +16,59 @@ import pytest from generation_utils import check_neuron_model -from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx -@pytest.mark.parametrize( - "batch_size, sequence_length, num_cores, auto_cast_type", - [ - [1, 100, 2, "fp32"], - [1, 100, 2, "fp16"], - [2, 100, 2, "fp16"], - ], -) -@is_inferentia_test -@requires_neuronx -def test_model_export(export_model_id, batch_size, sequence_length, num_cores, auto_cast_type): - model = NeuronModelForCausalLM.from_pretrained( - export_model_id, - export=True, - batch_size=batch_size, - sequence_length=sequence_length, - num_cores=num_cores, - auto_cast_type=auto_cast_type, +class DecoderTests: + @pytest.mark.parametrize( + "batch_size, sequence_length, num_cores, auto_cast_type", + [ + [1, 100, 2, "fp32"], + [1, 100, 2, "fp16"], + [2, 100, 2, "fp16"], + ], ) - check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type) + @is_inferentia_test + @requires_neuronx + def test_decoder_export(export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type): + model = NeuronModelForCausalLM.from_pretrained( + export_decoder_id, + export=True, + batch_size=batch_size, + sequence_length=sequence_length, + num_cores=num_cores, + auto_cast_type=auto_cast_type, + ) + check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type) + @is_inferentia_test + @requires_neuronx + def test_model_from_path(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + 
check_neuron_model(model) -@is_inferentia_test -@requires_neuronx -def test_model_from_path(neuron_model_path): - model = NeuronModelForCausalLM.from_pretrained(neuron_model_path) - check_neuron_model(model) + +class Seq2SeqTests: + @pytest.mark.parametrize( + "batch_size, sequence_length, num_beams", + [ + [1, 32, 1], + [1, 32, 4], + ], + ) + @is_inferentia_test + @requires_neuronx + def test_seq2seq_export(export_seq2seq_id, batch_size, sequence_length, num_beams): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, + export=True, + batch_size=batch_size, + sequence_length=sequence_length, + num_beams=num_beams, + ) + + @is_inferentia_test + @requires_neuronx + def test_model_from_path(neuron_seq2seq_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) diff --git a/tests/generation/test_generate.py b/tests/generation/test_generate.py index 47eecb8a7..04ec9b9ea 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -40,17 +40,17 @@ def _test_model_generation(model, tokenizer, batch_size, input_length, **gen_kwa ) @is_inferentia_test @requires_neuronx -def test_model_generation(neuron_model_path, gen_kwargs): - model = NeuronModelForCausalLM.from_pretrained(neuron_model_path) - tokenizer = AutoTokenizer.from_pretrained(neuron_model_path) +def test_model_generation(neuron_decoder_path, gen_kwargs): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) _test_model_generation(model, tokenizer, model.batch_size, 10, **gen_kwargs) @is_inferentia_test @requires_neuronx -def test_model_generation_input_dimensions(neuron_model_path): - model = NeuronModelForCausalLM.from_pretrained(neuron_model_path) - tokenizer = AutoTokenizer.from_pretrained(neuron_model_path) +def test_model_generation_input_dimensions(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) # Using valid input dimensions _test_model_generation(model, tokenizer, model.batch_size, model.max_length // 2) # Using an incompatible batch_size diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py index 2966e0199..e8f717677 100644 --- a/tests/generation/test_hub.py +++ b/tests/generation/test_hub.py @@ -18,33 +18,59 @@ from huggingface_hub import HfApi from transformers.testing_utils import ENDPOINT_STAGING -from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx from optimum.utils.testing_utils import TOKEN -@is_inferentia_test -@requires_neuronx -def test_model_from_hub(): - model = NeuronModelForCausalLM.from_pretrained( - "dacorvo/tiny-random-gpt2-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" - ) - check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32") - - -@is_inferentia_test -@requires_neuronx -def test_push_to_hub(neuron_model_path, neuron_push_id): - model = NeuronModelForCausalLM.from_pretrained(neuron_model_path) - model.push_to_hub(neuron_model_path, neuron_push_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - try: - hub_files_info = api.list_files_info(neuron_push_id) - hub_files_path = [info.rfilename for info in hub_files_info] - for path, _, files in os.walk(neuron_model_path): 
- for name in files: - local_file_path = os.path.join(path, name) - hub_file_path = os.path.relpath(local_file_path, neuron_model_path) - assert hub_file_path in hub_files_path - finally: - api.delete_repo(neuron_push_id) +class DecoderTests: + @is_inferentia_test + @requires_neuronx + def test_model_from_hub(): + model = NeuronModelForCausalLM.from_pretrained( + "dacorvo/tiny-random-gpt2-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" + ) + check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32") + + @is_inferentia_test + @requires_neuronx + def test_push_to_hub(neuron_decoder_path, neuron_push_decoder_id): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + model.push_to_hub(neuron_decoder_path, neuron_push_decoder_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) + api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + try: + hub_files_info = api.list_files_info(neuron_push_decoder_id) + hub_files_path = [info.rfilename for info in hub_files_info] + for path, _, files in os.walk(neuron_decoder_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_decoder_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_decoder_id) + + +class Seq2SeqTests: + @is_inferentia_test + @requires_neuronx + def test_model_from_hub(): + model = NeuronModelForSeq2SeqLM.from_pretrained( + "Jingya/tiny-random-t5-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" + ) + + @is_inferentia_test + @requires_neuronx + def test_push_seq2seq_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_push_seq2seq_id) + model.push_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) + api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + try: + hub_files_info = api.list_files_info(neuron_push_seq2seq_id) + hub_files_path = [info.rfilename for info in hub_files_info] + for path, _, files in os.walk(neuron_seq2seq_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_seq2seq_id) From 308c08e838509e5d734de61b88c9aa664d40bfe5 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 9 Nov 2023 15:45:11 +0000 Subject: [PATCH 15/30] tests done --- tests/generation/conftest.py | 18 +++++- tests/generation/test_export.py | 94 +++++++++++++++--------------- tests/generation/test_generate.py | 24 +++++++- tests/generation/test_hub.py | 95 ++++++++++++++++--------------- 4 files changed, 135 insertions(+), 96 deletions(-) diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py index ccf40d151..85f203f85 100644 --- a/tests/generation/conftest.py +++ b/tests/generation/conftest.py @@ -67,7 +67,23 @@ def neuron_decoder_path(export_decoder_id): @requires_neuronx def neuron_seq2seq_path(export_seq2seq_id): model = NeuronModelForSeq2SeqLM.from_pretrained( - export_seq2seq_id, export=True, batch_size=1, sequence_length=32, num_beams=4 + export_seq2seq_id, export=True, batch_size=1, sequence_length=64, num_beams=4 + ) + model_dir = TemporaryDirectory() + model_path = model_dir.name + model.save_pretrained(model_path) + del model + # Yield instead of returning to keep a reference to the temporary directory. 
+ # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + yield model_path + + +@pytest.fixture(scope="module") +@requires_neuronx +def neuron_seq2seq_greedy_path(export_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, export=True, batch_size=1, sequence_length=64, num_beams=1 ) model_dir = TemporaryDirectory() model_path = model_dir.name diff --git a/tests/generation/test_export.py b/tests/generation/test_export.py index 9b88b1515..32c53c4a4 100644 --- a/tests/generation/test_export.py +++ b/tests/generation/test_export.py @@ -20,55 +20,57 @@ from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx -class DecoderTests: - @pytest.mark.parametrize( - "batch_size, sequence_length, num_cores, auto_cast_type", - [ - [1, 100, 2, "fp32"], - [1, 100, 2, "fp16"], - [2, 100, 2, "fp16"], - ], +@pytest.mark.parametrize( + "batch_size, sequence_length, num_cores, auto_cast_type", + [ + [1, 100, 2, "fp32"], + [1, 100, 2, "fp16"], + [2, 100, 2, "fp16"], + ], +) +@is_inferentia_test +@requires_neuronx +def test_decoder_export(export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type): + model = NeuronModelForCausalLM.from_pretrained( + export_decoder_id, + export=True, + batch_size=batch_size, + sequence_length=sequence_length, + num_cores=num_cores, + auto_cast_type=auto_cast_type, ) - @is_inferentia_test - @requires_neuronx - def test_decoder_export(export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type): - model = NeuronModelForCausalLM.from_pretrained( - export_decoder_id, - export=True, - batch_size=batch_size, - sequence_length=sequence_length, - num_cores=num_cores, - auto_cast_type=auto_cast_type, - ) - check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type) + check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type) - @is_inferentia_test - @requires_neuronx - def test_model_from_path(neuron_decoder_path): - model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) - check_neuron_model(model) +@is_inferentia_test +@requires_neuronx +def test_model_from_path(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + check_neuron_model(model) -class Seq2SeqTests: - @pytest.mark.parametrize( - "batch_size, sequence_length, num_beams", - [ - [1, 32, 1], - [1, 32, 4], - ], + +@pytest.mark.parametrize( + "batch_size, sequence_length, num_beams", + [ + [1, 64, 1], + [1, 64, 4], + ], +) +@is_inferentia_test +@requires_neuronx +def test_seq2seq_export(export_seq2seq_id, batch_size, sequence_length, num_beams): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, + export=True, + batch_size=batch_size, + sequence_length=sequence_length, + num_beams=num_beams, ) - @is_inferentia_test - @requires_neuronx - def test_seq2seq_export(export_seq2seq_id, batch_size, sequence_length, num_beams): - model = NeuronModelForSeq2SeqLM.from_pretrained( - export_seq2seq_id, - export=True, - batch_size=batch_size, - sequence_length=sequence_length, - num_beams=num_beams, - ) + return model + - @is_inferentia_test - @requires_neuronx - def test_model_from_path(neuron_seq2seq_path): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) +@is_inferentia_test +@requires_neuronx +def test_seq2seq_model_from_path(neuron_seq2seq_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) + return model diff --git 
a/tests/generation/test_generate.py b/tests/generation/test_generate.py index 04ec9b9ea..06cbed335 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -17,7 +17,7 @@ import torch from transformers import AutoTokenizer -from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx @@ -40,7 +40,7 @@ def _test_model_generation(model, tokenizer, batch_size, input_length, **gen_kwa ) @is_inferentia_test @requires_neuronx -def test_model_generation(neuron_decoder_path, gen_kwargs): +def test_decoder_generation(neuron_decoder_path, gen_kwargs): model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) _test_model_generation(model, tokenizer, model.batch_size, 10, **gen_kwargs) @@ -59,3 +59,23 @@ def test_model_generation_input_dimensions(neuron_decoder_path): # Using an incompatible input length with pytest.raises(ValueError, match="The input sequence length"): _test_model_generation(model, tokenizer, model.batch_size, input_length=model.max_length * 2) + + +@is_inferentia_test +@requires_neuronx +def test_seq2seq_generation_beam(neuron_seq2seq_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_path) + inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") + output = model.generate(**inputs, num_return_sequences=1) + return output + + +@is_inferentia_test +@requires_neuronx +def test_seq2seq_generation_greedy(neuron_seq2seq_greedy_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_greedy_path) + inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") + output = model.generate(**inputs, num_return_sequences=1) + return output diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py index e8f717677..ff8e90615 100644 --- a/tests/generation/test_hub.py +++ b/tests/generation/test_hub.py @@ -23,54 +23,55 @@ from optimum.utils.testing_utils import TOKEN -class DecoderTests: - @is_inferentia_test - @requires_neuronx - def test_model_from_hub(): - model = NeuronModelForCausalLM.from_pretrained( - "dacorvo/tiny-random-gpt2-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" - ) - check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32") +@is_inferentia_test +@requires_neuronx +def test_model_from_hub(): + model = NeuronModelForCausalLM.from_pretrained( + "dacorvo/tiny-random-gpt2-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" + ) + check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32") - @is_inferentia_test - @requires_neuronx - def test_push_to_hub(neuron_decoder_path, neuron_push_decoder_id): - model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) - model.push_to_hub(neuron_decoder_path, neuron_push_decoder_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - try: - hub_files_info = api.list_files_info(neuron_push_decoder_id) - hub_files_path = [info.rfilename for info in hub_files_info] - for path, _, files in os.walk(neuron_decoder_path): - for name in files: - local_file_path = os.path.join(path, name) - 
hub_file_path = os.path.relpath(local_file_path, neuron_decoder_path) - assert hub_file_path in hub_files_path - finally: - api.delete_repo(neuron_push_decoder_id) +@is_inferentia_test +@requires_neuronx +def test_push_to_hub(neuron_decoder_path, neuron_push_decoder_id): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + model.push_to_hub(neuron_decoder_path, neuron_push_decoder_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) + api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + try: + hub_files_info = api.list_files_info(neuron_push_decoder_id) + hub_files_path = [info.rfilename for info in hub_files_info] + for path, _, files in os.walk(neuron_decoder_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_decoder_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_decoder_id) -class Seq2SeqTests: - @is_inferentia_test - @requires_neuronx - def test_model_from_hub(): - model = NeuronModelForSeq2SeqLM.from_pretrained( - "Jingya/tiny-random-t5-neuronx", revision="6cb671b50db5cecb7abead9e2ec7099d4bab44a8" - ) - @is_inferentia_test - @requires_neuronx - def test_push_seq2seq_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_push_seq2seq_id) - model.push_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - try: - hub_files_info = api.list_files_info(neuron_push_seq2seq_id) - hub_files_path = [info.rfilename for info in hub_files_info] - for path, _, files in os.walk(neuron_seq2seq_path): - for name in files: - local_file_path = os.path.join(path, name) - hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_path) - assert hub_file_path in hub_files_path - finally: - api.delete_repo(neuron_push_seq2seq_id) +@is_inferentia_test +@requires_neuronx +def test_seq2seq_model_from_hub(): + model = NeuronModelForSeq2SeqLM.from_pretrained( + "Jingya/tiny-random-t5-neuronx", revision="ce617676ce12a19df7c6bd523c69b83447fa036b" + ) + return model + + +@is_inferentia_test +@requires_neuronx +def test_push_seq2seq_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) + model.push_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) + api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + try: + hub_files_info = api.list_files_info(neuron_push_seq2seq_id) + hub_files_path = [info.rfilename for info in hub_files_info] + for path, _, files in os.walk(neuron_seq2seq_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_seq2seq_id) From 12e931190a3261c73a3bb5b4e027577e5080d9b9 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 17 Nov 2023 15:25:04 +0000 Subject: [PATCH 16/30] apply some suggestions --- optimum/exporters/neuron/model_configs.py | 6 ------ optimum/neuron/modeling_seq2seq.py | 1 - 2 files changed, 7 deletions(-) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 4ea1beff6..ddb1a1eb8 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -394,12 +394,6 @@ def patch_model_for_export(self, model, 
device="xla", **kwargs): custom_wrapper_kwargs={"num_beams": num_beams, "device": device}, ) - # def generate_dummy_inputs(self, **kwargs): - # batch_size = kwargs.pop("batch_size") * kwargs.get("num_beams") - # dummy_inputs = super().generate_dummy_inputs(batch_size=batch_size, **kwargs) - - # return dummy_inputs - @register_in_tasks_manager("opt", "text-generation") class OPTNeuronConfig(TextNeuronDecoderConfig): diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 6a10ea7a5..ac926caa1 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -485,7 +485,6 @@ def beam_search( # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens # of the first beam are considered to avoid sampling the exact same tokens across all beams. - # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores_device = "cpu" beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) beam_scores[:, 1:] = -1e9 From 13445c9d622674ab3ba2b92c6534e51be71fe82e Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 22 Nov 2023 15:47:36 +0000 Subject: [PATCH 17/30] fix style --- optimum/exporters/neuron/__main__.py | 2 +- optimum/exporters/neuron/model_configs.py | 4 +++- optimum/exporters/neuron/model_wrappers.py | 6 +++--- optimum/exporters/neuron/utils.py | 3 ++- tests/exporters/test_export.py | 4 +++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 78e042a1d..70abd9619 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -223,7 +223,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( output: Path, dynamic_batch_size: bool = False, submodels: Dict[str, Union[Path, str]] = None, -): +): model = replace_stable_diffusion_submodels(model, submodels) check_compiler_compatibility_for_stable_diffusion() if is_neuron_available(): diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index d6b16cb62..060e6ee01 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -450,7 +450,9 @@ def patch_model_for_export(self, model, device="xla", **kwargs): batch_size = kwargs.pop("batch_size", 1) sequence_length = kwargs.pop("sequence_length", 1) num_beams = kwargs.pop("num_beams", 1) - return self.CUSTOM_MODEL_WRAPPER(model, batch_size=batch_size, sequence_length=sequence_length, num_beams=num_beams, device=device) + return self.CUSTOM_MODEL_WRAPPER( + model, batch_size=batch_size, sequence_length=sequence_length, num_beams=num_beams, device=device + ) def generate_io_aliases(self, model): num_outputs_from_trace = 3 if model.num_beams > 1 else 1 diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 2ef95e875..3e27b4765 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -25,9 +25,9 @@ class UnetNeuronWrapper(torch.nn.Module): def __init__(self, model, input_names: List[str]): - super().__init__() - self.model = model - self.input_names = input_names + super().__init__() + self.model = model + self.input_names = input_names def forward(self, *inputs): if len(inputs) != len(self.input_names): diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 
f11fcb1c2..81a474374 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -334,6 +334,7 @@ def check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes): f"Cannot find the value of `{name}` which is mandatory for exporting the model to the neuron format, please set the value explicitly." ) + def replace_stable_diffusion_submodels(pipeline, submodels): if submodels is not None: unet_id = submodels.pop("unet", None) @@ -343,6 +344,7 @@ def replace_stable_diffusion_submodels(pipeline, submodels): return pipeline + def get_encoder_decoder_models_for_export( model: "PreTrainedModel", task: str, @@ -398,4 +400,3 @@ def get_encoder_decoder_models_for_export( models_for_export[DECODER_NAME] = (model, decoder_neuron_config) return models_for_export - diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index 4a491f162..76b24a560 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -154,7 +154,9 @@ class NeuronStableDiffusionExportTestCase(unittest.TestCase): Integration tests ensuring stable diffusion models are correctly exported. """ - @parameterized.expand([STABLE_DIFFUSION_MODELS_TINY["stable-diffusion"], STABLE_DIFFUSION_MODELS_TINY["latent-consistency"]]) + @parameterized.expand( + [STABLE_DIFFUSION_MODELS_TINY["stable-diffusion"], STABLE_DIFFUSION_MODELS_TINY["latent-consistency"]] + ) def test_export_for_stable_diffusion_models(self, model_id): set_seed(SEED) From ded43a4e4722e696901f239ce5f48176abdd9a18 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 23 Nov 2023 14:32:20 +0000 Subject: [PATCH 18/30] address part of comments --- docs/source/tutorials/stable_diffusion.mdx | 2 +- optimum/exporters/neuron/base.py | 2 +- optimum/exporters/neuron/config.py | 11 +++-------- optimum/exporters/neuron/convert.py | 4 +--- optimum/exporters/neuron/model_configs.py | 1 - optimum/exporters/neuron/model_wrappers.py | 7 ++++--- optimum/neuron/utils/input_generators.py | 1 + 7 files changed, 11 insertions(+), 17 deletions(-) diff --git a/docs/source/tutorials/stable_diffusion.mdx b/docs/source/tutorials/stable_diffusion.mdx index c115dd760..5d6a734b6 100644 --- a/docs/source/tutorials/stable_diffusion.mdx +++ b/docs/source/tutorials/stable_diffusion.mdx @@ -357,7 +357,7 @@ To avoid Neuron device out of memory, it's suggested to finish all base inferenc Latent Consistency Models (LCMs) were proposed in [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference by Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, and Hang Zhao](https://huggingface.co/papers/2310.04378). LCMs enable inference with fewer steps on any pre-trained LDMs, including Stable Diffusion and SDXL. In `optimum-neuron`, you can: - - Use the class `NeuronLatentConsistencyModelPipeline` to compile and run inference of LCMs distilled from Stable Diffusion (SD) models, + - Use the class `NeuronLatentConsistencyModelPipeline` to compile and run inference of LCMs distilled from Stable Diffusion (SD) models. - And continue to use the class `NeuronStableDiffusionXLPipeline` for LCMs distilled from SDXL models. 
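As a quick orientation before the complete examples below, here is a minimal sketch of the first option; the model id, compilation shapes, and generation arguments are illustrative and may need to be adapted:

```python
from optimum.neuron import NeuronLatentConsistencyModelPipeline

# Compile the LCM once with static input shapes (illustrative values).
pipe = NeuronLatentConsistencyModelPipeline.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7",  # an LCM distilled from a Stable Diffusion model
    export=True,
    batch_size=1,
    height=768,
    width=768,
    num_images_per_prompt=1,
)
pipe.save_pretrained("lcm_dreamshaper_neuronx/")

# Reload the compiled pipeline and run inference with only a few denoising steps.
pipe = NeuronLatentConsistencyModelPipeline.from_pretrained("lcm_dreamshaper_neuronx/")
image = pipe(prompt="a photo of an astronaut riding a horse on mars", num_inference_steps=4).images[0]
```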
Here are examples to compile the LCMs of Stable Diffusion ( [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) ) and Stable Diffusion XL( [latent-consistency/lcm-sdxl](https://huggingface.co/latent-consistency/lcm-sdxl) ), and then run inference on AWS Inferentia 2 : diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 5548dc4b0..6b005869f 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -119,7 +119,7 @@ def __init__( audio_sequence_length: Optional[int] = None, point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, - num_beams: Optional[int] = None, + num_beams: int = 1, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 82cbf4450..fccac7e39 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -16,7 +16,7 @@ Common Neuron configuration classes that handle most of the features for building model specific configurations. """ -from typing import Dict, List +from typing import List from ...utils import ( DummyBboxInputGenerator, @@ -79,11 +79,7 @@ class TextSeq2SeqNeuronConfig(NeuronConfig): ) @property - def is_decoder(self) -> bool: - raise NotImplementedError() - - @property - def inputs(self) -> Dict[str, Dict[int, str]]: + def inputs(self) -> List[str]: common_inputs = [] # encoder + decoder without past if "encoder" in self.MODEL_TYPE: @@ -100,7 +96,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: return common_inputs @property - def outputs(self) -> Dict[str, Dict[int, str]]: + def outputs(self) -> List[str]: common_outputs = [] # encoder + decoder without past if "encoder" in self.MODEL_TYPE: @@ -115,7 +111,6 @@ def outputs(self) -> Dict[str, Dict[int, str]]: beam_outputs = ( ["next_token_scores", "next_tokens", "next_indices"] if self.num_beams > 1 else ["next_tokens"] ) - # for i in range(self._config.num_decoder_layers): common_outputs = ( beam_outputs + [f"past.{idx}.self.key" for idx in range(self._config.num_decoder_layers)] diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 85d7ba124..9cace43f7 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -171,9 +171,7 @@ def validate_model_outputs( ref_inputs = config.generate_dummy_inputs(return_tuple=False, **input_shapes) if reference_model.config.is_encoder_decoder: reference_model = config.patch_model_for_export(reference_model, device="cpu", **input_shapes) - if ( - hasattr(config._config, "_class_name") and "AutoencoderKL" in config._config._class_name - ) or reference_model.config.is_encoder_decoder: + if "AutoencoderKL" in getattr(config._config, "_class_name", "") or reference_model.config.is_encoder_decoder: # VAE components for stable diffusion or Encoder-Decoder models ref_inputs = tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 906e53764..e4dda2fa5 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -432,7 +432,6 @@ def is_decoder(self) -> bool: @property def inputs(self) -> List[str]: common_inputs = super().inputs + ["beam_idx", "beam_scores"] - return common_inputs def generate_dummy_inputs(self, **kwargs): diff --git 
a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 3e27b4765..abc63c114 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model wrappers for Neuron export.""" -from typing import TYPE_CHECKING, List + +from typing import TYPE_CHECKING, List, Optional import torch from transformers.models.t5.modeling_t5 import T5LayerCrossAttention @@ -65,7 +66,7 @@ def __init__( model: "PreTrainedModel", num_beams: int = 1, device: str = "xla", - tp_degree=None, + tp_degree: Optional[int] = None, ): super().__init__() self.model = model @@ -143,7 +144,7 @@ def __init__( sequence_length: int, num_beams: int = 1, device: str = "xla", - tp_degree=None, + tp_degree: Optional[int] = None, ): super().__init__() self.model = model diff --git a/optimum/neuron/utils/input_generators.py b/optimum/neuron/utils/input_generators.py index 1616123a9..91a1657d9 100644 --- a/optimum/neuron/utils/input_generators.py +++ b/optimum/neuron/utils/input_generators.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Dummy input generation classes.""" + import torch from ...utils import DTYPE_MAPPER, DummyInputGenerator, NormalizedTextConfig From 994374bd0e35d6867dadeeeed89735061367593c Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 23 Nov 2023 19:10:04 +0000 Subject: [PATCH 19/30] apply some suggestions --- optimum/neuron/generation/utils.py | 170 +++++++++++++++-------------- optimum/neuron/modeling_seq2seq.py | 106 ++++-------------- 2 files changed, 112 insertions(+), 164 deletions(-) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 81a5c3fa2..9ab87e914 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -82,6 +82,91 @@ class NeuronGenerationMixin(GenerationMixin): learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies). 
""" + @staticmethod + def _initialize_attention( + model_kwargs, + num_padding_values, + batch_size, + device, + is_encoder_decoder, + ): + """Initializes the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" + if is_encoder_decoder: + # One 1 for decoder_start_token_id, 0s for the currently-unfilled locations in the past_key_values tensor, + # 1s for the actual input_ids + decoder_attention_mask = torch.cat( + [ + torch.zeros((batch_size, num_padding_values), dtype=torch.int32), + torch.ones((batch_size, 2), dtype=torch.int32), + ], + axis=1, + ).to(device) + mask = {"decoder_attention_mask": decoder_attention_mask} + else: + attention_mask = model_kwargs.pop("attention_mask") + # 0s for the currently-unfilled locations in the past_key_values tensor, 1s for the actual input_ids + attention_mask = torch.cat( + [ + torch.zeros( + (batch_size, num_padding_values), dtype=attention_mask.dtype, device=attention_mask.device + ), + attention_mask, + torch.ones((batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device), + ], + axis=1, + ) + mask = {"attention_mask": attention_mask} + + return mask + + @staticmethod + def _update_attention(model_kwargs, batch_size, is_encoder_decoder): + """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" + + attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" + attention_mask = model_kwargs.pop(attention_mask_name) + attention_mask_update_slice = torch.ones( + (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device + ) + attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) + mask = {attention_mask_name: attention_mask} + return mask + + @staticmethod + def _initialize_past(past_key_values, num_padding_values): + """Initialize past_key_values with zeros -- the structure depends on `batch_axis`""" + + new_past = () + for past_layer in past_key_values: + new_past_layer = list(past_layer) + for i in range(len(new_past_layer[:2])): + b, n_heads, _, head_dim = past_layer[i].shape + new_past_layer[i] = torch.cat( + [ + torch.zeros( + (b, n_heads, num_padding_values, head_dim), + dtype=past_layer[i].dtype, + device=past_layer[i].device, + ), + past_layer[i], + ], + dim=2, + ) + new_past += (tuple(new_past_layer),) + + return new_past + + @staticmethod + def _update_past(past_key_values): + new_past = () + for past_layer in past_key_values: + new_past_layer = list(past_layer) + for i, _ in enumerate(new_past_layer[:2]): + new_past_layer[i] = past_layer[i][:, :, 1:] + new_past += (tuple(new_past_layer),) + + return new_past + def _update_model_kwargs_for_xla_generation( self, outputs: ModelOutput, @@ -93,81 +178,6 @@ def _update_model_kwargs_for_xla_generation( seq_length: Optional[int] = None, use_cache: bool = True, ) -> Dict[str, Any]: - def _initialize_attention(model_kwargs, num_padding_values, is_encoder_decoder): - """Initializes the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" - if is_encoder_decoder: - # One 1 for decoder_start_token_id, 0s for the currently-unfilled locations in the past_key_values tensor, - # 1s for the actual input_ids - decoder_attention_mask = torch.cat( - [ - torch.zeros((batch_size, num_padding_values), dtype=torch.int32), - torch.ones((batch_size, 2), dtype=torch.int32), - ], - axis=1, - ).to(outputs.logits.device) - mask = {"decoder_attention_mask": decoder_attention_mask} - else: - attention_mask = 
model_kwargs.pop("attention_mask") - # 0s for the currently-unfilled locations in the past_key_values tensor, 1s for the actual input_ids - attention_mask = torch.cat( - [ - torch.zeros( - (batch_size, num_padding_values), dtype=attention_mask.dtype, device=attention_mask.device - ), - attention_mask, - torch.ones((batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device), - ], - axis=1, - ) - mask = {"attention_mask": attention_mask} - - return mask - - def _update_attention(model_kwargs, is_encoder_decoder): - """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" - - attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" - attention_mask = model_kwargs.pop(attention_mask_name) - attention_mask_update_slice = torch.ones( - (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device - ) - attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) - mask = {attention_mask_name: attention_mask} - return mask - - def _initialize_past(past_key_values, num_padding_values): - """Initialize past_key_values with zeros -- the structure depends on `batch_axis`""" - - new_past = () - for past_layer in past_key_values: - new_past_layer = list(past_layer) - for i in range(len(new_past_layer[:2])): - b, n_heads, _, head_dim = past_layer[i].shape - new_past_layer[i] = torch.cat( - [ - torch.zeros( - (b, n_heads, num_padding_values, head_dim), - dtype=past_layer[i].dtype, - device=past_layer[i].device, - ), - past_layer[i], - ], - dim=2, - ) - new_past += (tuple(new_past_layer),) - - return new_past - - def _update_past(past_key_values): - new_past = () - for past_layer in past_key_values: - new_past_layer = list(past_layer) - for i, _ in enumerate(new_past_layer[:2]): - new_past_layer[i] = past_layer[i][:, :, 1:] - new_past += (tuple(new_past_layer),) - - return new_past - if use_cache: past_key_values = self._extract_past_from_model_output(outputs) if past_key_values is None: @@ -182,11 +192,13 @@ def _update_past(past_key_values): # previous autoregressive generation steps (step 0 has no past_key_values, step 1 has 1 past_key_values value, ..., the last step # has `max_length - 1` past_key_values values). num_padding_values = max_length - seq_length - mask = _initialize_attention(model_kwargs, num_padding_values, is_encoder_decoder) - new_past = _initialize_past(past_key_values, num_padding_values) + mask = self._initialize_attention( + model_kwargs, num_padding_values, batch_size, outputs.logits.device, is_encoder_decoder + ) + new_past = self._initialize_past(past_key_values, num_padding_values) else: - mask = _update_attention(model_kwargs, is_encoder_decoder) - new_past = _update_past(past_key_values) + mask = self._update_attention(model_kwargs, batch_size, is_encoder_decoder) + new_past = self._update_past(past_key_values) # sets the updated variables (mask and past_key_values) model_kwargs.update(mask) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index ac926caa1..6071aad1b 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""NeuroModelForXXX classes for seq2seq models' inference on neuron devices.""" +"""NeuroModelForXXX classes for seq2seq models' inference on Neuron devices.""" + import copy import logging import os import shutil -from abc import abstractmethod +from abc import ABC, abstractmethod from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union @@ -44,7 +45,6 @@ NeuronConfig, main_export, ) -from ..exporters.neuron.model_configs import * # noqa: F403 from ..exporters.tasks import TasksManager from ..utils.save_utils import maybe_load_preprocessors from .generation import NeuronGenerationMixin @@ -67,7 +67,7 @@ logger = logging.getLogger(__name__) -class NeuronModelForConditionalGeneration(NeuronBaseModel): +class NeuronModelForConditionalGeneration(NeuronBaseModel, ABC): base_model_prefix = "neuron_model" config_name = "config.json" @@ -130,6 +130,10 @@ def _save_pretrained( Args: save_directory (`Union[str, Path`]): The directory where to save the model files. + encoder_file_name (`str`, defaults to `NEURON_FILE_NAME`]): + The file name to save the encoder. + decoder_file_name (`str`, defaults to `NEURON_FILE_NAME`]): + The file name to save the decoder. """ if self.model_and_config_save_paths is None: logger.warning( @@ -342,12 +346,6 @@ def _combine_encoder_decoder_config(self, encoder_config: "PretrainedConfig", de return combined_config - def can_generate(self): - logger.warning( - "NeuronModelForConditionalGeneration is an abstract class and is not meant to be used for generation. Please use NeuronModelForSeq2SeqLM instead." - ) - return False - class NeuronModelForSeq2SeqLM(NeuronModelForConditionalGeneration, NeuronGenerationMixin): auto_model_class = AutoModelForSeq2SeqLM @@ -360,7 +358,6 @@ def forward( decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, beam_scores=None, - **kwargs, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: hidden_states = encoder_outputs["last_hidden_state"] @@ -439,8 +436,6 @@ def beam_search( stopping_criteria: Optional[StoppingCriteriaList] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: Optional[bool] = False, @@ -450,19 +445,16 @@ def beam_search( """ Overriding beam search to use next_token_scores returned from neuron device instead of logits. """ - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + if logits_processor is not None: + logger.warning( + "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." 
+ ) stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) batch_size = len(beam_scorer._beam_hyps) num_beams = beam_scorer.num_beams @@ -500,13 +492,7 @@ def beam_search( input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) - next_token_scores, next_tokens, next_indices = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - beam_scores=beam_scores, - ) + next_token_scores, next_tokens, next_indices = self(**model_inputs, beam_scores=beam_scores) # stateless beam_outputs = beam_scorer.process( @@ -545,14 +531,8 @@ def beam_search( model_kwargs, batch_size=batch_beam_size, is_encoder_decoder=self.config.is_encoder_decoder, - max_length=stopping_criteria.max_length, - seq_length=cur_len, - use_cache=model_kwargs["use_cache"], ) - if model_kwargs["past_key_values"] is not None: - model_kwargs["past_key_values"] = self._reorder_cache( - model_kwargs["past_key_values"], beam_idx.to(torch.int64) - ) + self._reorder_cache(beam_idx.to(torch.int64)) if return_dict_in_generate and output_scores: beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) @@ -612,8 +592,6 @@ def greedy_search( max_length: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, seq_length: Optional[int] = int, @@ -624,8 +602,10 @@ def greedy_search( Overriding greedy sampling to use next tokens returned from neuron device instead of logits. """ # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - use_cache = model_kwargs["use_cache"] if "use_cache" in model_kwargs else False + if logits_processor is not None: + logger.warning( + "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." 
+ ) stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() if max_length is not None: from transformers.generation.stopping_criteria import validate_stopping_criteria @@ -637,12 +617,6 @@ def greedy_search( eos_token_id = [eos_token_id] eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None @@ -668,12 +642,7 @@ def greedy_search( model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) # forward pass to get next token - output = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + output = self(**model_inputs) next_tokens = output[0] # finished sentences should have their next token be a padding token @@ -693,9 +662,6 @@ def greedy_search( model_kwargs, batch_size=batch_size, is_encoder_decoder=self.config.is_encoder_decoder, - max_length=stopping_criteria.max_length, - seq_length=seq_length, - use_cache=use_cache, ) seq_length += 1 @@ -741,12 +707,11 @@ def greedy_search( return input_ids - def _reorder_cache(self, past_key_values, beam_idx): + def _reorder_cache(self, beam_idx): """ The cache was reordered during the tracing of the decoder so we can skip it here. This is needed for beam search and not greedy sampling. 
""" self.beam_idx = beam_idx - return past_key_values def get_encoder(self) -> "NeuronEncoder": return self.encoder @@ -756,43 +721,19 @@ def _update_model_kwargs_for_xla_generation( model_kwargs: Dict[str, Any], batch_size: int, is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - max_length: Optional[int] = None, - seq_length: Optional[int] = None, - use_cache: bool = True, ) -> Dict[str, Any]: - def _update_attention(model_kwargs, is_encoder_decoder): - """Updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`""" - - attention_mask_name = "decoder_attention_mask" if is_encoder_decoder else "attention_mask" - attention_mask = model_kwargs.pop(attention_mask_name) - attention_mask_update_slice = torch.ones( - (batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device - ) - attention_mask = torch.cat([attention_mask[:, 1:], attention_mask_update_slice], dim=-1) - mask = {attention_mask_name: attention_mask} - return mask - - mask = _update_attention(model_kwargs, is_encoder_decoder) + mask = self._update_attention(model_kwargs, batch_size, is_encoder_decoder) # sets the updated variables (mask and past_key_values) model_kwargs.update(mask) - # Set a mock cache tensor - model_kwargs["past_key_values"] = torch.tensor([]) - return model_kwargs # Override to cut the input_ids to just last token def prepare_inputs_for_generation( self, input_ids, - past_key_values=None, attention_mask=None, - head_mask=None, - decoder_head_mask=None, decoder_attention_mask=None, - cross_attn_head_mask=None, - use_cache=None, encoder_outputs=None, **kwargs, ): @@ -801,14 +742,9 @@ def prepare_inputs_for_generation( return { "decoder_input_ids": input_ids, - "past_key_values": past_key_values, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, "decoder_attention_mask": decoder_attention_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, } def _validate_static_shape(self, input_shapes: List[int], target_shapes: List[int]) -> bool: From 5c55ec16c6715b1f633e704ede2280088570f242 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 24 Nov 2023 16:35:38 +0000 Subject: [PATCH 20/30] add pad left support and log --- optimum/neuron/modeling_base.py | 45 ++++++++++++++++++++++++------ optimum/neuron/modeling_seq2seq.py | 6 ++-- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 1bfb42627..7b3a28ecd 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -20,7 +20,7 @@ from contextlib import contextmanager from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union import torch from huggingface_hub import HfApi, HfFolder, hf_hub_download @@ -451,10 +451,19 @@ def _raise_if_invalid_padding(self, input_name, input_tensor, target_shapes, to_ f" than the static shapes used for compilation: {target_shapes}{extra}." ) - def _pad_to_compiled_shape(self, inputs: Dict[str, "torch.Tensor"]): + def _pad_to_compiled_shape( + self, inputs: Dict[str, "torch.Tensor"], padding_side: Literal["right", "left"] = "right" + ): """ Pads input tensors if they are not in valid shape. + + Args: + inputs (`Dict[str, "torch.Tensor"]`): + Dictionary of input torch tensors. 
+ padding_side (`Literal["right", "left"]`, defaults to "right"): + The side on which to apply the padding. """ + logger.info(f"Padding input tensors, the padding side is: {padding_side}.") for input_name, input_tensor in inputs.items(): target_shapes = self.input_static_shapes[input_name] padding = () @@ -466,7 +475,7 @@ def _pad_to_compiled_shape(self, inputs: Dict[str, "torch.Tensor"]): to_pad = target_shapes[i] - input_tensor.size(i) self._raise_if_invalid_padding(input_name, input_tensor, target_shapes, to_pad, i) - padding += (0, to_pad) + padding += (0, to_pad) if padding_side == "right" else (to_pad, 0) if ( self.preprocessors is not None @@ -496,7 +505,7 @@ def _pad_to_compiled_shape(self, inputs: Dict[str, "torch.Tensor"]): else: to_pad = target_shapes[0] - input_tensor.size(0) self._raise_if_invalid_padding(input_name, input_tensor, target_shapes, to_pad, 0) - padding += (0, to_pad) + padding += (0, to_pad) if padding_side == "right" else (to_pad, 0) pad_id = 1 inputs[input_name] = torch.nn.functional.pad(input_tensor, padding, mode="constant", value=pad_id) @@ -508,7 +517,13 @@ def neuron_padding_manager(self, inputs: Dict[str, "torch.Tensor"]): inputs = tuple(self._pad_to_compiled_shape(inputs).values()) yield inputs - def remove_padding(self, outputs: List[torch.Tensor], dims: List[int], indices: List[int]) -> List[torch.Tensor]: + @staticmethod + def remove_padding( + outputs: List[torch.Tensor], + dims: List[int], + indices: List[int], + padding_side: Literal["right", "left"] = "right", + ) -> List[torch.Tensor]: """ Removes padding from output tensors. @@ -519,12 +534,26 @@ def remove_padding(self, outputs: List[torch.Tensor], dims: List[int], indices: List of dimensions in which we slice a tensor. indices (`List[int]`): List of indices in which we slice a tensor along an axis. + padding_side (`Literal["right", "left"]`, defaults to "right"): + The side on which the padding has been applied. """ if len(dims) != len(indices): raise ValueError(f"The size of `dims`({len(dims)}) and indices`({len(indices)}) must be equal.") + for dim, indice in zip(dims, indices): - outputs = [ - torch.index_select(output_tensor, dim, torch.LongTensor(range(indice))) for output_tensor in outputs - ] + if padding_side == "right": + outputs = [ + torch.index_select(output_tensor, dim, torch.LongTensor(range(indice))) + for output_tensor in outputs + ] + elif padding_side == "left": + outputs = [ + torch.index_select( + output_tensor, + dim, + torch.LongTensor(range(output_tensor.shape[dim] - indice, output_tensor.shape[dim])), + ) + for output_tensor in outputs + ] return outputs diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 6071aad1b..ff3feb9ba 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -421,7 +421,9 @@ def generate( max_length=max_length, num_beams=num_beams, do_sample=kwargs.pop("do_sample", False), - use_cache=kwargs.pop("use_cache", True), + use_cache=kwargs.pop( + "use_cache", False + ), # `use_cache` is supported by default in `optimum-neuron`, set to False to avoid warning decoder_attention_mask=decoder_attention_mask, # Pass fake encoder_outputs so the transfomers code will not invoke the encoder encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, @@ -447,7 +449,7 @@ def beam_search( """ if logits_processor is not None: logger.warning( - "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. 
If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." + "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." ) stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id From 9396c7ac5c6552cc2ae0051d95a570d6f4588b08 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 27 Nov 2023 11:57:14 +0000 Subject: [PATCH 21/30] fix enable custom max length instead of real max length limit --- optimum/neuron/modeling_seq2seq.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index ff3feb9ba..23396002e 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -59,7 +59,6 @@ if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel - from transformers.generation.streamers import BaseStreamer if is_neuronx_available(): import torch_neuronx @@ -385,7 +384,6 @@ def generate( stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, num_return_sequences: Optional[int] = None, **kwargs, ): @@ -416,9 +414,8 @@ def generate( stopping_criteria=stopping_criteria, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, assistant_model=assistant_model, - streamer=streamer, num_return_sequences=num_return_sequences, - max_length=max_length, + max_length=kwargs.pop("max_length", None) or max_length, num_beams=num_beams, do_sample=kwargs.pop("do_sample", False), use_cache=kwargs.pop( @@ -597,7 +594,6 @@ def greedy_search( output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, seq_length: Optional[int] = int, - streamer: Optional["BaseStreamer"] = None, **model_kwargs, ) -> Union[GreedySearchOutput, torch.LongTensor]: """ @@ -704,9 +700,6 @@ def greedy_search( if this_peer_finished: break - if streamer is not None: - streamer.end() - return input_ids def _reorder_cache(self, beam_idx): From 9676a6543bc2fc29b9ec4a6f616955925d8d60f3 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 28 Nov 2023 18:35:29 +0000 Subject: [PATCH 22/30] reuse neuron gen mix --- optimum/neuron/generation/utils.py | 1155 ++++++++++++++-------------- optimum/neuron/modeling_seq2seq.py | 298 +------ 2 files changed, 576 insertions(+), 877 deletions(-) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 9ab87e914..11f64d88e 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -52,8 +52,7 @@ if TYPE_CHECKING: - from transformers.generation.streamers import BaseStreamer - from transformers.modeling_utils import PreTrainedModel + pass logger = logging.get_logger(__name__) @@ -272,419 +271,6 @@ def _expand_dict_for_generation(dict_to_expand): return input_ids, model_kwargs - def beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - 
max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - seq_length: Optional[int] = None, - **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search decoding** and - can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - seq_length: - Length of current input_ids sequence - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. 
- - Return: - [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - - >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - if len(stopping_criteria) == 0: - warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - # Overwrite 
cur_len - cur_len = seq_length - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None - ) - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - beam_scores_device = "cpu" - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False # used by synced_gpus only - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - # prepare model inputs - if model_kwargs["use_cache"]: - # From max_length-sized input_ids, select first - # cur_len - 1 values. 
- update_indices = torch.stack( - [torch.arange(input_ids.size(0)), torch.tensor(cur_len - 1).repeat(input_ids.size(0))], dim=-1 - ) - input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] - model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) - else: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - if not model_kwargs["use_cache"]: - one_hot = ( - torch.cat( - [ - torch.tensor([0]).repeat(1, cur_len - 1), - torch.tensor([1]).repeat(1, 1), - torch.tensor([0]).repeat(1, input_ids.size(1) - cur_len), - ], - dim=1, - ) - .to(device=outputs.logits.device) - .float() - ) - next_token_logits = torch.matmul(one_hot, outputs.logits) - next_token_logits = next_token_logits.squeeze(1) - else: - next_token_logits = outputs.logits[:, -1, :] - - # Manually compute log softmax - # log_softmax(vi) = vi - max(vi) - log(sum(exp(vi - max(vi)))) - logit_max, _ = torch.max(next_token_logits, dim=-1, keepdim=True) - logsumexp = torch.log(torch.exp(next_token_logits - logit_max).sum(dim=-1, keepdim=True)) - next_token_scores = next_token_logits - logit_max - logsumexp - # (batch_size * num_beams, vocab_size) - - xm.mark_step() - - # We don't want to change every single logit processor, so - # we peform this processing on CPU. - input_ids_ = input_ids.to("cpu")[:, :cur_len] - next_token_scores_ = next_token_scores.to("cpu") - next_token_scores_processed = logits_processor(input_ids_, next_token_scores_) - - next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores_processed,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) - next_token_scores = next_token_scores * 1 - - # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search) - next_token_scores, next_tokens = torch.topk( - next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True - ) - - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = beam_scorer.process( - input_ids.to("cpu")[:, :cur_len], - next_token_scores.to("cpu"), - next_tokens.to("cpu"), - next_indices.to("cpu"), - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - ) - - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - update_indices = torch.stack( - [torch.arange(batch_beam_size), torch.tensor(cur_len - 1).repeat(batch_beam_size)], dim=-1 - ) - update_indices_2 = torch.stack( - 
[torch.arange(batch_beam_size), torch.tensor(cur_len).repeat(batch_beam_size)], dim=-1 - ) - # First select beam_indices - device = input_ids.device - beam_idx_device = beam_idx.to(device=input_ids.device) - input_ids[:, :] = input_ids[beam_idx_device.long(), :] - - # Then append new tokens - input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = beam_next_tokens.unsqueeze(-1).to(device) - input_ids = input_ids * 1 # Hack to materialize tensor - - # update generated ids, model inputs, and length for next step - model_kwargs = self._update_model_kwargs_for_xla_generation( - outputs, - model_kwargs, - batch_size=batch_beam_size, - is_encoder_decoder=self.config.is_encoder_decoder, - max_length=stopping_criteria.max_length, - seq_length=cur_len, - use_cache=model_kwargs["use_cache"], - ) - if model_kwargs["past_key_values"] is not None: - model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx) - - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - - # increase cur_len - cur_len = cur_len + 1 - - # stop when each sentence is finished, or if we exceed the maximum length - stop_criterion_1 = beam_scorer.is_done - if isinstance(stopping_criteria, list): - if len(stopping_criteria) == 1: - stopping_criteria = stopping_criteria[0] - - # Cases that can be handled in XLA without requiring - # non-padded input_ids - if isinstance(stopping_criteria, MaxLengthCriteria): - stop_criterion_2 = cur_len >= stopping_criteria.max_length - elif isinstance(stopping_criteria, MaxTimeCriteria): - stop_criterion_2 = stopping_criteria(input_ids, scores) - else: - # Other cases will be handled on CPU - batch_size, _ = input_ids.shape - input_ids_cpu = input_ids.to("cpu") - mask = torch.cat( - [torch.ones(batch_size, cur_len), torch.zeros(batch_size, input_ids.shape[1] - cur_len)], dim=1 - ).bool() - input_ids_cpu = torch.masked_select(input_ids_cpu, mask).reshape((batch_size, cur_len)) - scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores - stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) - - if stop_criterion_1 or stop_criterion_2: - if not synced_gpus: - break - else: - this_peer_finished = True - - sequence_outputs = beam_scorer.finalize( - input_ids.to("cpu"), - beam_scores.to("cpu"), - next_tokens.to("cpu"), - next_indices.to("cpu"), - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=beam_indices, - ) - - for k, v in sequence_outputs.items(): - if type(v) == torch.Tensor: - sequence_outputs[k] = sequence_outputs[k].to(input_ids.device) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return BeamSearchDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - 
) - else: - return sequence_outputs["sequences"] - @torch.no_grad() def generate( self, @@ -694,8 +280,7 @@ def generate( stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, synced_gpus: Optional[bool] = None, - assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, + is_traced_inference: bool = False, **kwargs, ) -> Union[GenerateOutput, torch.LongTensor]: r""" @@ -714,23 +299,23 @@ def generate( Parameters: - inputs (`torch.Tensor` of varying shape depending on the modality, *optional*): + inputs (`Optional[torch.Tensor]`, defaults to `None`): The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs` should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. - generation_config (`~generation.GenerationConfig`, *optional*): + generation_config (`Optional[GenerationConfig]`, defaults to `None`): The generation configuration to be used as base parametrization for the generation call. `**kwargs` passed to generate matching the attributes of `generation_config` will override them. If `generation_config` is not provided, the default will be used, which had the following loading priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s default values, whose documentation should be checked to parameterize generation. - logits_processor (`LogitsProcessorList`, *optional*): + logits_processor (`Optional[LogitsProcessorList]`, defaults to `None`): Custom logits processors that complement the default logits processors built from arguments and generation config. If a logit processor is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. - stopping_criteria (`StoppingCriteriaList`, *optional*): + stopping_criteria (`Optional[StoppingCriteriaList]`, defaults to `None`): Custom stopping criteria that complement the default stopping criteria built from arguments and a generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. @@ -741,18 +326,13 @@ def generate( on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). - synced_gpus (`bool`, *optional*): + synced_gpus (`Optional[bool]`, defaults to `None`): Whether to continue running the while loop until max_length. Unless overridden this flag will be set to `True` under DeepSpeed ZeRO Stage 3 multiple GPUs environment to avoid hanging if one GPU finished generating before other GPUs. Otherwise it'll be set to `False`. - assistant_model (`PreTrainedModel`, *optional*): - An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model - is much faster than running generation with the model you're calling generate from. 
As such, the - assistant model should be much smaller. - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + is_traced_inference (`bool`, defaults to `False`): + Whether the decoder is traced or using XLA lazy tensor. If the decoder is traced, next tokens and the beam scores + are computed inside the decoder. kwargs: Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder @@ -832,9 +412,11 @@ def generate( # 4. Define other model kwargs model_kwargs["output_attentions"] = generation_config.output_attentions model_kwargs["output_hidden_states"] = generation_config.output_hidden_states - if generation_config.use_cache: + if generation_config.use_cache and not is_traced_inference: warnings.warn("use_cache is not supported for generation on Neuron devices, switching to use_cache=False.") - model_kwargs["use_cache"] = False + model_kwargs["use_cache"] = False + else: + model_kwargs["use_cache"] = generation_config.use_cache accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) requires_attention_mask = "encoder_outputs" not in model_kwargs @@ -875,9 +457,6 @@ def generate( else: input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") - if streamer is not None: - streamer.put(input_ids.cpu()) - # 6. Prepare `max_length` depending on other stopping criteria. input_ids_seq_length = input_ids.shape[-1] has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None @@ -974,11 +553,6 @@ def generate( if generation_config.num_beam_groups > generation_config.num_beams: raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`") - if streamer is not None and (generation_config.num_beams > 1): - raise ValueError( - "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." 
- ) - if hasattr(self, "device") and self.device.type != input_ids.device.type: warnings.warn( "You are calling .generate() with the `input_ids` being on a device type different" @@ -1022,7 +596,7 @@ def generate( return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, seq_length=input_ids_seq_length, - streamer=streamer, + is_traced_inference=is_traced_inference, **model_kwargs, ) elif is_beam_gen_mode: @@ -1061,15 +635,332 @@ def generate( return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, seq_length=input_ids_seq_length, + is_traced_inference=is_traced_inference, **model_kwargs, ) else: - raise ValueError("Only greedy search and beam search are supported on Neuron.") + raise ValueError("Only greedy search and beam search are supported on Neuron.") + + def greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + seq_length: Optional[int] = None, + is_traced_inference: bool = False, + **model_kwargs, + ) -> Union[GreedySearchOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be + used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. 
See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+            seq_length (`Optional[int]`, defaults to `None`):
+                Length of current input_ids sequence
+            is_traced_inference (`bool`, defaults to `False`):
+                Whether the decoder is traced or using XLA lazy tensor. If the decoder is traced, next tokens and the beam scores
+                are computed inside the decoder.
+            model_kwargs:
+                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
+                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or
+            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer
+        >>> from optimum.neuron import NeuronModelForSeq2SeqLM
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
+        >>> input_shapes = {"batch_size": 1, "sequence_length": 128, "num_beams": 1}
+        >>> model = NeuronModelForSeq2SeqLM.from_pretrained("t5-small", export=True, dynamic_batch_size=False, **input_shapes)
+
+        >>> input_prompt = "translate English to German: Lets eat good food."
+        >>> inputs = tokenizer(input_prompt, return_tensors="pt")
+
+        >>> outputs = model.greedy_search(inputs.input_ids)
+
+        >>> results = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
+        ```
+        """
+        # init values
+        if logits_processor is not None and is_traced_inference:
+            logger.warning(
+                "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron."
+ ) + elif logits_processor is None: + logits_processor = LogitsProcessorList() + use_cache = model_kwargs.pop("use_cache", False) + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = ( + output_attentions if output_attentions is not None else self.generation_config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + + this_peer_finished = False # used by synced_gpus only + while True: + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + if use_cache: + # From max_length-sized input_ids, select first + # seq_length - 1 values. 
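+                # Note: `input_ids` is statically padded to `max_length`; the first pass (no KV cache yet) feeds the
+                # prompt `input_ids[:, :seq_length]`, later passes feed only the token written at index `seq_length - 1`.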
+ + if model_kwargs.get("past_key_values") is None: + input_ids_ = input_ids[:, :seq_length] + else: + update_indices = torch.stack( + [torch.arange(input_ids.size(0)), torch.tensor(seq_length - 1).repeat(input_ids.size(0))], + dim=-1, + ) + input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] + + model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) + else: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + if not is_traced_inference: + if not use_cache: + one_hot = ( + torch.cat( + [ + torch.tensor([0]).repeat(1, seq_length - 1), + torch.tensor([1]).repeat(1, 1), + torch.tensor([0]).repeat(1, input_ids.size(1) - seq_length), + ], + dim=1, + ) + .to(device=outputs.logits.device) + .float() + ) + next_token_logits = torch.matmul(one_hot, outputs.logits) + next_token_logits = next_token_logits.squeeze(1) + else: + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + # Move to cpu to handle arbitrary logits_processor + next_tokens_scores = logits_processor(input_ids.to("cpu")[:, :seq_length], next_token_logits.to("cpu")) + next_tokens_scores = next_tokens_scores.to(input_ids.device) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_tokens_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + else: + next_tokens = outputs[0] + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + batch_size, _ = input_ids.shape + update_indices = torch.stack( + [torch.arange(batch_size), torch.tensor(seq_length).repeat(batch_size)], dim=-1 + ) + input_ids[update_indices[:, 0], update_indices[:, 1]] = next_tokens[:] + model_kwargs = self._update_model_kwargs_for_xla_generation( + outputs=outputs, + model_kwargs=model_kwargs, + batch_size=batch_size, + is_encoder_decoder=self.config.is_encoder_decoder, + max_length=stopping_criteria.max_length, + seq_length=seq_length, + use_cache=use_cache, + ) + + seq_length += 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + + if not is_traced_inference: + xm.mark_step() + + # stop when each sentence is finished, or if we exceed the maximum length + stop_criterion_1 = unfinished_sequences.max() == 0 + + if isinstance(stopping_criteria, list): + if 
len(stopping_criteria) == 1: + stopping_criteria = stopping_criteria[0] + + # Cases that can be handled in XLA without requiring + # non-padded input_ids + if isinstance(stopping_criteria, MaxLengthCriteria): + stop_criterion_2 = seq_length >= stopping_criteria.max_length + elif isinstance(stopping_criteria, MaxTimeCriteria): + stop_criterion_2 = stopping_criteria(input_ids, scores) + else: + # Other cases will be handled on CPU + batch_size, _ = input_ids.shape + mask = torch.cat( + [torch.ones(batch_size, seq_length), torch.zeros(batch_size, input_ids.shape[1] - seq_length)], + dim=1, + ).bool() + input_ids_cpu = torch.masked_select(input_ids, mask).reshape((batch_size, seq_length)).to("cpu") + scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores + stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) + + if stop_criterion_1 or stop_criterion_2: + this_peer_finished = True + + if this_peer_finished and not synced_gpus: + break + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return GreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids - def greedy_search( + def beam_search( self, input_ids: torch.LongTensor, + beam_scorer: BeamScorer, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, max_length: Optional[int] = None, @@ -1079,34 +970,35 @@ def greedy_search( output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, + synced_gpus: Optional[bool] = False, seq_length: Optional[int] = None, - streamer: Optional["BaseStreamer"] = None, + is_traced_inference: bool = False, **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: + ) -> Union[BeamSearchOutput, torch.LongTensor]: r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be - used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() instead. For an overview of generation strategies and code examples, check the [following guide](../generation_strategies). - Parameters: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. + beam_scorer (`BeamScorer`): + An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. logits_processor (`LogitsProcessorList`, *optional*): An instance of [`LogitsProcessorList`]. 
List of instances of class derived from [`LogitsProcessor`] used to modify the prediction scores of the language modeling head applied at each generation step. stopping_criteria (`StoppingCriteriaList`, *optional*): An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated tokens. The maximum length of the sequence to be generated. @@ -1126,75 +1018,74 @@ def greedy_search( Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. synced_gpus (`bool`, *optional*, defaults to `False`): Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - seq_length: + seq_length (`Optional[int]`, defaults to `False`): Length of current input_ids sequence - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - Unsupported for XLA devices + is_traced_inference (`bool`, defaults to `False`): + Whether the decoder is traced or using XLA lazy tensor. If the decoder is traced, next tokens and the beam scores + are computed inside the decoder. model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or + [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if + [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. + Examples: ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... 
)
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> from transformers import AutoTokenizer
+        >>> from optimum.neuron import NeuronModelForSeq2SeqLM
 
-        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
-        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
+        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
+        >>> input_shapes = {"batch_size": 1, "sequence_length": 128, "num_beams": 4}
+        >>> model = NeuronModelForSeq2SeqLM.from_pretrained("t5-small", export=True, dynamic_batch_size=False, **input_shapes)
 
-        >>> input_prompt = "It might be possible to"
-        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
-
-        >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList(
-        ...     [
-        ...         MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
-        ...     ]
-        ... )
-        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
+        >>> input_prompt = "translate English to German: Lets eat good food."
+        >>> inputs = tokenizer(input_prompt, return_tensors="pt")
 
-        >>> outputs = model.greedy_search(
-        ...     input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria
+        >>> # add encoder_outputs to model keyword arguments
+        >>> model_kwargs = {
+        ...     "encoder_outputs": model.get_encoder()(
+        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
+        ...     )
+        ... }
+        >>> # instantiate beam scorer
+        >>> beam_scorer = BeamSearchScorer(
+        ...     batch_size=1,
+        ...     num_beams=num_beams,
+        ...     device=model.device,
         ... )
+        >>> outputs = model.beam_search(input_ids, beam_scorer)
 
         >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
-        ```"""
+        ```
+        """
         # init values
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        use_cache = model_kwargs["use_cache"] if "use_cache" in model_kwargs else False
+        if logits_processor is not None and is_traced_inference:
+            logger.warning(
+                "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron."
+ ) + elif logits_processor is None: + logits_processor = LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() if max_length is not None: warnings.warn( "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", UserWarning, ) stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions @@ -1208,8 +1099,24 @@ def greedy_search( else self.generation_config.return_dict_in_generate ) + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + # Overwrite cur_len + cur_len = seq_length + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + ) + # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None + beam_indices = ( + tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + ) decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None @@ -1221,8 +1128,13 @@ def greedy_search( model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ) - # keep track of which sequences are already finished - unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores_device = "cpu" + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) this_peer_finished = False # used by synced_gpus only while True: @@ -1237,113 +1149,153 @@ def greedy_search( break # prepare model inputs - if use_cache: - # From max_length-sized input_ids, select first - # seq_length - 1 values. 
-
-                if model_kwargs.get("past_key_values") is None:
-                    input_ids_ = input_ids[:, :seq_length]
-                else:
-                    update_indices = torch.stack(
-                        [torch.arange(input_ids.size(0)), torch.tensor(seq_length - 1).repeat(input_ids.size(0))],
-                        dim=-1,
-                    )
-                    input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None]
+            if model_kwargs["use_cache"]:
+                # From max_length-sized input_ids, select first
+                # cur_len - 1 values.
+                update_indices = torch.stack(
+                    [torch.arange(input_ids.size(0)), torch.tensor(cur_len - 1).repeat(input_ids.size(0))], dim=-1
+                )
+                input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None]
                 model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs)
             else:
                 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
 
-            # forward pass to get next token
-            outputs = self(
-                **model_inputs,
-                return_dict=True,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-            )
-
-            if synced_gpus and this_peer_finished:
-                continue  # don't waste resources running the code we don't need
+            if is_traced_inference:
+                next_token_scores, next_tokens, next_indices = self(**model_inputs, beam_scores=beam_scores)
+            else:
+                outputs = self(
+                    **model_inputs,
+                    return_dict=True,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                )
 
-            if not use_cache:
-                one_hot = (
-                    torch.cat(
-                        [
-                            torch.tensor([0]).repeat(1, seq_length - 1),
-                            torch.tensor([1]).repeat(1, 1),
-                            torch.tensor([0]).repeat(1, input_ids.size(1) - seq_length),
-                        ],
-                        dim=1,
+                if synced_gpus and this_peer_finished:
+                    cur_len = cur_len + 1
+                    continue  # don't waste resources running the code we don't need
+
+                if not model_kwargs["use_cache"]:
+                    one_hot = (
+                        torch.cat(
+                            [
+                                torch.tensor([0]).repeat(1, cur_len - 1),
+                                torch.tensor([1]).repeat(1, 1),
+                                torch.tensor([0]).repeat(1, input_ids.size(1) - cur_len),
+                            ],
+                            dim=1,
+                        )
+                        .to(device=outputs.logits.device)
+                        .float()
                     )
-                    .to(device=outputs.logits.device)
-                    .float()
+                    next_token_logits = torch.matmul(one_hot, outputs.logits)
+                    next_token_logits = next_token_logits.squeeze(1)
+                else:
+                    next_token_logits = outputs.logits[:, -1, :]
+
+                # Manually compute log softmax
+                # log_softmax(vi) = vi - max(vi) - log(sum(exp(vi - max(vi))))
+                logit_max, _ = torch.max(next_token_logits, dim=-1, keepdim=True)
+                logsumexp = torch.log(torch.exp(next_token_logits - logit_max).sum(dim=-1, keepdim=True))
+                next_token_scores = next_token_logits - logit_max - logsumexp
+                # (batch_size * num_beams, vocab_size)
+
+                xm.mark_step()
+
+                # We don't want to change every single logit processor, so
+                # we perform this processing on CPU.
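+                # Only the first `cur_len` (non-padded) positions of `input_ids` are exposed to the logits processors.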
+ input_ids_ = input_ids.to("cpu")[:, :cur_len] + next_token_scores_ = next_token_scores.to("cpu") + next_token_scores_processed = logits_processor(input_ids_, next_token_scores_) + + next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores_processed,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + next_token_scores = next_token_scores * 1 + + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search) + next_token_scores, next_tokens = torch.topk( + next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True ) - next_token_logits = torch.matmul(one_hot, outputs.logits) - next_token_logits = next_token_logits.squeeze(1) - else: - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - # Move to cpu to handle arbitrary logits_processor - next_tokens_scores = logits_processor(input_ids.to("cpu")[:, :seq_length], next_token_logits.to("cpu")) - next_tokens_scores = next_tokens_scores.to(input_ids.device) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_tokens_scores,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - # argmax - next_tokens = torch.argmax(next_tokens_scores, dim=-1) + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + next_tokens = next_tokens % vocab_size - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + # stateless + beam_outputs = beam_scorer.process( + input_ids.to("cpu")[:, :cur_len], + next_token_scores.to("cpu"), + next_tokens.to("cpu"), + next_indices.to("cpu"), + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + ) + + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] - # update generated ids, model inputs, and length for next step - batch_size, _ = input_ids.shape update_indices = torch.stack( - [torch.arange(batch_size), torch.tensor(seq_length).repeat(batch_size)], dim=-1 + [torch.arange(batch_beam_size), torch.tensor(cur_len - 1).repeat(batch_beam_size)], dim=-1 ) - input_ids[update_indices[:, 0], update_indices[:, 1]] = next_tokens[:] + update_indices_2 = torch.stack( + 
[torch.arange(batch_beam_size), torch.tensor(cur_len).repeat(batch_beam_size)], dim=-1 + ) + # First select beam_indices + device = input_ids.device + beam_idx_device = beam_idx.to(device=input_ids.device) + input_ids[:, :] = input_ids[beam_idx_device.long(), :] + + # Then append new tokens + input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = beam_next_tokens.unsqueeze(-1).to(device) + input_ids = input_ids * 1 # Hack to materialize tensor + + # update generated ids, model inputs, and length for next step model_kwargs = self._update_model_kwargs_for_xla_generation( outputs, model_kwargs, - batch_size=batch_size, + batch_size=batch_beam_size, is_encoder_decoder=self.config.is_encoder_decoder, max_length=stopping_criteria.max_length, - seq_length=seq_length, - use_cache=use_cache, + seq_length=cur_len, + use_cache=model_kwargs["use_cache"], ) + if is_traced_inference: + self._reorder_cache(beam_idx.to(torch.int64)) + elif model_kwargs["past_key_values"] is not None: + model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx) - seq_length += 1 - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) + if return_dict_in_generate and output_scores: + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - xm.mark_step() + # increase cur_len + cur_len = cur_len + 1 # stop when each sentence is finished, or if we exceed the maximum length - stop_criterion_1 = unfinished_sequences.max() == 0 - + stop_criterion_1 = beam_scorer.is_done if isinstance(stopping_criteria, list): if len(stopping_criteria) == 1: stopping_criteria = stopping_criteria[0] @@ -1351,34 +1303,51 @@ def greedy_search( # Cases that can be handled in XLA without requiring # non-padded input_ids if isinstance(stopping_criteria, MaxLengthCriteria): - stop_criterion_2 = seq_length >= stopping_criteria.max_length + stop_criterion_2 = cur_len >= stopping_criteria.max_length elif isinstance(stopping_criteria, MaxTimeCriteria): stop_criterion_2 = stopping_criteria(input_ids, scores) else: # Other cases will be handled on CPU batch_size, _ = input_ids.shape + input_ids_cpu = input_ids.to("cpu") mask = torch.cat( - [torch.ones(batch_size, seq_length), torch.zeros(batch_size, input_ids.shape[1] - seq_length)], - dim=1, + [torch.ones(batch_size, cur_len), torch.zeros(batch_size, input_ids.shape[1] - cur_len)], dim=1 ).bool() - input_ids_cpu = torch.masked_select(input_ids, mask).reshape((batch_size, seq_length)).to("cpu") + input_ids_cpu = torch.masked_select(input_ids_cpu, mask).reshape((batch_size, cur_len)) scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) if stop_criterion_1 or stop_criterion_2: - this_peer_finished = True + if not synced_gpus: + break + else: + this_peer_finished = True - if this_peer_finished and not synced_gpus: - break + sequence_outputs = beam_scorer.finalize( + input_ids.to("cpu"), + beam_scores.to("cpu"), + next_tokens.to("cpu"), + next_indices.to("cpu"), + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + ) - if streamer is not None: - streamer.end() + for k, v in sequence_outputs.items(): + if type(v) == torch.Tensor: + sequence_outputs[k] = 
sequence_outputs[k].to(input_ids.device) if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: - return GreedySearchEncoderDecoderOutput( - sequences=input_ids, + return BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + beam_indices=sequence_outputs["beam_indices"], encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, decoder_attentions=decoder_attentions, @@ -1386,11 +1355,13 @@ def greedy_search( decoder_hidden_states=decoder_hidden_states, ) else: - return GreedySearchDecoderOnlyOutput( - sequences=input_ids, + return BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + beam_indices=sequence_outputs["beam_indices"], attentions=decoder_attentions, hidden_states=decoder_hidden_states, ) else: - return input_ids + return sequence_outputs["sequences"] diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 23396002e..b52e7e863 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -26,19 +26,12 @@ import torch from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig -from transformers.generation.beam_search import BeamScorer from transformers.generation.logits_process import ( LogitsProcessorList, ) from transformers.generation.stopping_criteria import ( - MaxLengthCriteria, - MaxTimeCriteria, StoppingCriteriaList, ) -from transformers.generation.utils import ( - BeamSearchOutput, - GreedySearchOutput, -) from transformers.modeling_outputs import Seq2SeqLMOutput from ..exporters.neuron import ( @@ -59,6 +52,7 @@ if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel + from transformers.utils import ModelOutput if is_neuronx_available(): import torch_neuronx @@ -357,6 +351,10 @@ def forward( decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, beam_scores=None, + # Leave following kwargs for compatibility, will not have any effect. 
+ return_dict: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: hidden_states = encoder_outputs["last_hidden_state"] @@ -418,290 +416,14 @@ def generate( max_length=kwargs.pop("max_length", None) or max_length, num_beams=num_beams, do_sample=kwargs.pop("do_sample", False), - use_cache=kwargs.pop( - "use_cache", False - ), # `use_cache` is supported by default in `optimum-neuron`, set to False to avoid warning + use_cache=True, # pkv is cached by default decoder_attention_mask=decoder_attention_mask, # Pass fake encoder_outputs so the transfomers code will not invoke the encoder encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, + is_traced_inference=True, ) return output - def beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: "BeamScorer", - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - seq_length: Optional[int] = None, - **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: - """ - Overriding beam search to use next_token_scores returned from neuron device instead of logits. - """ - if logits_processor is not None: - logger.warning( - "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." - ) - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - # Overwrite cur_len - cur_len = seq_length - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None - ) - - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - beam_scores_device = "cpu" - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - while True: - # prepare model inputs - # From max_length-sized input_ids, select first - # cur_len - 1 values. 
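# --- Illustrative sketch (assumed toy values, not the library implementation) ---
# The loop around this point keeps `input_ids` pre-allocated at `max_length` so the
# compiled decoder always sees static shapes: the last generated token is gathered
# with an explicit index pair, and new tokens are written in place rather than
# concatenated. A minimal, self-contained version of that pattern:
import torch

batch_size, max_length, cur_len = 2, 8, 3
input_ids = torch.zeros((batch_size, max_length), dtype=torch.long)
input_ids[:, :cur_len] = torch.tensor([[5, 6, 7], [8, 9, 10]])

# Gather only the token at position `cur_len - 1`, keeping a (batch_size, 1) shape
# so the traced decoder input never changes rank or size.
update_indices = torch.stack(
    [torch.arange(batch_size), torch.tensor(cur_len - 1).repeat(batch_size)], dim=-1
)
last_tokens = input_ids[update_indices[:, 0], update_indices[:, 1], None]
assert last_tokens.shape == (batch_size, 1)

# Write the next token at position `cur_len` in place; the buffer never grows.
next_tokens = torch.tensor([11, 12])
input_ids[torch.arange(batch_size), cur_len] = next_tokens
# --- End of sketch ---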
- update_indices = torch.stack( - [torch.arange(input_ids.size(0)), torch.tensor(cur_len - 1).repeat(input_ids.size(0))], dim=-1 - ) - input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] - model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) - - next_token_scores, next_tokens, next_indices = self(**model_inputs, beam_scores=beam_scores) - - # stateless - beam_outputs = beam_scorer.process( - input_ids.to("cpu")[:, :cur_len], - next_token_scores.to("cpu"), - next_tokens.to("cpu"), - next_indices.to("cpu"), - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - ) - - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - update_indices = torch.stack( - [torch.arange(batch_beam_size), torch.tensor(cur_len - 1).repeat(batch_beam_size)], dim=-1 - ) - update_indices_2 = torch.stack( - [torch.arange(batch_beam_size), torch.tensor(cur_len).repeat(batch_beam_size)], dim=-1 - ) - # First select beam_indices - device = input_ids.device - beam_idx_device = beam_idx.to(device=input_ids.device) - input_ids[:, :] = input_ids[beam_idx_device.long(), :] - - # Then append new tokens - input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = ( - beam_next_tokens.unsqueeze(-1).to(device).to(torch.long) - ) - input_ids = input_ids * 1 # Hack to materialize tensor - - # update generated ids, model inputs, and length for next step - model_kwargs = self._update_model_kwargs_for_xla_generation( - model_kwargs, - batch_size=batch_beam_size, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - self._reorder_cache(beam_idx.to(torch.int64)) - - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - - # increase cur_len - cur_len = cur_len + 1 - - # stop when each sentence is finished, or if we exceed the maximum length - stop_criterion_1 = beam_scorer.is_done - if isinstance(stopping_criteria, list): - if len(stopping_criteria) == 1: - stopping_criteria = stopping_criteria[0] - - # Cases that can be handled in XLA without requiring - # non-padded input_ids - if isinstance(stopping_criteria, MaxLengthCriteria): - stop_criterion_2 = cur_len >= stopping_criteria.max_length - elif isinstance(stopping_criteria, MaxTimeCriteria): - stop_criterion_2 = stopping_criteria(input_ids, scores) - else: - # Other cases will be handled on CPU - batch_size, _ = input_ids.shape - input_ids_cpu = input_ids.to("cpu") - mask = torch.cat( - [torch.ones(batch_size, cur_len), torch.zeros(batch_size, input_ids.shape[1] - cur_len)], dim=1 - ).bool() - input_ids_cpu = torch.masked_select(input_ids_cpu, mask).reshape((batch_size, cur_len)) - scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores - stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) - - if stop_criterion_1 or stop_criterion_2: - if not synced_gpus: - break - - sequence_outputs = beam_scorer.finalize( - input_ids.to("cpu"), - beam_scores.to("cpu"), - next_tokens.to("cpu"), - next_indices.to("cpu"), - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=beam_indices, - ) - - for k, v in sequence_outputs.items(): - if type(v) == torch.Tensor: - sequence_outputs[k] = sequence_outputs[k].to(input_ids.device) - - return sequence_outputs["sequences"] - - def greedy_search( - self, - input_ids: torch.LongTensor, - 
logits_processor: Optional["LogitsProcessorList"] = None, - stopping_criteria: Optional["StoppingCriteriaList"] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - seq_length: Optional[int] = int, - **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: - """ - Overriding greedy sampling to use next tokens returned from neuron device instead of logits. - """ - # init values - if logits_processor is not None: - logger.warning( - "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." - ) - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - from transformers.generation.stopping_criteria import validate_stopping_criteria - - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - - # keep track of which sequences are already finished - unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) - - this_peer_finished = False # used by synced_gpus only - while True: - # prepare model inputs - # From max_length-sized input_ids, select first - # seq_length - 1 values. 
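# --- Illustrative sketch (assumed toy values, not the library implementation) ---
# The greedy loop below pads every finished row instead of shrinking the batch:
# `unfinished_sequences` holds 1 while a row is still generating and drops to 0 once
# it has emitted EOS, after which that row keeps producing the pad token. A minimal,
# self-contained version of that masking step:
import torch

pad_token_id, eos_token_id = 0, 2
eos_token_id_tensor = torch.tensor([eos_token_id])
unfinished_sequences = torch.ones(3, dtype=torch.long)

next_tokens = torch.tensor([4, 2, 7])  # the middle row just produced EOS
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

# Mark rows that hit EOS as finished for all subsequent steps.
unfinished_sequences = unfinished_sequences.mul(
    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
)
assert unfinished_sequences.tolist() == [1, 0, 1]
# --- End of sketch ---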
- - if model_kwargs.get("past_key_values") is None: - input_ids_ = input_ids[:, :seq_length] - else: - update_indices = torch.stack( - [torch.arange(input_ids.size(0)), torch.tensor(seq_length - 1).repeat(input_ids.size(0))], - dim=-1, - ) - input_ids_ = input_ids[update_indices[:, 0], update_indices[:, 1], None] - - model_inputs = self.prepare_inputs_for_generation(input_ids_, **model_kwargs) - - # forward pass to get next token - output = self(**model_inputs) - next_tokens = output[0] - - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - - batch_size, _ = input_ids.shape - update_indices = torch.stack( - [torch.arange(batch_size), torch.tensor(seq_length).repeat(batch_size)], dim=-1 - ) - input_ids[update_indices[:, 0], update_indices[:, 1]] = next_tokens[:] - model_kwargs = self._update_model_kwargs_for_xla_generation( - model_kwargs, - batch_size=batch_size, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - - seq_length += 1 - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - - # stop when each sentence is finished, or if we exceed the maximum length - stop_criterion_1 = unfinished_sequences.max() == 0 - - if isinstance(stopping_criteria, list): - if len(stopping_criteria) == 1: - stopping_criteria = stopping_criteria[0] - - # Cases that can be handled in XLA without requiring - # non-padded input_ids - if isinstance(stopping_criteria, MaxLengthCriteria): - stop_criterion_2 = seq_length >= stopping_criteria.max_length - elif isinstance(stopping_criteria, MaxTimeCriteria): - stop_criterion_2 = stopping_criteria(input_ids, scores) - else: - # Other cases will be handled on CPU - batch_size, _ = input_ids.shape - mask = torch.cat( - [torch.ones(batch_size, seq_length), torch.zeros(batch_size, input_ids.shape[1] - seq_length)], - dim=1, - ).bool() - input_ids_cpu = torch.masked_select(input_ids, mask).reshape((batch_size, seq_length)).to("cpu") - scores_cpu = scores.to("cpu") if torch.is_tensor(scores) else scores - stop_criterion_2 = stopping_criteria(input_ids_cpu, scores_cpu) - - if stop_criterion_1 or stop_criterion_2: - this_peer_finished = True - - if this_peer_finished: - break - - return input_ids - def _reorder_cache(self, beam_idx): """ The cache was reordered during the tracing of the decoder so we can skip it here. This is needed for beam search and not greedy sampling. @@ -716,6 +438,12 @@ def _update_model_kwargs_for_xla_generation( model_kwargs: Dict[str, Any], batch_size: int, is_encoder_decoder: bool = False, + # Leave following kwargs for compatibility, will not have any effect. 
+ outputs: "ModelOutput" = None, + standardize_cache_format: bool = False, + max_length: Optional[int] = None, + seq_length: Optional[int] = None, + use_cache: bool = True, ) -> Dict[str, Any]: mask = self._update_attention(model_kwargs, batch_size, is_encoder_decoder) # sets the updated variables (mask and past_key_values) From e8d72c2b5192b59e51620db658229c7dc4c72ddc Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 28 Nov 2023 22:58:01 +0000 Subject: [PATCH 23/30] fix beam --- optimum/neuron/generation/utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 11f64d88e..e56ddd0b1 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -1150,9 +1150,6 @@ def beam_search( # prepare model inputs if model_kwargs["use_cache"]: - import pdb - - pdb.set_trace() # From max_length-sized input_ids, select first # cur_len - 1 values. update_indices = torch.stack( @@ -1164,7 +1161,8 @@ def beam_search( model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) if is_traced_inference: - next_token_scores, next_tokens, next_indices = self(**model_inputs, beam_scores=beam_scores) + outputs = self(**model_inputs, beam_scores=beam_scores) + next_token_scores, next_tokens, next_indices = outputs else: outputs = self( **model_inputs, @@ -1270,13 +1268,21 @@ def beam_search( input_ids[:, :] = input_ids[beam_idx_device.long(), :] # Then append new tokens - input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = beam_next_tokens.unsqueeze(-1).to(device) + if is_traced_inference: + # int64 is not natively supported by inf2 and has been cast down to int32 + input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = ( + beam_next_tokens.unsqueeze(-1).to(device).to(torch.long) + ) + else: + input_ids[update_indices_2[:, 0], update_indices_2[:, 1], None] = beam_next_tokens.unsqueeze(-1).to( + device + ) input_ids = input_ids * 1 # Hack to materialize tensor # update generated ids, model inputs, and length for next step model_kwargs = self._update_model_kwargs_for_xla_generation( - outputs, - model_kwargs, + outputs=outputs, + model_kwargs=model_kwargs, batch_size=batch_beam_size, is_encoder_decoder=self.config.is_encoder_decoder, max_length=stopping_criteria.max_length, From dd4b1c7fe464a67da6d3a04b1350a690c2a4ef6e Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Nov 2023 00:51:08 +0000 Subject: [PATCH 24/30] fix tests --- optimum/exporters/neuron/convert.py | 8 +++++--- optimum/neuron/modeling_base.py | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 9cace43f7..0c2d5eef3 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -169,9 +169,11 @@ def validate_model_outputs( with torch.no_grad(): reference_model.eval() ref_inputs = config.generate_dummy_inputs(return_tuple=False, **input_shapes) - if reference_model.config.is_encoder_decoder: + if getattr(reference_model.config, "is_encoder_decoder", False): reference_model = config.patch_model_for_export(reference_model, device="cpu", **input_shapes) - if "AutoencoderKL" in getattr(config._config, "_class_name", "") or reference_model.config.is_encoder_decoder: + if "AutoencoderKL" in getattr(config._config, "_class_name", "") or getattr( + reference_model.config, "is_encoder_decoder", False + ): # VAE components for stable 
diffusion or Encoder-Decoder models ref_inputs = tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) @@ -428,7 +430,7 @@ def export_neuronx( dummy_inputs_tuple = tuple(dummy_inputs.values()) aliases = {} - if model.config.is_encoder_decoder: + if getattr(model.config, "is_encoder_decoder", False): checked_model = config.patch_model_for_export(model, **input_shapes) if getattr(config, "is_decoder", False): aliases = config.generate_io_aliases(checked_model) diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 7b3a28ecd..144826740 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -407,7 +407,9 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronConfig": # Neuron config constructuor task = getattr(config, "task") or TasksManager.infer_task_from_model(cls.auto_model_class) task = TasksManager.map_from_synonym(task) - model_type = neuron_configs.get("model_type", None) or config.model_type + model_type = neuron_configs.get("model_type", None) + if not (model_type and model_type != "None"): + model_type = config.model_type neuron_config_constructor = TasksManager.get_exporter_config_constructor( model_type=model_type, exporter="neuron", task=task ) From df7cde79eef301bffb1ef6208c4f4490b694b775 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 29 Nov 2023 18:24:49 +0000 Subject: [PATCH 25/30] support optional outputs for decoder --- optimum/commands/export/neuronx.py | 10 +++++++ optimum/exporters/neuron/__main__.py | 22 ++++++++++++++- optimum/exporters/neuron/base.py | 7 +++++ optimum/exporters/neuron/config.py | 9 +++++++ optimum/exporters/neuron/convert.py | 1 + optimum/exporters/neuron/model_configs.py | 9 ++++++- optimum/exporters/neuron/model_wrappers.py | 31 +++++++++++++++++++--- optimum/exporters/neuron/utils.py | 4 +++ optimum/neuron/generation/utils.py | 15 ++++++----- optimum/neuron/utils/argument_utils.py | 4 +++ 10 files changed, 99 insertions(+), 13 deletions(-) diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py index 1278b604b..5761bac44 100644 --- a/optimum/commands/export/neuronx.py +++ b/optimum/commands/export/neuronx.py @@ -141,6 +141,16 @@ def parse_args_neuronx(parser: "ArgumentParser"): "UNet model ID on huggingface.co or path on disk to load model from. This will replace the unet in the original Stable Diffusion pipeline." ), ) + optional_group.add_argument( + "--output_hidden_states", + action="store_true", + help=("Whether or not for the traced model to return the hidden states of all layers."), + ) + optional_group.add_argument( + "--output_attentions", + action="store_true", + help=("Whether or not for the traced model to return the attentions tensors of all attention layers."), + ) class NeuronxExportCommand(BaseOptimumCLICommand): diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 70abd9619..7d65fb241 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -121,6 +121,18 @@ def normalize_input_shapes(task: str, args: argparse.Namespace) -> Dict[str, int return input_shapes +def customize_optional_outputs(args: argparse.Namespace) -> Dict[str, bool]: + """ + Customize optional outputs of the traced model, eg. if `output_attentions=True`, the attentions tensors will be traced. 
+ """ + possible_outputs = ["output_attentions", "output_hidden_states"] + + customized_outputs = {} + for name in possible_outputs: + customized_outputs[name] = getattr(args, name, False) + return customized_outputs + + def normalize_stable_diffusion_input_shapes( args: argparse.Namespace, ) -> Dict[str, Dict[str, int]]: @@ -190,6 +202,7 @@ def _get_submodels_and_neuron_configs( dynamic_batch_size: bool = False, model_name_or_path: Optional[Union[str, Path]] = None, submodels: Dict[str, Union[Path, str]] = None, + optional_outputs: Dict[str, bool] = None, ): is_stable_diffusion = "stable-diffusion" in task is_encoder_decoder = ( @@ -202,7 +215,7 @@ def _get_submodels_and_neuron_configs( ) elif is_encoder_decoder: models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_encoder_decoder( - model, input_shapes, task, output, dynamic_batch_size, model_name_or_path + model, input_shapes, task, output, dynamic_batch_size, model_name_or_path, optional_outputs ) else: neuron_config_constructor = TasksManager.get_exporter_config_constructor( @@ -273,6 +286,7 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder( output: Path, dynamic_batch_size: bool = False, model_name_or_path: Optional[Union[str, Path]] = None, + optional_outputs: Dict[str, bool] = None, ): if is_neuron_available(): raise RuntimeError( @@ -284,6 +298,7 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder( task=task, dynamic_batch_size=dynamic_batch_size, input_shapes=input_shapes, + optional_outputs=optional_outputs, ) output_model_names = { ENCODER_NAME: os.path.join(ENCODER_NAME, NEURON_FILE_NAME), @@ -310,6 +325,7 @@ def main_export( use_auth_token: Optional[Union[bool, str]] = None, do_validation: bool = True, submodels: Dict[str, Union[Path, str]] = None, + optional_outputs: Dict[str, bool] = None, **input_shapes, ): output = Path(output) @@ -341,6 +357,7 @@ def main_export( dynamic_batch_size=dynamic_batch_size, model_name_or_path=model_name_or_path, submodels=submodels, + optional_outputs=optional_outputs, ) _, neuron_outputs = export_models( @@ -408,6 +425,8 @@ def main(): input_shapes = normalize_input_shapes(task, args) submodels = None + optional_outputs = customize_optional_outputs(args) + main_export( model_name_or_path=args.model, output=args.output, @@ -419,6 +438,7 @@ def main(): trust_remote_code=args.trust_remote_code, do_validation=not args.disable_validation, submodels=submodels, + optional_outputs=optional_outputs, **input_shapes, ) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 6b005869f..c5e3c9cbf 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -120,6 +120,8 @@ def __init__( point_batch_size: Optional[int] = None, nb_points_per_image: Optional[int] = None, num_beams: int = 1, + output_attentions: bool = False, + output_hidden_states: bool = False, # TODO: add custom dtype after optimum 1.13 release # int_dtype: str = "int64", # float_dtype: str = "fp32", @@ -156,6 +158,11 @@ def __init__( input_shapes[name] = value setattr(self, name, value) setattr(self, "input_shapes", input_shapes) + setattr( + self, + "optional_outputs", + {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states}, + ) setattr(self, "compiler_type", compiler_type) setattr(self, "compiler_version", compiler_version) diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index fccac7e39..597b886c7 100644 --- a/optimum/exporters/neuron/config.py +++ 
b/optimum/exporters/neuron/config.py @@ -118,6 +118,15 @@ def outputs(self) -> List[str]: + [f"past.{idx}.cross.key" for idx in range(self._config.num_decoder_layers)] + [f"past.{idx}.cross.value" for idx in range(self._config.num_decoder_layers)] ) + if self.optional_outputs["output_attentions"]: + # Flatten attentions tensors of all attention layers + common_outputs += [f"decoder_attention.{idx}" for idx in range(self._config.num_decoder_layers)] + if self.optional_outputs["output_hidden_states"]: + # Flatten hidden states of all layers + common_outputs += [ + f"decoder_hidden_state.{idx}" for idx in range(self._config.num_decoder_layers + 1) + ] # +1 for the embedding layer + return common_outputs def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 0c2d5eef3..a3bfa9857 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -340,6 +340,7 @@ def export_models( compiler_version=NEURON_COMPILER_VERSION, model_type=getattr(sub_neuron_config, "MODEL_TYPE", None), task=getattr(sub_neuron_config, "task", None), + optional_outputs=getattr(sub_neuron_config, "optional_outputs", None), ) if isinstance(model_config, PretrainedConfig): model_config = DiffusersPretrainedConfig.from_dict(model_config.__dict__) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index e4dda2fa5..eaf03ba51 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -457,8 +457,15 @@ def patch_model_for_export(self, model, device="xla", **kwargs): batch_size = kwargs.pop("batch_size", 1) sequence_length = kwargs.pop("sequence_length", 1) num_beams = kwargs.pop("num_beams", 1) + return self.CUSTOM_MODEL_WRAPPER( - model, batch_size=batch_size, sequence_length=sequence_length, num_beams=num_beams, device=device + model, + batch_size=batch_size, + sequence_length=sequence_length, + num_beams=num_beams, + output_hidden_states=self.optional_outputs["output_hidden_states"], + output_attentions=self.optional_outputs["output_attentions"], + device=device, ) def generate_io_aliases(self, model): diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index abc63c114..31d6d00ba 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -143,6 +143,8 @@ def __init__( batch_size: int, sequence_length: int, num_beams: int = 1, + output_hidden_states: bool = False, + output_attentions: bool = False, device: str = "xla", tp_degree: Optional[int] = None, ): @@ -152,6 +154,8 @@ def __init__( self.batch_size = batch_size self.sequence_length = sequence_length self.num_beams = num_beams + self.output_hidden_states = output_hidden_states + self.output_attentions = output_attentions self.device = device self.tp_degree = tp_degree @@ -259,12 +263,21 @@ def forward( encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, use_cache=True, - output_attentions=False, - output_hidden_states=False, + output_attentions=self.output_attentions, + output_hidden_states=self.output_hidden_states, ) last_hidden_state = decoder_output["last_hidden_state"] past_key_values = decoder_output["past_key_values"] + if self.output_hidden_states: + decoder_hidden_states = [ + hidden_state for hidden_state in decoder_output["hidden_states"] + ] # flatten `hidden_states` which is a 
tuple of tensors + + if self.output_attentions: + decoder_attentions = [ + attention for attention in decoder_output["attentions"] + ] # flatten `hidden_states` which is a tuple of tensors if self.config.tie_word_embeddings: # Rescale output before projecting on vocab @@ -307,8 +320,18 @@ def forward( next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") next_tokens = next_tokens % vocab_size - return [next_token_scores, next_tokens, next_indices] + past_key_values_sa + past_key_values_ca + neuron_outputs = [next_token_scores, next_tokens, next_indices] + past_key_values_sa + past_key_values_ca + else: # Greedy next_tokens = torch.argmax(next_token_logits, dim=-1) - return [next_tokens] + past_key_values_sa + past_key_values_ca + + neuron_outputs = [next_tokens] + past_key_values_sa + past_key_values_ca + + if self.output_hidden_states: + neuron_outputs += decoder_hidden_states + + if self.output_attentions: + neuron_outputs += decoder_attentions + + return neuron_outputs diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 0e7741ec8..afca52597 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -352,6 +352,7 @@ def get_encoder_decoder_models_for_export( task: str, input_shapes: Dict[str, int], dynamic_batch_size: Optional[bool] = False, + optional_outputs: Dict[str, bool] = None, ) -> Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]: """ Returns the components of an encoder-decoder model and their subsequent neuron configs. @@ -366,6 +367,8 @@ def get_encoder_decoder_models_for_export( Static shapes used for compiling the encoder and the decoder. dynamic_batch_size (`bool`, defaults to `False`): Whether the Neuron compiled model supports dynamic batch size. + optional_outputs (`Dict[str, bool]`, defaults to `None`) + Whether to trace some optional output tensors. Returns: `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and @@ -397,6 +400,7 @@ def get_encoder_decoder_models_for_export( config=model.config, task=task, dynamic_batch_size=dynamic_batch_size, + **optional_outputs, **input_shapes, ) models_for_export[DECODER_NAME] = (model, decoder_neuron_config) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index e56ddd0b1..3476fe0a8 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -17,7 +17,7 @@ import copy import inspect import warnings -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -51,9 +51,6 @@ from transformers.utils import ModelOutput, logging -if TYPE_CHECKING: - pass - logger = logging.get_logger(__name__) @@ -873,6 +870,11 @@ def greedy_search( else: next_tokens = outputs[0] + if return_dict_in_generate and output_scores: + logger.warning( + "`output_scores` will be neglected because currently we do not trace `next_token_scores` for greedy search. If you want us to support the option during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." 
+ ) + # finished sentences should have their next token be a padding token if eos_token_id is not None: if pad_token_id is None: @@ -1068,7 +1070,7 @@ def beam_search( # init values if logits_processor is not None and is_traced_inference: logger.warning( - "`logits_processor` will not be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." + "`logits_processor` will be neglected because in `optimum-neuron`, `next_tokens` is computed inside the compiled decoder. If you want us to support custom logits_processor during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." ) elif logits_processor is None: logits_processor = LogitsProcessorList() @@ -1130,7 +1132,6 @@ def beam_search( # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens # of the first beam are considered to avoid sampling the exact same tokens across all beams. - # beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores_device = "cpu" beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=beam_scores_device) beam_scores[:, 1:] = -1e9 @@ -1203,7 +1204,7 @@ def beam_search( xm.mark_step() # We don't want to change every single logit processor, so - # we peform this processing on CPU. + # we perform this processing on CPU. input_ids_ = input_ids.to("cpu")[:, :cur_len] next_token_scores_ = next_token_scores.to("cpu") next_token_scores_processed = logits_processor(input_ids_, next_token_scores_) diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index b7e9b4ab0..5400c3065 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -147,6 +147,7 @@ def store_compilation_config( compiler_version: str, model_type: Optional[str] = None, task: str = None, + optional_outputs: Dict[str, bool] = None, **kwargs, ): if isinstance(config, OrderedDict): @@ -182,6 +183,9 @@ def store_compilation_config( elif neuron_model_type != original_model_type: config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. 
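# --- Illustrative sketch (assumed values, not produced by the exporter) ---
# `store_compilation_config` records these arguments under the `neuron` section of
# the exported `config.json`, which is how the runtime can later tell whether the
# optional tensors were traced. A rough picture of that section and of how it could
# be queried; every concrete value below is an assumption for illustration.
neuron_section = {
    "model_type": "t5-decoder",  # Neuron custom model_type, as noted above
    "task": "text2text-generation",
    "optional_outputs": {"output_attentions": True, "output_hidden_states": True},
}

optional_outputs = neuron_section.get("optional_outputs", {})
if optional_outputs.get("output_attentions", False):
    print("Attention tensors were traced and can be returned at generation time.")
# --- End of sketch ---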
+ if optional_outputs is not None: + config_args["optional_outputs"] = optional_outputs + update_func("neuron", config_args) if hasattr(config, "_diffusers_version"): From 92cd6e5b297fcfa465b9fe85dc4a34878fa27378 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 1 Dec 2023 16:07:23 +0000 Subject: [PATCH 26/30] enhance tests --- optimum/exporters/neuron/__main__.py | 36 ++++++-- optimum/exporters/neuron/base.py | 7 +- optimum/exporters/neuron/config.py | 12 ++- optimum/exporters/neuron/convert.py | 3 +- optimum/exporters/neuron/model_configs.py | 4 +- optimum/exporters/neuron/model_wrappers.py | 6 +- optimum/exporters/neuron/utils.py | 12 ++- optimum/neuron/generation/utils.py | 102 ++++++++++++--------- optimum/neuron/modeling_base.py | 17 ++-- optimum/neuron/modeling_seq2seq.py | 55 ++++++++--- optimum/neuron/utils/argument_utils.py | 8 +- tests/cli/test_export_cli.py | 33 ++++++- tests/exporters/test_export.py | 4 +- tests/generation/conftest.py | 46 +++++++++- tests/generation/test_generate.py | 80 +++++++++++++++- 15 files changed, 324 insertions(+), 101 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 7d65fb241..0fcf91a79 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -201,8 +201,9 @@ def _get_submodels_and_neuron_configs( output: Path, dynamic_batch_size: bool = False, model_name_or_path: Optional[Union[str, Path]] = None, - submodels: Dict[str, Union[Path, str]] = None, - optional_outputs: Dict[str, bool] = None, + submodels: Optional[Dict[str, Union[Path, str]]] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, ): is_stable_diffusion = "stable-diffusion" in task is_encoder_decoder = ( @@ -210,14 +211,25 @@ def _get_submodels_and_neuron_configs( ) if is_stable_diffusion: + # TODO: Enable optional outputs for Stable Diffusion + if output_attentions or output_hidden_states: + raise ValueError( + f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet." + ) models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_diffusion( model, input_shapes, task, output, dynamic_batch_size, submodels ) elif is_encoder_decoder: + optional_outputs = {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states} models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_encoder_decoder( - model, input_shapes, task, output, dynamic_batch_size, model_name_or_path, optional_outputs + model, input_shapes, task, output, dynamic_batch_size, model_name_or_path, **optional_outputs ) else: + # TODO: Enable optional outputs for encoders + if output_attentions or output_hidden_states: + raise ValueError( + f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet." 
+ ) neuron_config_constructor = TasksManager.get_exporter_config_constructor( model=model, exporter="neuron", task=task ) @@ -235,7 +247,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( task: str, output: Path, dynamic_batch_size: bool = False, - submodels: Dict[str, Union[Path, str]] = None, + submodels: Optional[Dict[str, Union[Path, str]]] = None, ): model = replace_stable_diffusion_submodels(model, submodels) check_compiler_compatibility_for_stable_diffusion() @@ -286,7 +298,8 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder( output: Path, dynamic_batch_size: bool = False, model_name_or_path: Optional[Union[str, Path]] = None, - optional_outputs: Dict[str, bool] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, ): if is_neuron_available(): raise RuntimeError( @@ -298,7 +311,8 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder( task=task, dynamic_batch_size=dynamic_batch_size, input_shapes=input_shapes, - optional_outputs=optional_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, ) output_model_names = { ENCODER_NAME: os.path.join(ENCODER_NAME, NEURON_FILE_NAME), @@ -324,8 +338,9 @@ def main_export( local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, do_validation: bool = True, - submodels: Dict[str, Union[Path, str]] = None, - optional_outputs: Dict[str, bool] = None, + submodels: Optional[Dict[str, Union[Path, str]]] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, **input_shapes, ): output = Path(output) @@ -357,7 +372,8 @@ def main_export( dynamic_batch_size=dynamic_batch_size, model_name_or_path=model_name_or_path, submodels=submodels, - optional_outputs=optional_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, ) _, neuron_outputs = export_models( @@ -438,7 +454,7 @@ def main(): trust_remote_code=args.trust_remote_code, do_validation=not args.disable_validation, submodels=submodels, - optional_outputs=optional_outputs, + **optional_outputs, **input_shapes, ) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index c5e3c9cbf..5f7277b53 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -158,11 +158,8 @@ def __init__( input_shapes[name] = value setattr(self, name, value) setattr(self, "input_shapes", input_shapes) - setattr( - self, - "optional_outputs", - {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states}, - ) + setattr(self, "output_attentions", output_attentions) + setattr(self, "output_hidden_states", output_hidden_states) setattr(self, "compiler_type", compiler_type) setattr(self, "compiler_version", compiler_version) diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 597b886c7..01a3ae86a 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -118,15 +118,19 @@ def outputs(self) -> List[str]: + [f"past.{idx}.cross.key" for idx in range(self._config.num_decoder_layers)] + [f"past.{idx}.cross.value" for idx in range(self._config.num_decoder_layers)] ) - if self.optional_outputs["output_attentions"]: - # Flatten attentions tensors of all attention layers - common_outputs += [f"decoder_attention.{idx}" for idx in range(self._config.num_decoder_layers)] - if self.optional_outputs["output_hidden_states"]: + + if self.output_hidden_states: # Flatten hidden states of all layers common_outputs += 
[ f"decoder_hidden_state.{idx}" for idx in range(self._config.num_decoder_layers + 1) ] # +1 for the embedding layer + if self.output_attentions: + # Flatten attentions tensors of all attention layers + common_outputs += [f"decoder_attention.{idx}" for idx in range(self._config.num_decoder_layers)] + if getattr(self._config, "is_encoder_decoder", False) is True: + common_outputs += [f"cross_attention.{idx}" for idx in range(self._config.num_decoder_layers)] + return common_outputs def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index a3bfa9857..6f712d4cb 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -340,7 +340,8 @@ def export_models( compiler_version=NEURON_COMPILER_VERSION, model_type=getattr(sub_neuron_config, "MODEL_TYPE", None), task=getattr(sub_neuron_config, "task", None), - optional_outputs=getattr(sub_neuron_config, "optional_outputs", None), + output_attentions=getattr(sub_neuron_config, "output_attentions", False), + output_hidden_states=getattr(sub_neuron_config, "output_hidden_states", False), ) if isinstance(model_config, PretrainedConfig): model_config = DiffusersPretrainedConfig.from_dict(model_config.__dict__) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index eaf03ba51..fe5835198 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -463,8 +463,8 @@ def patch_model_for_export(self, model, device="xla", **kwargs): batch_size=batch_size, sequence_length=sequence_length, num_beams=num_beams, - output_hidden_states=self.optional_outputs["output_hidden_states"], - output_attentions=self.optional_outputs["output_attentions"], + output_hidden_states=self.output_hidden_states, + output_attentions=self.output_attentions, device=device, ) diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 31d6d00ba..c9e3a6e93 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -277,7 +277,10 @@ def forward( if self.output_attentions: decoder_attentions = [ attention for attention in decoder_output["attentions"] - ] # flatten `hidden_states` which is a tuple of tensors + ] # flatten `decoder_attentions` which is a tuple of tensors + cross_attentions = [ + attention for attention in decoder_output["cross_attentions"] + ] # flatten `cross_attentions` which is a tuple of tensors if self.config.tie_word_embeddings: # Rescale output before projecting on vocab @@ -333,5 +336,6 @@ def forward( if self.output_attentions: neuron_outputs += decoder_attentions + neuron_outputs += cross_attentions return neuron_outputs diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index afca52597..b49817f40 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -352,7 +352,8 @@ def get_encoder_decoder_models_for_export( task: str, input_shapes: Dict[str, int], dynamic_batch_size: Optional[bool] = False, - optional_outputs: Dict[str, bool] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, ) -> Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]: """ Returns the components of an encoder-decoder model and their subsequent neuron configs. 
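# --- Illustrative sketch (hypothetical helper, not part of the exporter API) ---
# The traced decoder returns its optional tensors as one flat list appended after the
# next-token tensors and the KV cache, in the order declared above: hidden states
# (one per layer plus the embedding layer), then self-attentions, then cross-attentions.
# A small sketch of regrouping such a flat tail by name; `regroup_optional_outputs`
# and the toy sizes are assumptions for illustration only.
from typing import Dict, List, Sequence

import torch


def regroup_optional_outputs(
    flat_tail: Sequence[torch.Tensor], num_decoder_layers: int
) -> Dict[str, List[torch.Tensor]]:
    n_hidden = num_decoder_layers + 1  # +1 for the embedding layer
    return {
        "decoder_hidden_states": list(flat_tail[:n_hidden]),
        "decoder_attentions": list(flat_tail[n_hidden : n_hidden + num_decoder_layers]),
        "cross_attentions": list(flat_tail[n_hidden + num_decoder_layers :]),
    }


# Toy usage with 2 decoder layers: 3 hidden states + 2 self- + 2 cross-attentions.
grouped = regroup_optional_outputs([torch.zeros(1)] * 7, num_decoder_layers=2)
assert len(grouped["decoder_hidden_states"]) == 3
# --- End of sketch ---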
@@ -367,8 +368,10 @@ def get_encoder_decoder_models_for_export( Static shapes used for compiling the encoder and the decoder. dynamic_batch_size (`bool`, defaults to `False`): Whether the Neuron compiled model supports dynamic batch size. - optional_outputs (`Dict[str, bool]`, defaults to `None`) - Whether to trace some optional output tensors. + output_attentions (`bool`, defaults to `False`): + Whether or not for the traced model to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, defaults to `False`): + Whether or not for the traced model to return the hidden states of all layers. Returns: `Dict[str, Tuple["PreTrainedModel", "NeuronConfig"]]`: A Dict containing the model and @@ -400,7 +403,8 @@ def get_encoder_decoder_models_for_export( config=model.config, task=task, dynamic_batch_size=dynamic_batch_size, - **optional_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, **input_shapes, ) models_for_export[DECODER_NAME] = (model, decoder_neuron_config) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 3476fe0a8..51027af4d 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -416,7 +416,7 @@ def generate( model_kwargs["use_cache"] = generation_config.use_cache accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) - requires_attention_mask = "encoder_outputs" not in model_kwargs + requires_attention_mask = "encoder_outputs" not in model_kwargs and not is_traced_inference if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( @@ -434,7 +434,7 @@ def generate( "generation results, please set `padding_side='left'` when initializing the tokenizer." ) - if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: + if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs and not is_traced_inference: # if model is encoder decoder encoder_outputs are created # and added to `model_kwargs` model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( @@ -767,7 +767,14 @@ def greedy_search( ) # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None + scores = None + if return_dict_in_generate and output_scores: + if is_traced_inference: + logger.warning( + "`output_scores` will be neglected because currently we do not trace `next_token_scores` for greedy search (we do only in beam search). If you want us to support the option during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." 
+ ) + else: + scores = () decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None @@ -847,32 +854,28 @@ def greedy_search( next_tokens_scores = logits_processor(input_ids.to("cpu")[:, :seq_length], next_token_logits.to("cpu")) next_tokens_scores = next_tokens_scores.to(input_ids.device) - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_tokens_scores,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - # argmax next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + if return_dict_in_generate and output_scores: + scores += (next_tokens_scores,) else: next_tokens = outputs[0] - if return_dict_in_generate and output_scores: - logger.warning( - "`output_scores` will be neglected because currently we do not trace `next_token_scores` for greedy search. If you want us to support the option during the compilation, please file an issue to https://github.com/huggingface/optimum-neuron." + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) ) # finished sentences should have their next token be a padding token @@ -1162,8 +1165,19 @@ def beam_search( model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) if is_traced_inference: - outputs = self(**model_inputs, beam_scores=beam_scores) - next_token_scores, next_tokens, next_indices = outputs + outputs = self( + **model_inputs, + beam_scores=beam_scores, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + next_token_scores = outputs.next_token_scores + next_tokens = outputs.next_tokens + next_indices = outputs.next_indices + + if return_dict_in_generate and output_scores: + scores += (next_token_scores,) else: outputs = self( **model_inputs, @@ -1211,24 +1225,6 @@ def beam_search( next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores) - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores_processed,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - # reshape for beam search vocab_size = next_token_scores.shape[-1] next_token_scores = 
next_token_scores.view(batch_size, num_beams * vocab_size) @@ -1242,6 +1238,24 @@ def beam_search( next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") next_tokens = next_tokens % vocab_size + if return_dict_in_generate and output_scores: + scores += (next_token_scores_processed,) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) # stateless beam_outputs = beam_scorer.process( input_ids.to("cpu")[:, :cur_len], diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index 144826740..d9daa46ac 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -297,14 +297,15 @@ def _from_transformers( ) store_compilation_config( - config, - input_shapes, - compiler_kwargs, - input_names, - output_names, - dynamic_batch_size, - compiler_type, - compiler_version, + config=config, + input_shapes=input_shapes, + compiler_kwargs=compiler_kwargs, + input_names=input_names, + output_names=output_names, + dynamic_batch_size=dynamic_batch_size, + compiler_type=compiler_type, + compiler_version=compiler_version, + task=task, ) config.save_pretrained(save_dir_path) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index b52e7e863..3395cc7a6 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -32,7 +32,7 @@ from transformers.generation.stopping_criteria import ( StoppingCriteriaList, ) -from transformers.modeling_outputs import Seq2SeqLMOutput +from transformers.modeling_outputs import ModelOutput from ..exporters.neuron import ( NeuronConfig, @@ -268,6 +268,8 @@ def _from_transformers( disable_fast_relayout: Optional[bool] = False, disable_fallback: bool = False, dynamic_batch_size: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, **kwargs_shapes, ) -> "NeuronModelForConditionalGeneration": if dynamic_batch_size is True: @@ -304,6 +306,8 @@ def _from_transformers( local_files_only=local_files_only, use_auth_token=use_auth_token, do_validation=False, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, **kwargs_shapes, ) @@ -350,12 +354,11 @@ def forward( decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - beam_scores=None, - # Leave following kwargs for compatibility, will not have any effect. 
+ beam_scores: Optional[torch.FloatTensor] = None, return_dict: bool = False, output_attentions: bool = False, output_hidden_states: bool = False, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + ) -> Union[Tuple[torch.FloatTensor], ModelOutput]: hidden_states = encoder_outputs["last_hidden_state"] if not hasattr(self, "beam_idx"): @@ -363,15 +366,40 @@ def forward( num_beams = attention_mask.shape[0] self.beam_idx = torch.arange(0, num_beams, dtype=torch.int64) - decoder_outputs = self.decoder( + outputs = self.decoder( decoder_input_ids, decoder_attention_mask, hidden_states, attention_mask, self.beam_idx, beam_scores ) - next_token_scores = decoder_outputs[0] - next_tokens = decoder_outputs[1] - next_indices = decoder_outputs[2] + # Fetch optional outputs + cur_idx = 0 + cross_attentions = None + decoder_attentions = None + decoder_hidden_states = None + + # Skip pkv which can't be copied from memory to buffer + if output_attentions and self.config.neuron.get("output_attentions"): + if self.config.is_encoder_decoder: + cross_attentions = outputs[-self.config.num_decoder_layers :] + cur_idx += self.config.num_decoder_layers + decoder_attentions = outputs[-(self.config.num_decoder_layers + cur_idx) : -cur_idx] + cur_idx += self.config.num_decoder_layers + + if output_hidden_states and self.config.neuron.get("output_hidden_states"): + decoder_hidden_states = outputs[-(self.config.num_decoder_layers + 1 + cur_idx) : -cur_idx] + + decoder_outputs = ModelOutput( + next_token_scores=outputs[0], + next_tokens=outputs[1], + next_indices=outputs[2], + cross_attentions=cross_attentions, + decoder_attentions=decoder_attentions, + decoder_hidden_states=decoder_hidden_states, + ) - return next_token_scores, next_tokens, next_indices + if return_dict: + return decoder_outputs + else: + return decoder_outputs.to_tuple() def generate( self, @@ -382,7 +410,7 @@ def generate( stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, assistant_model: Optional["PreTrainedModel"] = None, - num_return_sequences: Optional[int] = None, + num_return_sequences: int = 1, **kwargs, ): max_length = self.neuron_configs[ENCODER_NAME].sequence_length @@ -414,9 +442,14 @@ def generate( assistant_model=assistant_model, num_return_sequences=num_return_sequences, max_length=kwargs.pop("max_length", None) or max_length, + max_new_tokens=kwargs.pop("max_new_tokens", None), + output_attentions=kwargs.pop("output_attentions", False), + output_hidden_states=kwargs.pop("output_hidden_states", False), + output_scores=kwargs.pop("output_scores", False), + return_dict_in_generate=kwargs.pop("return_dict_in_generate", False), num_beams=num_beams, do_sample=kwargs.pop("do_sample", False), - use_cache=True, # pkv is cached by default + use_cache=True, # pkv is cached by default in decoder_attention_mask=decoder_attention_mask, # Pass fake encoder_outputs so the transfomers code will not invoke the encoder encoder_outputs={"last_hidden_state": torch.ones((batch_size, max_length, 1))}, diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index 5400c3065..4798136e1 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -147,7 +147,8 @@ def store_compilation_config( compiler_version: str, model_type: Optional[str] = None, task: str = None, - optional_outputs: Dict[str, bool] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, 
**kwargs, ): if isinstance(config, OrderedDict): @@ -183,8 +184,9 @@ def store_compilation_config( elif neuron_model_type != original_model_type: config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. - if optional_outputs is not None: - config_args["optional_outputs"] = optional_outputs + # Add args of optional outputs + config_args["output_attentions"] = output_attentions + config_args["output_hidden_states"] = output_hidden_states update_func("neuron", config_args) diff --git a/tests/cli/test_export_cli.py b/tests/cli/test_export_cli.py index 61ed9d5af..a8abf30f4 100644 --- a/tests/cli/test_export_cli.py +++ b/tests/cli/test_export_cli.py @@ -250,7 +250,7 @@ def test_replace_unet(self): ) @requires_neuronx - def test_t5(self): + def test_encoder_decoder(self): model_id = "hf-internal-testing/tiny-random-t5" with tempfile.TemporaryDirectory() as tempdir: subprocess.run( @@ -277,3 +277,34 @@ def test_t5(self): shell=False, check=True, ) + + @requires_neuronx + def test_encoder_decoder_optional_outputs(self): + model_id = "hf-internal-testing/tiny-random-t5" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + [ + "optimum-cli", + "export", + "neuron", + "--model", + model_id, + "--task", + "text2text-generation", + "--batch_size", + "1", + "--sequence_length", + "18", + "--num_beams", + "4", + "--auto_cast", + "matmul", + "--auto_cast_type", + "bf16", + "--output_hidden_states", + "--output_attentions", + tempdir, + ], + shell=False, + check=True, + ) diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index 76b24a560..41507453a 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -225,7 +225,7 @@ class NeuronEncoderDecoderExportTestCase(unittest.TestCase): """ @parameterized.expand(ENCODER_DECODER_MODELS_TINY.items()) - def test_export_for_encoder_decoder_models(self, model_name, model_id): + def test_export_encoder_decoder_models(self, model_name, model_id): set_seed(SEED) # prepare neuron config / models @@ -239,6 +239,8 @@ def test_export_for_encoder_decoder_models(self, model_name, model_id): task="text2text-generation", output=Path(tmpdirname), model_name_or_path=model_id, + output_attentions=True, + output_hidden_states=True, ) _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py index 85f203f85..c39a03b38 100644 --- a/tests/generation/conftest.py +++ b/tests/generation/conftest.py @@ -65,7 +65,7 @@ def neuron_decoder_path(export_decoder_id): @pytest.fixture(scope="module") @requires_neuronx -def neuron_seq2seq_path(export_seq2seq_id): +def neuron_seq2seq_beam_path(export_seq2seq_id): model = NeuronModelForSeq2SeqLM.from_pretrained( export_seq2seq_id, export=True, batch_size=1, sequence_length=64, num_beams=4 ) @@ -79,6 +79,28 @@ def neuron_seq2seq_path(export_seq2seq_id): yield model_path +@pytest.fixture(scope="module") +@requires_neuronx +def neuron_seq2seq_beam_path_with_optional_outputs(export_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, + export=True, + batch_size=1, + sequence_length=64, + num_beams=4, + output_attentions=True, + output_hidden_states=True, + ) + model_dir = TemporaryDirectory() + model_path = model_dir.name + model.save_pretrained(model_path) + del model + # Yield instead of returning to keep a reference to the temporary directory. 
+ # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + yield model_path + + @pytest.fixture(scope="module") @requires_neuronx def neuron_seq2seq_greedy_path(export_seq2seq_id): @@ -95,6 +117,28 @@ def neuron_seq2seq_greedy_path(export_seq2seq_id): yield model_path +@pytest.fixture(scope="module") +@requires_neuronx +def neuron_seq2seq_greedy_path_with_optional_outputs(export_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained( + export_seq2seq_id, + export=True, + batch_size=1, + sequence_length=64, + num_beams=1, + output_attentions=True, + output_hidden_states=True, + ) + model_dir = TemporaryDirectory() + model_path = model_dir.name + model.save_pretrained(model_path) + del model + # Yield instead of returning to keep a reference to the temporary directory. + # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + yield model_path + + @pytest.fixture(scope="module") def neuron_push_decoder_id(export_decoder_id): model_name = export_decoder_id.split("/")[-1] diff --git a/tests/generation/test_generate.py b/tests/generation/test_generate.py index 06cbed335..f50b0fb59 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -63,11 +63,47 @@ def test_model_generation_input_dimensions(neuron_decoder_path): @is_inferentia_test @requires_neuronx -def test_seq2seq_generation_beam(neuron_seq2seq_path): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) - tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_path) +def test_seq2seq_generation_beam(neuron_seq2seq_beam_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_beam_path) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_beam_path) inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") - output = model.generate(**inputs, num_return_sequences=1) + + # 1. max length + output = model.generate(**inputs, num_return_sequences=2, max_length=5) + assert len(output[0]) <= 5 + + # 2. min length + output = model.generate(**inputs, num_return_sequences=2, min_length=10) + assert len(output[0]) >= 10 + + # 3. 
max new tokens + output = model.generate(**inputs, num_return_sequences=2, max_new_tokens=5) + assert len(output[0].unique()) <= 5 + + return output + + +@is_inferentia_test +@requires_neuronx +def test_seq2seq_generation_beam_with_optional_outputs(neuron_seq2seq_beam_path_with_optional_outputs): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_beam_path_with_optional_outputs) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_beam_path_with_optional_outputs) + inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") + + output = model.generate( + **inputs, + num_return_sequences=1, + max_length=20, + output_scores=True, + output_attentions=True, + output_hidden_states=True, + return_dict_in_generate=True, + ) + assert "scores" in output + assert "decoder_attentions" in output + assert "cross_attentions" in output + assert "decoder_hidden_states" in output + return output @@ -77,5 +113,39 @@ def test_seq2seq_generation_greedy(neuron_seq2seq_greedy_path): model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_greedy_path) inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") - output = model.generate(**inputs, num_return_sequences=1) + + # 1. max length + output = model.generate(**inputs, num_return_sequences=1, max_length=5) + assert len(output[0]) <= 5 + + # 2. min length + output = model.generate(**inputs, num_return_sequences=1, min_length=10) + assert len(output[0]) >= 10 + + # 3. max new tokens + output = model.generate(**inputs, num_return_sequences=1, max_new_tokens=5) + assert len(output[0].unique()) <= 5 + + return output + + +@is_inferentia_test +@requires_neuronx +def test_seq2seq_generation_greedy_with_optional_outputs(neuron_seq2seq_greedy_path_with_optional_outputs): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path_with_optional_outputs) + tokenizer = AutoTokenizer.from_pretrained(neuron_seq2seq_greedy_path_with_optional_outputs) + inputs = tokenizer("translate English to German: Lets eat good food.", return_tensors="pt") + + output = model.generate( + **inputs, + num_return_sequences=1, + max_length=20, + output_attentions=True, + output_hidden_states=True, + return_dict_in_generate=True, + ) + assert "decoder_attentions" in output + assert "cross_attentions" in output + assert "decoder_hidden_states" in output + return output From 6f69d6d5af63b903c2f929d77e1bea4bc6c04a7d Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 1 Dec 2023 16:29:59 +0000 Subject: [PATCH 27/30] fix style --- optimum/exporters/neuron/model_wrappers.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index c9e3a6e93..0b1ae4504 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -270,17 +270,17 @@ def forward( last_hidden_state = decoder_output["last_hidden_state"] past_key_values = decoder_output["past_key_values"] if self.output_hidden_states: - decoder_hidden_states = [ - hidden_state for hidden_state in decoder_output["hidden_states"] - ] # flatten `hidden_states` which is a tuple of tensors + decoder_hidden_states = list( + decoder_output["hidden_states"] + ) # flatten `hidden_states` which is a tuple of tensors if self.output_attentions: - decoder_attentions = [ - attention for attention in 
decoder_output["attentions"] - ] # flatten `decoder_attentions` which is a tuple of tensors - cross_attentions = [ - attention for attention in decoder_output["cross_attentions"] - ] # flatten `cross_attentions` which is a tuple of tensors + decoder_attentions = list( + decoder_output["attentions"] + ) # flatten `decoder_attentions` which is a tuple of tensors + cross_attentions = list( + decoder_output["cross_attentions"] + ) # flatten `cross_attentions` which is a tuple of tensors if self.config.tie_word_embeddings: # Rescale output before projecting on vocab From 9f461f8f746a0f8fd9020b869765062d21f029ba Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 1 Dec 2023 16:34:10 +0000 Subject: [PATCH 28/30] fix style --- optimum/neuron/modeling_seq2seq.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 3395cc7a6..3e6a4f45d 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -32,7 +32,6 @@ from transformers.generation.stopping_criteria import ( StoppingCriteriaList, ) -from transformers.modeling_outputs import ModelOutput from ..exporters.neuron import ( NeuronConfig, @@ -358,7 +357,7 @@ def forward( return_dict: bool = False, output_attentions: bool = False, output_hidden_states: bool = False, - ) -> Union[Tuple[torch.FloatTensor], ModelOutput]: + ) -> Union[Tuple[torch.FloatTensor], "ModelOutput"]: hidden_states = encoder_outputs["last_hidden_state"] if not hasattr(self, "beam_idx"): From d6a24b63bf9627b50edb96843fa0f552874bad79 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Sat, 2 Dec 2023 00:28:04 +0000 Subject: [PATCH 29/30] apply suggestions --- optimum/exporters/neuron/__main__.py | 12 ++++++------ optimum/exporters/neuron/convert.py | 2 +- optimum/exporters/neuron/model_configs.py | 10 ++-------- optimum/neuron/modeling_base.py | 12 +++++------- optimum/neuron/modeling_seq2seq.py | 17 ++++++++--------- optimum/neuron/utils/argument_utils.py | 8 +++++--- tests/generation/test_generate.py | 12 ++---------- 7 files changed, 29 insertions(+), 44 deletions(-) diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 0fcf91a79..8e70ee4d7 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -249,8 +249,8 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( dynamic_batch_size: bool = False, submodels: Optional[Dict[str, Union[Path, str]]] = None, ): - model = replace_stable_diffusion_submodels(model, submodels) check_compiler_compatibility_for_stable_diffusion() + model = replace_stable_diffusion_submodels(model, submodels) if is_neuron_available(): raise RuntimeError( "Stable diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead." @@ -259,11 +259,11 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( # Saving the model config and preprocessor as this is needed sometimes. 
model.scheduler.save_pretrained(output.joinpath("scheduler")) - if hasattr(model, "tokenizer") and model.tokenizer is not None: + if getattr(model, "tokenizer", None) is not None: model.tokenizer.save_pretrained(output.joinpath("tokenizer")) - if hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None: + if getattr(model, "tokenizer_2", None) is not None: model.tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) - if hasattr(model, "feature_extractor") and model.feature_extractor is not None: + if getattr(model, "feature_extractor", None) is not None: model.feature_extractor.save_pretrained(output.joinpath("feature_extractor")) model.save_config(output) @@ -278,11 +278,11 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME), } - if hasattr(model, "text_encoder") and model.text_encoder is not None: + if getattr(model, "text_encoder", None) is not None: output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = os.path.join( DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME ) - if hasattr(model, "text_encoder_2") and model.text_encoder_2 is not None: + if getattr(model, "text_encoder_2", None) is not None: output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = os.path.join( DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME ) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 6f712d4cb..d5b826ee6 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -223,7 +223,7 @@ def validate_model_outputs( value_failures = [] for i, (name, output) in enumerate(zip(neuron_output_names_list, neuron_outputs)): if isinstance(output, torch.Tensor): - ref_output = ref_outputs[name].numpy() if isinstance(ref_outputs, Dict) else ref_outputs[i].numpy() + ref_output = ref_outputs[name].numpy() if isinstance(ref_outputs, dict) else ref_outputs[i].numpy() output = output.numpy() elif isinstance(output, tuple): # eg. `hidden_states` of `AutoencoderKL` is a tuple of tensors. 
ref_output = torch.stack(ref_outputs[name]).numpy() diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index fe5835198..aa7d05fa8 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -31,6 +31,7 @@ NormalizedTextAndVisionConfig, is_diffusers_available, ) +from ...utils.normalized_config import T5LikeNormalizedTextConfig from ..tasks import TasksManager from .config import ( TextAndVisionNeuronConfig, @@ -416,14 +417,7 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig): MANDATORY_AXES = ("batch_size", "sequence_length", "num_beams") MODEL_TYPE = "t5-decoder" CUSTOM_MODEL_WRAPPER = T5DecoderWrapper - NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( - hidden_size="d_model", - num_attention_heads="num_heads", - encoder_num_layers="num_layers", - decoder_num_layers="num_decoder_layers", - key_value_dim="d_kv", - allow_new=True, - ) + NORMALIZED_CONFIG_CLASS = T5LikeNormalizedTextConfig @property def is_decoder(self) -> bool: diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index d9daa46ac..6cc1cd95c 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -393,10 +393,10 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronConfig": ) return - neuron_configs = config.neuron + neuron_config = config.neuron # Fetch compiler information - compiler_type = neuron_configs.get("compiler_type") - compiler_version = neuron_configs.get("compiler_version") + compiler_type = neuron_config.get("compiler_type") + compiler_version = neuron_config.get("compiler_version") # Fetch mandatory shapes from config compile_shapes = { @@ -408,16 +408,14 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronConfig": # Neuron config constructuor task = getattr(config, "task") or TasksManager.infer_task_from_model(cls.auto_model_class) task = TasksManager.map_from_synonym(task) - model_type = neuron_configs.get("model_type", None) - if not (model_type and model_type != "None"): - model_type = config.model_type + model_type = neuron_config.get("model_type", None) or config.model_type neuron_config_constructor = TasksManager.get_exporter_config_constructor( model_type=model_type, exporter="neuron", task=task ) return neuron_config_constructor( config, - dynamic_batch_size=neuron_configs.get("dynamic_batch_size", False), + dynamic_batch_size=neuron_config.get("dynamic_batch_size", False), compiler_type=compiler_type, compiler_version=compiler_version, **compile_shapes, diff --git a/optimum/neuron/modeling_seq2seq.py b/optimum/neuron/modeling_seq2seq.py index 3e6a4f45d..3d42a7129 100644 --- a/optimum/neuron/modeling_seq2seq.py +++ b/optimum/neuron/modeling_seq2seq.py @@ -26,12 +26,9 @@ import torch from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModelForSeq2SeqLM, GenerationConfig -from transformers.generation.logits_process import ( - LogitsProcessorList, -) -from transformers.generation.stopping_criteria import ( - StoppingCriteriaList, -) +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.utils import ModelOutput from ..exporters.neuron import ( NeuronConfig, @@ -51,7 +48,6 @@ if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel - from transformers.utils import ModelOutput if is_neuronx_available(): import torch_neuronx @@ -357,7 +353,7 @@ 
def forward( return_dict: bool = False, output_attentions: bool = False, output_hidden_states: bool = False, - ) -> Union[Tuple[torch.FloatTensor], "ModelOutput"]: + ) -> Union[Tuple[torch.FloatTensor], ModelOutput]: hidden_states = encoder_outputs["last_hidden_state"] if not hasattr(self, "beam_idx"): @@ -424,7 +420,10 @@ def generate( past_key_values = self.encoder(**inputs) decoder_attention_mask = torch.cat( - [torch.zeros((batch_size, max_length - 1), dtype=torch.int64), torch.ones((1, 1), dtype=torch.int64)], + [ + torch.zeros((batch_size, max_length - 1), dtype=torch.int64), + torch.ones((batch_size, 1), dtype=torch.int64), + ], axis=1, ) diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index 4798136e1..d910cd074 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -176,13 +176,15 @@ def store_compilation_config( config_args["output_names"] = output_names original_model_type = getattr(config, "model_type", None) - neuron_model_type = str(model_type).replace("_", "-") + neuron_model_type = str(model_type).replace("_", "-") if model_type is not None else model_type if original_model_type is None: update_func( "model_type", neuron_model_type ) # Add model_type to the config if it doesn't exist before, eg. submodel of Stable Diffusion. - elif neuron_model_type != original_model_type: - config_args["model_type"] = neuron_model_type # Neuron custom model_type, eg. `t5-encoder`. + else: + config_args["model_type"] = ( + neuron_model_type or original_model_type + ) # Prioritize Neuron custom model_type, eg. `t5-encoder`. # Add args of optional outputs config_args["output_attentions"] = output_attentions diff --git a/tests/generation/test_generate.py b/tests/generation/test_generate.py index f50b0fb59..1f7630b4d 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -78,9 +78,7 @@ def test_seq2seq_generation_beam(neuron_seq2seq_beam_path): # 3. max new tokens output = model.generate(**inputs, num_return_sequences=2, max_new_tokens=5) - assert len(output[0].unique()) <= 5 - - return output + assert len(output[0].unique()) <= 5 + 1 # +1 for `decoder_start_token_id` @is_inferentia_test @@ -104,8 +102,6 @@ def test_seq2seq_generation_beam_with_optional_outputs(neuron_seq2seq_beam_path_ assert "cross_attentions" in output assert "decoder_hidden_states" in output - return output - @is_inferentia_test @requires_neuronx @@ -124,9 +120,7 @@ def test_seq2seq_generation_greedy(neuron_seq2seq_greedy_path): # 3. 
max new tokens output = model.generate(**inputs, num_return_sequences=1, max_new_tokens=5) - assert len(output[0].unique()) <= 5 - - return output + assert len(output[0]) <= 5 + 1 # +1 for `decoder_start_token_id` @is_inferentia_test @@ -147,5 +141,3 @@ def test_seq2seq_generation_greedy_with_optional_outputs(neuron_seq2seq_greedy_p assert "decoder_attentions" in output assert "cross_attentions" in output assert "decoder_hidden_states" in output - - return output From 3b07ba1dd2c827e9e671f7bda35e4d3c775d4e10 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Sat, 2 Dec 2023 08:52:13 +0000 Subject: [PATCH 30/30] fix tests --- tests/generation/test_export.py | 4 ++-- tests/generation/test_hub.py | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/generation/test_export.py b/tests/generation/test_export.py index 32c53c4a4..fb69f2a88 100644 --- a/tests/generation/test_export.py +++ b/tests/generation/test_export.py @@ -71,6 +71,6 @@ def test_seq2seq_export(export_seq2seq_id, batch_size, sequence_length, num_beam @is_inferentia_test @requires_neuronx -def test_seq2seq_model_from_path(neuron_seq2seq_path): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) +def test_seq2seq_model_from_path(neuron_seq2seq_greedy_path): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) return model diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py index ff8e90615..7e1faa196 100644 --- a/tests/generation/test_hub.py +++ b/tests/generation/test_hub.py @@ -61,17 +61,19 @@ def test_seq2seq_model_from_hub(): @is_inferentia_test @requires_neuronx -def test_push_seq2seq_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id): - model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_path) - model.push_to_hub(neuron_seq2seq_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) +def test_push_seq2seq_to_hub(neuron_seq2seq_greedy_path, neuron_push_seq2seq_id): + model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) + model.push_to_hub( + neuron_seq2seq_greedy_path, neuron_push_seq2seq_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING + ) api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) try: hub_files_info = api.list_files_info(neuron_push_seq2seq_id) hub_files_path = [info.rfilename for info in hub_files_info] - for path, _, files in os.walk(neuron_seq2seq_path): + for path, _, files in os.walk(neuron_seq2seq_greedy_path): for name in files: local_file_path = os.path.join(path, name) - hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_path) + hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_greedy_path) assert hub_file_path in hub_files_path finally: api.delete_repo(neuron_push_seq2seq_id)
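
For reference, a minimal end-to-end usage sketch of the optional-outputs feature introduced by this series, distilled from the fixtures and generation tests added above. The checkpoint name, shapes, and beam settings below are illustrative assumptions taken from the tiny test models, not a prescribed configuration.

from transformers import AutoTokenizer
from optimum.neuron import NeuronModelForSeq2SeqLM

# Export a seq2seq model with attentions and hidden states compiled into the decoder
# (mirrors the `neuron_seq2seq_beam_path_with_optional_outputs` fixture; shapes are illustrative).
model = NeuronModelForSeq2SeqLM.from_pretrained(
    "hf-internal-testing/tiny-random-t5",
    export=True,
    batch_size=1,
    sequence_length=64,
    num_beams=4,
    output_attentions=True,
    output_hidden_states=True,
)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

inputs = tokenizer("translate English to German: Let's eat good food.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_length=20,
    output_attentions=True,
    output_hidden_states=True,
    return_dict_in_generate=True,
)

# With `return_dict_in_generate=True`, the returned ModelOutput exposes the optional tensors
# that the compiled decoder now emits, as asserted in the new generation tests.
print(outputs.decoder_attentions)
print(outputs.cross_attentions)
print(outputs.decoder_hidden_states)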