Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an HLO backend for LLM models #775

Merged
merged 9 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/doc-pr-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: '3.11'

- name: Setup environment
run: |
pip install --upgrade pip
Expand All @@ -58,7 +58,7 @@ jobs:
echo ${{ env.COMMIT_SHA }} > ./commit_sha
echo ${{ env.PR_NUMBER }} > ./pr_number

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: doc-build-artifact
path: neuron-doc-build/
6 changes: 1 addition & 5 deletions .github/workflows/test_inf2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,7 @@ jobs:
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/cli
- name: Run decoder tests
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/decoder
- name: Run other generation tests
- name: Run non-llm generation tests
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test --ignore=tests/generation/test_parallel.py tests/generation
Expand Down
51 changes: 51 additions & 0 deletions .github/workflows/test_inf2_llm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
name: Optimum neuron / Test INF2 LLM

on:
push:
branches: [ main ]
paths:
- "setup.py"
- "optimum/**.py"
- ".github/workflows/test_inf2_llm.yml"
pull_request:
branches: [ main ]
paths:
- "setup.py"
- "optimum/**.py"
- ".github/workflows/test_inf2_llm.yml"

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

jobs:
do-the-job:
name: Run INF2 LLM tests
runs-on:
group: aws-inf2-8xlarge
steps:
- name: Install Neuron runtime
run: |
. /etc/os-release
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
sudo apt-get update -y
sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
export PATH=/opt/aws/neuron/bin:$PATH
- name: Checkout
uses: actions/checkout@v2
- name: Install python dependencies
run: |
sudo apt install python3-venv python3-dev -y
python3 -m venv aws_neuron_venv_pytorch
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
pip install --upgrade setuptools==69.5.1
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
python -m pip install .[neuronx,tests]
- name: Run decoder tests
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/decoder
4 changes: 2 additions & 2 deletions examples/text-generation/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from optimum.neuron import NeuronModelForCausalLM


def generate(model, tokenizer, prompts, length, temperature):
def generate(model, tokenizer, prompts, max_new_tokens, temperature):
# Specify padding options for decoder-only architecture
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
Expand All @@ -17,7 +17,7 @@ def generate(model, tokenizer, prompts, length, temperature):
start = time.time()
with torch.inference_mode():
sample_output = model.generate(
**tokens, do_sample=True, max_length=length, temperature=temperature, top_k=50, top_p=0.9
**tokens, do_sample=True, max_new_tokens=max_new_tokens, temperature=temperature, top_k=50, top_p=0.9
)
end = time.time()
outputs = [tokenizer.decode(tok) for tok in sample_output]
Expand Down
12 changes: 9 additions & 3 deletions optimum/exporters/neuron/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
NEURON_FILE_NAME,
is_neuron_available,
is_neuronx_available,
is_transformers_neuronx_available,
map_torch_dtype,
)
from ...neuron.utils.misc import maybe_save_preprocessors
Expand All @@ -47,7 +48,7 @@
from ...utils import is_diffusers_available, logging
from ..error_utils import AtolError, OutputMatchError, ShapeError
from ..tasks import TasksManager
from .base import NeuronConfig, NeuronDecoderConfig
from .base import NeuronExportConfig
from .convert import export_models, validate_models_outputs
from .model_configs import * # noqa: F403
from .utils import (
Expand All @@ -70,6 +71,11 @@

NEURON_COMPILER = "Neuronx"


if is_transformers_neuronx_available():
from .model_configs import NeuronDecoderExportConfig


if is_diffusers_available():
from diffusers import StableDiffusionXLPipeline

Expand Down Expand Up @@ -122,7 +128,7 @@ def get_input_shapes_and_config_class(task: str, args: argparse.Namespace) -> Di
return input_shapes, neuron_config_constructor.func


def get_neuron_config_class(task: str, model_id: str) -> NeuronConfig:
def get_neuron_config_class(task: str, model_id: str) -> NeuronExportConfig:
config = AutoConfig.from_pretrained(model_id)

model_type = config.model_type.replace("_", "-")
Expand Down Expand Up @@ -724,7 +730,7 @@ def main():
submodels = None
else:
input_shapes, neuron_config_class = get_input_shapes_and_config_class(task, args)
if NeuronDecoderConfig in inspect.getmro(neuron_config_class):
if is_transformers_neuronx_available() and NeuronDecoderExportConfig in inspect.getmro(neuron_config_class):
# TODO: warn about ignored args:
# dynamic_batch_size, compiler_workdir, optlevel,
# atol, disable_validation, library_name
Expand Down
66 changes: 3 additions & 63 deletions optimum/exporters/neuron/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# limitations under the License.
"""Neuron configuration base classes."""

import importlib
import re
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
Expand All @@ -24,7 +23,7 @@
from optimum.utils import logging

from ...exporters.base import ExportConfig
from ...neuron.utils import is_neuron_available, is_transformers_neuronx_available
from ...neuron.utils import is_neuron_available


if TYPE_CHECKING:
Expand All @@ -40,7 +39,7 @@ class MissingMandatoryAxisDimension(ValueError):
pass


class NeuronConfig(ExportConfig):
class NeuronExportConfig(ExportConfig):
"""Base class for Neuron exportable models

Class attributes:
Expand Down Expand Up @@ -77,7 +76,7 @@ def get_input_args_for_task(cls, task: str) -> Tuple[str]:
return tuple(axes)


class NeuronDefaultConfig(NeuronConfig, ABC):
class NeuronDefaultConfig(NeuronExportConfig, ABC):
"""
Base class for configuring the export of Neuron TorchScript models.

Expand Down Expand Up @@ -427,62 +426,3 @@ def forward(self, *input):
return outputs

return ModelWrapper(model, list(dummy_inputs.keys()))


class NeuronDecoderConfig(NeuronConfig):
"""
Base class for configuring the export of Neuron Decoder models

Class attributes:

- INPUT_ARGS (`Tuple[Union[str, Tuple[Union[str, Tuple[str]]]]]`) -- A tuple where each element is either:
- An argument name, for instance "batch_size" or "sequence_length", that indicates that the argument can
be passed to export the model,
- NEURONX_CLASS (`str`) -- the name of the transformers-neuronx class to instantiate for the model.
It is a full class name defined relatively to the transformers-neuronx module, e.g. `gpt2.model.GPT2ForSampling`
- CONTINUOUS_BATCHING (`bool`, defaults to `False`) -- Whether the model supports continuous batching or not.
- ATTENTION_LAYOUT (`str`, defaults to `HSB`) -- Layout to be used for attention computation.

The NEURONX_CLASS must always be defined in each model configuration.

Args:
task (`str`): The task the model should be exported for.
"""

INPUT_ARGS = ("batch_size", "sequence_length")
NEURONX_CLASS = None
CONTINUOUS_BATCHING = False
ATTENTION_lAYOUT = "HSB"
FUSE_QKV = True

def __init__(self, task: str):
if not is_transformers_neuronx_available():
raise ModuleNotFoundError(
"The mandatory transformers-neuronx package is missing. Please install optimum-neuron[neuronx]."
)
if isinstance(self.NEURONX_CLASS, type):
self._neuronx_class = self.NEURONX_CLASS
else:
module_name, class_name = self.NEURONX_CLASS.rsplit(".", maxsplit=1)
module = importlib.import_module(f"transformers_neuronx.{module_name}")
self._neuronx_class = getattr(module, class_name, None)
if self._neuronx_class is None:
raise ImportError(
f"{class_name} not found in {module_name}. Please check transformers-neuronx version."
)

@property
def neuronx_class(self):
return self._neuronx_class

@property
def continuous_batching(self):
return self.CONTINUOUS_BATCHING

@property
def attention_layout(self):
return self.ATTENTION_lAYOUT

@property
def fuse_qkv(self):
return self.FUSE_QKV
10 changes: 1 addition & 9 deletions optimum/exporters/neuron/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
DummyVisionInputGenerator,
logging,
)
from .base import NeuronDecoderConfig, NeuronDefaultConfig
from .base import NeuronDefaultConfig


logger = logging.get_logger(__name__)
Expand Down Expand Up @@ -70,14 +70,6 @@ class AudioNeuronConfig(NeuronDefaultConfig):
INPUT_ARGS = ("batch_size", "audio_sequence_length")


class TextNeuronDecoderConfig(NeuronDecoderConfig):
"""
Handles text decoder architectures.
"""

pass


class TextSeq2SeqNeuronConfig(NeuronDefaultConfig):
"""
Handles encoder-decoder-based text architectures.
Expand Down
Loading
Loading