Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an HLO backend for LLM models #775

Merged
merged 9 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/doc-pr-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: '3.11'

- name: Setup environment
run: |
pip install --upgrade pip
Expand All @@ -58,7 +58,7 @@ jobs:
echo ${{ env.COMMIT_SHA }} > ./commit_sha
echo ${{ env.PR_NUMBER }} > ./pr_number

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: doc-build-artifact
path: neuron-doc-build/
6 changes: 1 addition & 5 deletions .github/workflows/test_inf2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,7 @@ jobs:
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/cli
- name: Run decoder tests
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/decoder
- name: Run other generation tests
- name: Run non-llm generation tests
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test --ignore=tests/generation/test_parallel.py tests/generation
Expand Down
51 changes: 51 additions & 0 deletions .github/workflows/test_inf2_llm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
name: Optimum neuron / Test INF2 LLM

on:
push:
branches: [ main ]
paths:
- "setup.py"
- "optimum/**.py"
- ".github/workflows/test_inf2_llm.yml"
pull_request:
branches: [ main ]
paths:
- "setup.py"
- "optimum/**.py"
- ".github/workflows/test_inf2_llm.yml"

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

jobs:
do-the-job:
name: Run INF2 LLM tests
runs-on:
group: aws-inf2-8xlarge
steps:
- name: Install Neuron runtime
run: |
. /etc/os-release
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
sudo apt-get update -y
sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
export PATH=/opt/aws/neuron/bin:$PATH
- name: Checkout
uses: actions/checkout@v2
- name: Install python dependencies
run: |
sudo apt install python3-venv python3-dev -y
python3 -m venv aws_neuron_venv_pytorch
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
pip install --upgrade setuptools==69.5.1
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
python -m pip install .[neuronx,tests]
- name: Run decoder tests
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/decoder
4 changes: 2 additions & 2 deletions examples/text-generation/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from optimum.neuron import NeuronModelForCausalLM


def generate(model, tokenizer, prompts, length, temperature):
def generate(model, tokenizer, prompts, max_new_tokens, temperature):
# Specify padding options for decoder-only architecture
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
Expand All @@ -17,7 +17,7 @@ def generate(model, tokenizer, prompts, length, temperature):
start = time.time()
with torch.inference_mode():
sample_output = model.generate(
**tokens, do_sample=True, max_length=length, temperature=temperature, top_k=50, top_p=0.9
**tokens, do_sample=True, max_new_tokens=max_new_tokens, temperature=temperature, top_k=50, top_p=0.9
)
end = time.time()
outputs = [tokenizer.decode(tok) for tok in sample_output]
Expand Down
12 changes: 9 additions & 3 deletions optimum/exporters/neuron/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
NEURON_FILE_NAME,
is_neuron_available,
is_neuronx_available,
is_transformers_neuronx_available,
map_torch_dtype,
)
from ...neuron.utils.misc import maybe_save_preprocessors
Expand All @@ -47,7 +48,7 @@
from ...utils import is_diffusers_available, logging
from ..error_utils import AtolError, OutputMatchError, ShapeError
from ..tasks import TasksManager
from .base import NeuronConfig, NeuronDecoderConfig
from .base import NeuronExportConfig
from .convert import export_models, validate_models_outputs
from .model_configs import * # noqa: F403
from .utils import (
Expand All @@ -70,6 +71,11 @@

NEURON_COMPILER = "Neuronx"


if is_transformers_neuronx_available():
from .model_configs import NeuronDecoderExportConfig


if is_diffusers_available():
from diffusers import StableDiffusionXLPipeline

Expand Down Expand Up @@ -122,7 +128,7 @@ def get_input_shapes_and_config_class(task: str, args: argparse.Namespace) -> Di
return input_shapes, neuron_config_constructor.func


def get_neuron_config_class(task: str, model_id: str) -> NeuronConfig:
def get_neuron_config_class(task: str, model_id: str) -> NeuronExportConfig:
config = AutoConfig.from_pretrained(model_id)

model_type = config.model_type.replace("_", "-")
Expand Down Expand Up @@ -724,7 +730,7 @@ def main():
submodels = None
else:
input_shapes, neuron_config_class = get_input_shapes_and_config_class(task, args)
if NeuronDecoderConfig in inspect.getmro(neuron_config_class):
if is_transformers_neuronx_available() and NeuronDecoderExportConfig in inspect.getmro(neuron_config_class):
# TODO: warn about ignored args:
# dynamic_batch_size, compiler_workdir, optlevel,
# atol, disable_validation, library_name
Expand Down
66 changes: 3 additions & 63 deletions optimum/exporters/neuron/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# limitations under the License.
"""Neuron configuration base classes."""

import importlib
import re
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
Expand All @@ -24,7 +23,7 @@
from optimum.utils import logging

from ...exporters.base import ExportConfig
from ...neuron.utils import is_neuron_available, is_transformers_neuronx_available
from ...neuron.utils import is_neuron_available


if TYPE_CHECKING:
Expand All @@ -40,7 +39,7 @@ class MissingMandatoryAxisDimension(ValueError):
pass


class NeuronConfig(ExportConfig):
class NeuronExportConfig(ExportConfig):
"""Base class for Neuron exportable models

Class attributes:
Expand Down Expand Up @@ -77,7 +76,7 @@ def get_input_args_for_task(cls, task: str) -> Tuple[str]:
return tuple(axes)


class NeuronDefaultConfig(NeuronConfig, ABC):
class NeuronDefaultConfig(NeuronExportConfig, ABC):
"""
Base class for configuring the export of Neuron TorchScript models.

Expand Down Expand Up @@ -427,62 +426,3 @@ def forward(self, *input):
return outputs

return ModelWrapper(model, list(dummy_inputs.keys()))


class NeuronDecoderConfig(NeuronConfig):
"""
Base class for configuring the export of Neuron Decoder models

Class attributes:

- INPUT_ARGS (`Tuple[Union[str, Tuple[Union[str, Tuple[str]]]]]`) -- A tuple where each element is either:
- An argument name, for instance "batch_size" or "sequence_length", that indicates that the argument can
be passed to export the model,
- NEURONX_CLASS (`str`) -- the name of the transformers-neuronx class to instantiate for the model.
It is a full class name defined relatively to the transformers-neuronx module, e.g. `gpt2.model.GPT2ForSampling`
- CONTINUOUS_BATCHING (`bool`, defaults to `False`) -- Whether the model supports continuous batching or not.
- ATTENTION_LAYOUT (`str`, defaults to `HSB`) -- Layout to be used for attention computation.

The NEURONX_CLASS must always be defined in each model configuration.

Args:
task (`str`): The task the model should be exported for.
"""

INPUT_ARGS = ("batch_size", "sequence_length")
NEURONX_CLASS = None
CONTINUOUS_BATCHING = False
ATTENTION_lAYOUT = "HSB"
FUSE_QKV = True

def __init__(self, task: str):
if not is_transformers_neuronx_available():
raise ModuleNotFoundError(
"The mandatory transformers-neuronx package is missing. Please install optimum-neuron[neuronx]."
)
if isinstance(self.NEURONX_CLASS, type):
self._neuronx_class = self.NEURONX_CLASS
else:
module_name, class_name = self.NEURONX_CLASS.rsplit(".", maxsplit=1)
module = importlib.import_module(f"transformers_neuronx.{module_name}")
self._neuronx_class = getattr(module, class_name, None)
if self._neuronx_class is None:
raise ImportError(
f"{class_name} not found in {module_name}. Please check transformers-neuronx version."
)

@property
def neuronx_class(self):
return self._neuronx_class

@property
def continuous_batching(self):
return self.CONTINUOUS_BATCHING

@property
def attention_layout(self):
return self.ATTENTION_lAYOUT

@property
def fuse_qkv(self):
return self.FUSE_QKV
10 changes: 1 addition & 9 deletions optimum/exporters/neuron/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
DummyVisionInputGenerator,
logging,
)
from .base import NeuronDecoderConfig, NeuronDefaultConfig
from .base import NeuronDefaultConfig


logger = logging.get_logger(__name__)
Expand Down Expand Up @@ -70,14 +70,6 @@ class AudioNeuronConfig(NeuronDefaultConfig):
INPUT_ARGS = ("batch_size", "audio_sequence_length")


class TextNeuronDecoderConfig(NeuronDecoderConfig):
"""
Handles text decoder architectures.
"""

pass


class TextSeq2SeqNeuronConfig(NeuronDefaultConfig):
"""
Handles encoder-decoder-based text architectures.
Expand Down
Loading
Loading