From ad9e51bbd04dfeb1bb9f47fcc3a1b0ebb5766764 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 28 May 2024 16:50:45 +0200 Subject: [PATCH] Refactor documentation and improve tgi deployment (#610) * feat(decoder): export Tokenizer if available * feat(decoder): extend checkpoint folder permissions This allow the checkpoint files to be visible even if they have been created by another user (like the docker root user). * feat(tgi): remove redundant env var * docs(tgi): use privileged option * docs(tgi): simplify deployment instructions * fix(tgi): reduce CPU mem usage when loading neuron model * docs(inference): merge two similar pages * docs: move TGI README to documentation * feat(tgi): reference export documentation in error message * Apply suggestions from code review Co-authored-by: Michael Benayoun * review(tgi): revert to info traces * Apply suggestions from code review Co-authored-by: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> * review: add details on export parameters * review: add padding tip --------- Co-authored-by: Michael Benayoun Co-authored-by: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> --- docs/source/_toctree.yml | 4 +- docs/source/guides/export_model.mdx | 212 +++++++++++++---- docs/source/guides/models.mdx | 217 ------------------ docs/source/guides/neuronx_tgi.mdx | 183 +++++++++++++++ docs/source/tutorials/llama2-13b-chatbot.mdx | 5 +- optimum/exporters/neuron/__main__.py | 12 +- optimum/neuron/modeling_decoder.py | 1 + text-generation-inference/README.md | 197 +--------------- .../text_generation_server/generator.py | 2 +- .../server/text_generation_server/model.py | 9 +- .../tests/fixtures/service.py | 2 +- .../tests/integration/test_implicit_env.py | 2 - text-generation-inference/tgi_env.py | 14 +- 13 files changed, 384 insertions(+), 476 deletions(-) delete mode 100644 docs/source/guides/models.mdx create mode 100644 docs/source/guides/neuronx_tgi.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index ca4d1cb73..4eee084aa 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -38,10 +38,10 @@ title: Distributed Training - local: guides/export_model title: Export a model to Inferentia - - local: guides/models - title: Neuron models for inference - local: guides/pipelines title: Inference pipelines with AWS Neuron + - local: guides/neuronx_tgi + title: NeuronX Text-generation-inference for AWS inferentia2 title: How-To Guides - sections: - local: benchmarks/inferentia-llama2-7b diff --git a/docs/source/guides/export_model.mdx b/docs/source/guides/export_model.mdx index 0e629ceea..d11ec8a95 100644 --- a/docs/source/guides/export_model.mdx +++ b/docs/source/guides/export_model.mdx @@ -40,20 +40,12 @@ AWS provides two generations of the Inferentia accelerator built for machine lea In production environments, to deploy 🤗 [Transformers](https://huggingface.co/docs/transformers/index) models on Neuron devices, you need to compile your models and export them to a serialized format before inference. Through Ahead-Of-Time (AOT) compilation with Neuron Compiler( [neuronx-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/compiler/neuronx-cc/index.html) or [neuron-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/compiler/neuron-cc/neuron-cc.html) ), your models will be converted to serialized and optimized [TorchScript modules](https://pytorch.org/docs/stable/generated/torch.jit.ScriptModule.html). 
-
-To understand a little bit more about the compilation, here are general steps executed under the hood:
-
-Compilation flow
-
-**NEFF**: Neuron Executable File Format which is a binary executable on Neuron devices.
-
-
-Although pre-compilation avoids overhead during the inference, traced Neuron module has some limitations:
-* Traced Neuron module will be static, which requires fixed input shapes and data types used during the compilation. As the model won't be dynamically recompiled, the inference will fail if any of the above conditions change.
- (*But these limitations could be bypass with [dynamic batching](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/api-reference-guide/inference/api-torch-neuronx-trace.html#dynamic-batching) and [bucketing](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/appnotes/torch-neuron/bucketing-app-note.html#bucketing-app-note)*).
-* Neuron models are hardware-specialized, which means:
- * Models traced with Neuron can no longer be executed in non-Neuron environment.
- * Models compiled for inf1 (NeuronCore-v1) are not compatible with inf2 (NeuronCore-v2), and vice versa.
+Although pre-compilation avoids overhead during inference, a compiled Neuron model has some limitations:
+* The input shapes and data types used during the compilation cannot be changed.
+* Neuron models are specialized for a given hardware and SDK version, which means:
+ * Models compiled with Neuron can no longer be executed in a non-Neuron environment.
+ * Models compiled for inf1 (NeuronCore-v1) are not compatible with inf2 (NeuronCore-v2), and vice versa.
+ * Models compiled with a given SDK version are (generally) not compatible with another SDK version.
In this guide, we'll show you how to export your models to serialized models optimized for Neuron devices.
@@ -167,15 +159,41 @@ Input shapes:
```
-In the last section, you can see some input shape options to pass for exporting static neuron model, meaning that exact shape inputs should be used during the inference as given during compilation. If you are going to use variable-size inputs, you can pad your inputs to the shape used for compilation as a workaround. If you want the batch size to be dynamic, you can pass `--dynamic-batch-size` to enable dynamic batching, which means that you will be able to use inputs with difference batch size during inference, but it comes with a potential tradeoff in terms of latency.
+### Exporting standard (non-LLM) models
+
+Most models present on the Hugging Face hub can be straightforwardly exported using torch tracing, then converted to serialized and optimized TorchScript modules.
+
+Compilation flow
+
+**NEFF**: Neuron Executable File Format which is a binary executable on Neuron devices.
+
+When exporting a model, two sets of export arguments must be passed:
+
+- `compiler_args` are optional arguments for the compiler; they usually control how the compiler trades off inference performance (latency and throughput) against accuracy,
+- `input_shapes` are the mandatory static shapes that you need to provide to the neuron compiler.
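+If you are not sure which static shapes are mandatory for your model, you can query the `TasksManager` before exporting. The snippet below is a short sketch adapted from the previous version of this guide:
+
+```python
+>>> from transformers import AutoModelForSequenceClassification
+>>> from optimum.exporters import TasksManager
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
+
+# Infer the task name if you don't know it
+>>> task = TasksManager.infer_task_from_model(model)  # 'text-classification'
+
+>>> neuron_config_constructor = TasksManager.get_exporter_config_constructor(
+...     model=model, exporter="neuron", task=task
+... )
+>>> print(neuron_config_constructor.func.get_mandatory_axes_for_task(task))
+# ('batch_size', 'sequence_length')
+```
+
+The returned axes are exactly the static shapes expected by the CLI (here `--batch_size` and `--sequence_length`).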
-Exporting a checkpoint can be done as follows:
+Run the following command to see all available export parameters:

```bash
-optimum-cli export neuron --model distilbert-base-uncased-distilled-squad --batch_size 1 --sequence_length 16 distilbert_base_uncased_squad_neuron/
+optimum-cli export neuron -h
```

-You should see the following logs which validate the model on Neuron devices by comparing with PyTorch model on CPU:
+Exporting a standard NLP model can be done as follows:
+
+```bash
+optimum-cli export neuron --model distilbert-base-uncased-distilled-squad \
+  --batch_size 1 --sequence_length 16 \
+  --auto_cast matmul --auto_cast_type fp16 \
+  distilbert_base_uncased_squad_neuron/
+```
+
+Here the model was exported with a static input shape of `(1, 16)`, and with compiler arguments specifying
+that matmul operations must be performed with `float16` precision for faster inference.
+
+After export, you should see the following logs, which validate the model on Neuron devices by comparing its outputs with those of the PyTorch model on CPU:

```bash
Validating Neuron model...
@@ -196,23 +214,20 @@ optimum-cli export neuron --model local_path --task question-answering --batch_s
Note that providing the `--task` argument for a model on the Hub will disable the automatic task detection.
The resulting `model.neuron` file, can then be loaded and run on Neuron devices.

-## Exporting a model to Neuron via NeuronModel
-
-You will also be able to export your models to Neuron format with `optimum.neuron.NeuronModelForXXX` model classes. Here is an example:
+For each model architecture, you can find the list of supported tasks via the [`~exporters.tasks.TasksManager`]. For example, for DistilBERT, the supported tasks for the Neuron export are:

```python
->>> from optimum.neuron import NeuronModelForSequenceClassification
-
->>> input_shapes = {"batch_size": 1, "sequence_length": 64} # mandatory shapes
->>> model = NeuronModelForSequenceClassification.from_pretrained(
-...     "distilbert-base-uncased-finetuned-sst-2-english", export=True, **input_shapes
-... )
+>>> from optimum.exporters.tasks import TasksManager
+>>> from optimum.exporters.neuron.model_configs import * # Register neuron specific configs to the TasksManager

-# Save the model
->>> model.save_pretrained("./distilbert-base-uncased-finetuned-sst-2-english_neuron/")
+>>> distilbert_tasks = list(TasksManager.get_supported_tasks_for_model_type("distilbert", "neuron").keys())
+>>> print(distilbert_tasks)
+['feature-extraction', 'fill-mask', 'multiple-choice', 'question-answering', 'text-classification', 'token-classification']
```

-And the exported model can be used for inference directly with the `NeuronModelForXXX` class:
+You can then pass one of these tasks to the `--task` argument in the `optimum-cli export neuron` command, as mentioned above.
+
+Once exported, the neuron model can be used for inference directly with the `NeuronModelForXXX` class:

```python
>>> from transformers import AutoTokenizer
@@ -227,7 +242,15 @@ And the exported model can be used for inference directly with the `NeuronModelF
'POSITIVE'
```

-## Exporting Stable Diffusion to Neuron
+As you can see, there is no need to pass the neuron arguments used during the export: they are
+saved in a `config.json` file and will be restored automatically by the `NeuronModelForXXX` class.
+
+Be careful: inputs are always padded to the shapes used for compilation, and this padding brings a computation overhead.
+Adjust the static shapes so that they are larger than the inputs you will feed into the model during inference, but not much larger.
+
+
+### Exporting Stable Diffusion to Neuron

With the Optimum CLI you can compile components in the Stable Diffusion pipeline to gain acceleration on neuron devices during the inference.
@@ -260,7 +283,7 @@ optimum-cli export neuron --model stabilityai/stable-diffusion-2-1-base \
  sd_neuron/
```

-## Exporting Stable Diffusion XL to Neuron
+### Exporting Stable Diffusion XL to Neuron

Similar to Stable Diffusion, you will be able to use Optimum CLI to compile components in the SDXL pipeline for inference on neuron devices.
@@ -292,21 +315,126 @@ optimum-cli export neuron --model stabilityai/stable-diffusion-xl-base-1.0 \
  sd_neuron/
```

-## Selecting a task
+### Exporting LLMs to Neuron

-Specifying a `--task` should not be necessary in most cases when exporting from a model on the Hugging Face Hub.
+LLMs are not exported using Torch tracing; they are instead converted directly to Neuron graphs into which the
+transformers checkpoint weights can be loaded.

-However, in case you need to check for a given a model architecture what tasks the Neuron export supports, we got you covered. First, you can check the list of supported tasks [here](https://huggingface.co/docs/optimum/exporters/task_manager#pytorch).
+Just like for standard NLP models, you need to specify static parameters when exporting an LLM:

-For each model architecture, you can find the list of supported tasks via the [`~exporters.tasks.TasksManager`]. For example, for DistilBERT, for the Neuron export, we have:
+- `batch_size` is the number of input sequences that the model will accept. Defaults to 1.
+- `sequence_length` is the maximum number of tokens in an input sequence. Defaults to `max_position_embeddings` (`n_positions` for older models).
+- `auto_cast_type` specifies the format to encode the weights. It can be one of `fp32` (`float32`), `fp16` (`float16`) or `bf16` (`bfloat16`). Defaults to `fp32`.
+- `num_cores` is the number of neuron cores used when instantiating the model. Each neuron core has 16GB of memory, which means that
+bigger models need to be split across multiple cores. Defaults to 1.
+
+```bash
+optimum-cli export neuron --model meta-llama/Meta-Llama-3-8B \
+  --batch_size 1 \
+  --sequence_length 4096 \
+  --auto_cast_type fp16 `# cast operations from BF16 to FP16` \
+  --num_cores 2 \
+  llama3_neuron/
+```
+
+An important restriction is that LLMs can only be exported on Neuron platforms, as they are tailored
+to fit on the actual devices during export.
+
+The export of an LLM can take much longer than that of a standard model (sometimes more than one hour).
+
+As explained before, the neuron model parameters are static.
+This means in particular that during inference:
+
+- the `batch_size` of the inputs cannot exceed the `batch_size` used during export,
+- the `length` of the input sequences should be lower than the `sequence_length` used during export,
+- the maximum number of tokens (input + generated) cannot exceed the `sequence_length` used during export.
+
+Once exported, neuron models can simply be reloaded using the `NeuronModelForCausalLM` class.
+As with the original transformers models, use `generate()` instead of `forward()` to generate text sequences.
+
+```diff
+import torch
+from transformers import AutoTokenizer
+-from transformers import AutoModelForCausalLM
++from optimum.neuron import NeuronModelForCausalLM
+
+# Instantiate the model: a plain PyTorch checkpoint vs. a checkpoint previously exported to Neuron
+-model = AutoModelForCausalLM.from_pretrained("gpt2")
++model = NeuronModelForCausalLM.from_pretrained("./gpt2-neuron")
+
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+tokenizer.pad_token_id = tokenizer.eos_token_id
+
+tokens = tokenizer("I really wish ", return_tensors="pt")
+with torch.inference_mode():
+    sample_output = model.generate(
+        **tokens,
+        do_sample=True,
+        min_length=128,
+        max_length=256,
+        temperature=0.7,
+    )
+    outputs = [tokenizer.decode(tok) for tok in sample_output]
+    print(outputs)
+```
+
+The generation is highly configurable. Please refer to https://huggingface.co/docs/transformers/generation_strategies for details.
+
+Please be aware that:
+
+- for each model architecture, default values are provided for all parameters, but values passed to the `generate` method will take precedence,
+- the generation parameters can be stored in a `generation_config.json` file. When such a file is present in the model directory,
+it will be parsed to set the default parameters (the values passed to the `generate` method still take precedence).
+
+## Exporting a model to Neuron programmatically via NeuronModel
+
+As an alternative to the `optimum-cli`, you can also export your models to Neuron
+from your own Python script or notebook with the `optimum.neuron.NeuronModelForXXX` model classes.
+
+Here is an example:

```python
->>> from optimum.exporters.tasks import TasksManager
->>> from optimum.exporters.neuron.model_configs import * # Register neuron specific configs to the TasksManager
+>>> from optimum.neuron import NeuronModelForSequenceClassification

->>> distilbert_tasks = list(TasksManager.get_supported_tasks_for_model_type("distilbert", "neuron").keys())
->>> print(distilbert_tasks)
-['feature-extraction', 'fill-mask', 'multiple-choice', 'question-answering', 'text-classification', 'token-classification']
+>>> input_shapes = {"batch_size": 1, "sequence_length": 64} # mandatory shapes
+>>> model = NeuronModelForSequenceClassification.from_pretrained(
+...     "distilbert-base-uncased-finetuned-sst-2-english", export=True, **input_shapes
+... )
+
+# Save the model
+>>> model.save_pretrained("./distilbert-base-uncased-finetuned-sst-2-english_neuron/")
+
+# Push the neuron model to the HF Hub
+>>> model.push_to_hub(  # doctest: +SKIP
+...     "./distilbert-base-uncased-finetuned-sst-2-english_neuron/", repository_id="my-neuron-repo", use_auth_token=True
+... )
```

-You can then pass one of these tasks to the `--task` argument in the `optimum-cli export neuron` command, as mentioned above.
+This example can be adapted to other model types, using the same export parameters as the `optimum-cli`.
+
+## Exporting neuron models using NeuronX TGI
+
+The NeuronX TGI image includes not only the NeuronX runtime, but also all the packages and tools required to export models to Neuron.
+
+Use the following command to export a model to Neuron using the TGI image:
+
+```
+docker run --entrypoint optimum-cli \
+  -v $(pwd)/data:/data \
+  --privileged \
+  ghcr.io/huggingface/neuronx-tgi:latest \
+  export neuron \
+  --model / \
+  --batch_size 1 \
+  --sequence_length 4096 \
+  --auto_cast_type fp16 \
+  --num_cores 2 \
+  /data/
+```
+
+The exported model will be saved under `./data/`.
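+As a quick sanity check, the exported model can then be reloaded with the `NeuronModelForCausalLM` class. The snippet below is only a minimal sketch: the `./data/llama3_neuron` path is hypothetical and should be replaced by the output directory you passed to the export command above.
+
+```python
+from transformers import AutoTokenizer
+from optimum.neuron import NeuronModelForCausalLM
+
+# Hypothetical path: the output directory used in the export command above
+model_path = "./data/llama3_neuron"
+model = NeuronModelForCausalLM.from_pretrained(model_path)
+# The tokenizer is saved alongside the model during export, when available
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+inputs = tokenizer("What is Deep Learning?", return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=20)
+print(tokenizer.decode(outputs[0]))
+```
+
+Note that, like the export itself, this snippet must run on a Neuron-enabled instance.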
+ + + diff --git a/docs/source/guides/models.mdx b/docs/source/guides/models.mdx deleted file mode 100644 index 9b664948f..000000000 --- a/docs/source/guides/models.mdx +++ /dev/null @@ -1,217 +0,0 @@ - - -# Neuron Model Inference - -*The APIs presented in the following documentation are relevant for the inference on [inf2](https://aws.amazon.com/ec2/instance-types/inf2/), -[trn1](https://aws.amazon.com/ec2/instance-types/trn1/) and [inf1](https://aws.amazon.com/ec2/instance-types/inf1/).* - -`NeuronModelForXXX` classes help to load models from the [Hugging Face Hub](hf.co/models) and compile them to a serialized format optimized for -neuron devices. You will then be able to load the model and run inference with the acceleration powered by AWS Neuron devices. - -## Switching from Transformers to Optimum - -The `optimum.neuron.NeuronModelForXXX` model classes are APIs compatible with Hugging Face Transformers models. This means seamless integration -with Hugging Face's ecosystem. You can just replace your `AutoModelForXXX` class with the corresponding `NeuronModelForXXX` class in `optimum.neuron`. - -If you already use Transformers, you will be able to reuse your code just by replacing model classes: - -```diff -from transformers import AutoTokenizer --from transformers import AutoModelForSequenceClassification -+from optimum.neuron import NeuronModelForSequenceClassification - -# PyTorch checkpoint --model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") - -+model = NeuronModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", -+ export=True, **neuron_kwargs) -``` - -As shown above, when you use `NeuronModelForXXX` for the first time, you will need to set `export=True` to compile your model from PyTorch to a neuron-compatible format. - -You will also need to pass Neuron specific parameters to configure the export. Each model architecture has its own set of parameters, as detailed in the next paragraphs. - -Once your model has been exported, you can save it either on your local or in the [Hugging Face Model Hub](https://hf.co/models): - -```python -# Save the neuron model ->>> model.save_pretrained("a_local_path_for_compiled_neuron_model") - -# Push the neuron model to HF Hub ->>> model.push_to_hub( # doctest: +SKIP -... "a_local_path_for_compiled_neuron_model", repository_id="my-neuron-repo", use_auth_token=True -... ) -``` - -And the next time when you want to run inference, just load your compiled model which will save you the compilation time: - -```python ->>> from optimum.neuron import NeuronModelForSequenceClassification ->>> model = NeuronModelForSequenceClassification.from_pretrained("my-neuron-repo") -``` - -As you see, there is no need to pass the neuron arguments used during the export as they are -saved in a `config.json` file, and will be restored automatically by `NeuronModelForXXX` class. - - - -When running inference for the first time, there is a warmup phase when you run the pipeline for the first time. This run would take 3x-4x higher latency than a regular run. 
- - - -## Discriminative NLP models - -As explained in the previous section, you will need only few modifications to your Transformers code to export and run NLP models: - -```diff -from transformers import AutoTokenizer --from transformers import AutoModelForSequenceClassification -+from optimum.neuron import NeuronModelForSequenceClassification - -# PyTorch checkpoint --model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") - -# Compile your model during the first time -+compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16"} -+input_shapes = {"batch_size": 1, "sequence_length": 64} -+model = NeuronModelForSequenceClassification.from_pretrained( -+ "distilbert-base-uncased-finetuned-sst-2-english", export=True, **compiler_args, **input_shapes, -+) - -tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") -inputs = tokenizer("Hamilton is considered to be the best musical of human history.", return_tensors="pt") - -logits = model(**inputs).logits -print(model.config.id2label[logits.argmax().item()]) -# 'POSITIVE' -``` - -`compiler_args` are optional arguments for the compiler, these arguments usually control how the compiler makes tradeoff between the inference performance (latency and throughput) and the accuracy. Here we cast FP32 operations to BF16 using the Neuron matrix-multiplication engine. - -`input_shapes` are mandatory static shape information that you need to send to the neuron compiler. Wondering what shapes are mandatory for your model? Check it out -with the following code: - -```python ->>> from transformers import AutoModelForSequenceClassification ->>> from optimum.exporters import TasksManager - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") - -# Infer the task name if you don't know ->>> task = TasksManager.infer_task_from_model(model) # 'text-classification' - ->>> neuron_config_constructor = TasksManager.get_exporter_config_constructor( -... model=model, exporter="neuron", task='text-classification' -... ) ->>> print(neuron_config_constructor.func.get_mandatory_axes_for_task(task)) -# ('batch_size', 'sequence_length') -``` - - - -Be careful, the input shapes used for compilation should be inferior than the size of inputs that you will feed into the model during the inference. - - - - - -- What if input sizes are smaller than compilation input shapes? - -No worries, `NeuronModelForXXX` class will pad your inputs to an eligible shape. Besides you can set `dynamic_batch_size=True` in the `from_pretrained` method to enable dynamic batching, which means that your inputs can have variable batch size. - - - -*(Just keep in mind: dynamicity and padding comes with not only flexibility but also performance drop. Fair enough!)* - - - -## Generative NLP models - -As explained before, you will need only a few modifications to your Transformers code to export and run NLP models: - -### Configuring the export of a generative model - -As for non-generative models, two sets of parameters can be passed to the `from_pretrained()` method to configure how a transformers checkpoint is exported to -a neuron optimized model: - -- `compiler_args = { num_cores, auto_cast_type }` are optional arguments for the compiler, these arguments usually control how the compiler makes tradeoff between the inference latency and throughput and the accuracy. 
-- `input_shapes = { batch_size, sequence_length }` correspond to the static shape of the model input and the KV-cache (attention keys and values for past tokens). - -- `num_cores` is the number of neuron cores used when instantiating the model. Each neuron core has 16 Gb of memory, which means that -bigger models need to be split on multiple cores. Defaults to 1, -- `auto_cast_type` specifies the format to encode the weights. It can be one of `fp32` (`float32`), `fp16` (`float16`) or `bf16` (`bfloat16`). Defaults to `fp32`. -- `batch_size` is the number of input sequences that the model will accept. Defaults to 1, -- `sequence_length` is the maximum number of tokens in an input sequence. Defaults to `max_position_embeddings` (`n_positions` for older models). - -```diff -from transformers import AutoTokenizer --from transformers import AutoModelForCausalLM -+from optimum.neuron import NeuronModelForCausalLM - -# Instantiate and convert to Neuron a PyTorch checkpoint -+compiler_args = {"num_cores": 1, "auto_cast_type": 'fp32'} -+input_shapes = {"batch_size": 1, "sequence_length": 512} --model = AutoModelForCausalLM.from_pretrained("gpt2") -+model = NeuronModelForCausalLM.from_pretrained("gpt2", export=True, **compiler_args, **input_shapes) -``` - -As explained before, these parameters can only be configured during export. -This means in particular that during inference: - -- the `batch_size` of the inputs should be equal to the `batch_size` used during export, -- the `length` of the input sequences should be lower than the `sequence_length` used during export, -- the maximum number of tokens (input + generated) cannot exceed the `sequence_length` used during export. - -### Text generation inference - -As with the original transformers models, use `generate()` instead of `forward()` to generate text sequences. - -```diff -from transformers import AutoTokenizer --from transformers import AutoModelForCausalLM -+from optimum.neuron import NeuronModelForCausalLM - -# Instantiate and convert to Neuron a PyTorch checkpoint --model = AutoModelForCausalLM.from_pretrained("gpt2") -+model = NeuronModelForCausalLM.from_pretrained("gpt2", export=True) - -tokenizer = AutoTokenizer.from_pretrained("gpt2") -tokenizer.pad_token_id = tokenizer.eos_token_id - -tokens = tokenizer("I really wish ", return_tensors="pt") -with torch.inference_mode(): - sample_output = model.generate( - **tokens, - do_sample=True, - min_length=128, - max_length=256, - temperature=0.7, - ) - outputs = [tokenizer.decode(tok) for tok in sample_output] - print(outputs) -``` - -The generation is highly configurable. Please refer to https://huggingface.co/docs/transformers/generation_strategies for details. - -Please be aware that: - -- for each model architecture, default values are provided for all parameters, but values passed to the `generate` method will take precedence, -- the generation parameters can be stored in a `generation_config.json` file. When such a file is present in model directory, -it will be parsed to set the default parameters (the values passed to the `generate` method still take precedence). - - -Happy inference with Neuron! 
🚀
diff --git a/docs/source/guides/neuronx_tgi.mdx b/docs/source/guides/neuronx_tgi.mdx
new file mode 100644
index 000000000..9f4552830
--- /dev/null
+++ b/docs/source/guides/neuronx_tgi.mdx
@@ -0,0 +1,183 @@
+# NeuronX Text-generation-inference for AWS inferentia2
+
+Text Generation Inference ([TGI](https://huggingface.co/docs/text-generation-inference/)) is a toolkit for deploying and serving Large Language Models (LLMs).
+
+It is available for Inferentia2.
+
+## Features
+
+The basic TGI features are supported:
+
+- continuous batching,
+- token streaming,
+- greedy search and multinomial sampling using [transformers](https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation).
+
+## License
+
+NeuronX TGI is released under an [Apache2 License](https://github.com/huggingface/text-generation-inference?tab=Apache-2.0-1-ov-file#readme).
+
+## Deploy the service from the Hugging Face hub
+
+The simplest way to deploy the NeuronX TGI service for a specific model is to follow the
+deployment instructions in the model card:
+
+- click on the "Deploy" button on the right,
+- select your deployment service ("Inference Endpoints" and "SageMaker" are supported),
+- select "AWS Inferentia",
+- follow the instructions.
+
+## Deploy the service on a dedicated host
+
+The service is launched simply by running the neuronx-tgi container with two sets of parameters:
+
+```
+docker run ghcr.io/huggingface/neuronx-tgi:latest 
+```
+
+- system parameters are used to map ports, volumes and devices between the host and the service,
+- service parameters are forwarded to the `text-generation-launcher`.
+
+When deploying a service, you will need a pre-compiled Neuron model. The NeuronX TGI service supports two main modes of operation:
+
+- you can either deploy the service on a model that has already been exported to Neuron,
+- or alternatively you can take advantage of the Neuron Model Cache to export your own model.
+
+### Common system parameters
+
+Whenever you launch a TGI service, we highly recommend mounting a shared volume as `/data` in the container: this is where
+the models will be cached to speed up further instantiations of the service.
+
+Note also that enough neuron devices must be visible to the container. The simplest way to achieve that is to launch the service in `privileged` mode to get access to all neuron devices.
+Alternatively, each device can be explicitly exposed using the `--device` option.
+
+Finally, you might want to set the `HF_TOKEN` environment variable if you need to access gated repositories.
+
+Here is an example of a service instantiation:
+
+```
+docker run -p 8080:80 \
+  -v $(pwd)/data:/data \
+  --privileged \
+  -e HF_TOKEN=${HF_TOKEN} \
+  ghcr.io/huggingface/neuronx-tgi:latest \
+  
+```
+
+If you only want to map the first device, the launch command becomes:
+
+```
+docker run -p 8080:80 \
+  -v $(pwd)/data:/data \
+  --device=/dev/neuron0 \
+  -e HF_TOKEN=${HF_TOKEN} \
+  ghcr.io/huggingface/neuronx-tgi:latest \
+  
+```
+
+### Using a standard model from the 🤗 [HuggingFace Hub](https://huggingface.co/models) (recommended)
+
+We maintain a Neuron Model Cache of the most popular architectures and deployment parameters under [aws-neuron/optimum-neuron-cache](https://huggingface.co/aws-neuron/optimum-neuron-cache).
+
+If you just want to try the service quickly using a model that has not been exported to Neuron yet, you can thus still
+export it dynamically, provided that:
+- you specify the export parameters when launching the service (or use the default parameters),
+- the model configuration is cached.
+
+The snippet below shows how you can deploy a service from a hub standard model:
+
+```
+docker run -p 8080:80 \
+  -v $(pwd)/data:/data \
+  --privileged \
+  -e HF_TOKEN=${HF_TOKEN} \
+  -e HF_AUTO_CAST_TYPE="fp16" \
+  -e HF_NUM_CORES=2 \
+  ghcr.io/huggingface/neuronx-tgi:latest \
+  --model-id NousResearch/Llama-2-7b-chat-hf \
+  --max-batch-size 1 \
+  --max-input-length 3164 \
+  --max-total-tokens 4096
+```
+
+### Using a model exported to a local path
+
+Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi) locally.
+
+You can then deploy the service using the model stored in the shared volume:
+
+```
+docker run -p 8080:80 \
+  -v $(pwd)/data:/data \
+  --privileged \
+  ghcr.io/huggingface/neuronx-tgi:latest \
+  --model-id /data/
+```
+
+Note: You don't need to specify any service parameters, as they will all be deduced from the model export configuration.
+
+### Using a neuron model from the 🤗 [HuggingFace Hub](https://huggingface.co/)
+
+The easiest way to share a neuron model inside your organization is to push it to the Hugging Face hub, so that it can be deployed directly without requiring an export.
+
+The snippet below shows how you can deploy a service from a hub neuron model:
+
+```
+docker run -p 8080:80 \
+  -v $(pwd)/data:/data \
+  --privileged \
+  -e HF_TOKEN=${HF_TOKEN} \
+  ghcr.io/huggingface/neuronx-tgi:latest \
+  --model-id /
+```
+
+### Choosing service parameters
+
+Use the following command to list the available service parameters:
+
+```
+docker run ghcr.io/huggingface/neuronx-tgi --help
+```
+
+The configuration of an inference endpoint is always a compromise between throughput and latency: serving more requests in parallel will allow a higher throughput, but it will increase the latency.
+
+The neuron models have static input dimensions `[batch_size, max_length]`.
+
+This adds several restrictions to the following parameters (a worked example is given below):
+
+- `--max-batch-size` must be set to `batch_size`,
+- `--max-input-length` must be lower than `max_length`,
+- `--max-total-tokens` must be set to `max_length` (it is per-request).
+
+Although not strictly necessary, the following is important for efficient prefilling:
+
+- `--max-batch-prefill-tokens` should be set to `batch_size` * `max-input-length`.
+
+### Choosing the correct batch size
+
+As seen in the previous paragraph, the neuron model static batch size has a direct influence on the endpoint latency and throughput.
+
+Please refer to [text-generation-inference](https://github.com/huggingface/text-generation-inference) for optimization hints.
+
+Note that the main constraint is to be able to fit the model for the specified `batch_size` within the total device memory available
+on your instance (16GB per neuron core, with 2 cores per device).
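+For instance, as a worked example with hypothetical export values, a model exported with `batch_size = 4` and `sequence_length = 4096` would lead to the following consistent configuration:
+
+- `--max-batch-size 4`,
+- `--max-input-length 3072` (any value strictly lower than 4096),
+- `--max-total-tokens 4096`,
+- `--max-batch-prefill-tokens 12288` (i.e. 4 * 3072).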
+ +## Query the service + +You can query the model using either the `/generate` or `/generate_stream` routes: + +``` +curl 127.0.0.1:8080/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ + -H 'Content-Type: application/json' +``` + +``` +curl 127.0.0.1:8080/generate_stream \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ + -H 'Content-Type: application/json' +``` + +Note: replace 127.0.0.1:8080 with your actual IP address and port. diff --git a/docs/source/tutorials/llama2-13b-chatbot.mdx b/docs/source/tutorials/llama2-13b-chatbot.mdx index 40458786e..1bca187e5 100644 --- a/docs/source/tutorials/llama2-13b-chatbot.mdx +++ b/docs/source/tutorials/llama2-13b-chatbot.mdx @@ -33,12 +33,9 @@ For this guide, we will use the non-gated [NousResearch/Llama-2-13b-chat-hf](htt This model is part of the **Llama 2** family of models, and has been tuned to recognize chat interactions between a *user* and an *assistant* (more on that later). -As explained in the [optimum-neuron documentation](https://huggingface.co/docs/optimum-neuron/guides/export_model#why-compile-to-neuron-model) +As explained in the [optimum-neuron documentation](https://huggingface.co/docs/optimum-neuron/guides/export_model#exporting-llm-models-to-neuron) , models need to be compiled and exported to a serialized format before running them on Neuron devices. -Fortunately, 🤗 **optimum-neuron** offers a [very simple API](https://huggingface.co/docs/optimum-neuron/guides/models#configuring-the-export-of-a-generative-model) -to export standard 🤗 [transformers models](https://huggingface.co/docs/transformers/index) to the Neuron format. - When exporting the model, we will specify two sets of parameters: - using *compiler_args*, we specify on how many cores we want the model to be deployed (each neuron device has two cores), and with which precision (here *float16*), diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index d75aa03b8..833485000 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from requests.exceptions import ConnectionError as RequestsConnectionError -from transformers import AutoConfig, PretrainedConfig +from transformers import AutoConfig, AutoTokenizer, PretrainedConfig from ...neuron.utils import ( DECODER_NAME, @@ -603,6 +603,7 @@ def main_export( def decoder_export( model_name_or_path: str, output: Union[str, Path], + trust_remote_code: Optional[bool] = None, **kwargs, ): from ...neuron import NeuronModelForCausalLM @@ -611,8 +612,15 @@ def decoder_export( if not output.parent.exists(): output.parent.mkdir(parents=True) - model = NeuronModelForCausalLM.from_pretrained(model_name_or_path, export=True, **kwargs) + model = NeuronModelForCausalLM.from_pretrained( + model_name_or_path, export=True, trust_remote_code=trust_remote_code, **kwargs + ) model.save_pretrained(output) + try: + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + tokenizer.save_pretrained(output) + except Exception: + logger.warning(f"No tokenizer found while exporting {model_name_or_path}.") def main(): diff --git a/optimum/neuron/modeling_decoder.py b/optimum/neuron/modeling_decoder.py index 38575e1bb..9dbb3fdf8 100644 --- a/optimum/neuron/modeling_decoder.py +++ b/optimum/neuron/modeling_decoder.py @@ -252,6 +252,7 @@ def 
_create_checkpoint( # Save the model checkpoint in a temporary directory checkpoint_dir = TemporaryDirectory() + os.chmod(checkpoint_dir.name, 0o775) model.save_pretrained(checkpoint_dir.name) return checkpoint_dir diff --git a/text-generation-inference/README.md b/text-generation-inference/README.md index c38d88ab3..e22de648a 100644 --- a/text-generation-inference/README.md +++ b/text-generation-inference/README.md @@ -1,201 +1,16 @@ -# Text-generation-inference docker image for AWS inferentia2 +# NeuronX TGI: Text-generation-inference for AWS inferentia2 -This docker image integrates into a base image: +NeuronX TGI is distributed as docker images for [EC2](https://github.com/huggingface/optimum-neuron/pkgs/container/neuronx-tgi) and SageMaker. + +These docker images integrate: - the AWS Neuron SDK for Inferentia2, - the [Text Generation Inference](https://github.com/huggingface/text-generation-inference) launcher and scheduling front-end, - a neuron specific inference server for text-generation. -## Features - -The basic features of the [Text Generation Inference](https://github.com/huggingface/text-generation-inference) product are supported: - -- continuous batching, -- token streaming, -- greedy search and multinomial sampling using [transformers](https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation). - -The main differences with the standard service for CUDA and CPU backends are that: - -- the service uses a single internal static batch, -- new requests are inserted in the static batch during prefill, -- the static KV cache is rebuilt entirely during prefill. - -## License - -This docker image is released under [HFOIL 1.0](https://github.com/huggingface/text-generation-inference/blob/bde25e62b33b05113519e5dbf75abda06a03328e/LICENSE). - -HFOIL stands for Hugging Face Optimized Inference License, and it has been specifically designed for our optimized inference solutions. While the source code remains accessible, HFOIL is not a true open source license because we added a restriction: to sell a hosted or managed service built on top of TGI, we require a separate agreement. - -Please refer to [this reference documentation](https://github.com/huggingface/text-generation-inference/issues/726) to see if the HFOIL 1.0 restrictions apply to your deployment. - -## Deploy the service - -The service is launched simply by running the neuronx-tgi container with two sets of parameters: - -``` -docker run ghcr.io/huggingface/neuronx-tgi:latest -``` - -- system parameters are used to map ports, volumes and devices between the host and the service, -- service parameters are forwarded to the `text-generation-launcher`. - -When deploying a service, you will need a working Neuron model. The NeuronX TGI service supports two main modes of operation: - -- you can either deploy the service on a model that has already been exported to Neuron, -- or alternatively you can take advantage of the Neuron Model Cache to export your own model. - -### Common system parameters - -Whenever you launch a TGI service, we highly recommend you to mount a shared volume mounted as `/data` in the container: this is where -the models will be cached to speed up further instantiations of the service. - -Note also that all neuron devices have to be explicitly made visible to the container. - -Finally, you might want to export the `HF_TOKEN` if you want to access gated repository. 
- -Here is an example of a service instantiation: - -``` -docker run -p 8080:80 \ - -v $(pwd)/data:/data \ - --device=/dev/neuron0 \ - -e HF_TOKEN=${HF_TOKEN} \ - ghcr.io/huggingface/neuronx-tgi:latest \ - -``` - -If your instance has 12 neuron devices, the launch command becomes: - -``` -docker run -p 8080:80 \ - -v $(pwd)/data:/data \ - --device=/dev/neuron0 \ - --device=/dev/neuron1 \ - --device=/dev/neuron2 \ - --device=/dev/neuron3 \ - --device=/dev/neuron4 \ - --device=/dev/neuron5 \ - --device=/dev/neuron6 \ - --device=/dev/neuron7 \ - --device=/dev/neuron8 \ - --device=/dev/neuron9 \ - --device=/dev/neuron10 \ - --device=/dev/neuron11 \ - -e HF_TOKEN=${HF_TOKEN} \ - ghcr.io/huggingface/neuronx-tgi:latest \ - -``` - - -### Using a neuron model from the 🤗 [HuggingFace Hub](https://huggingface.co/aws-neuron) (recommended) - -There are plenty of already exported neuron models on the hub, under the [aws-neuron](https://huggingface.co/aws-neuron) organization. - -The snippet below shows how you can deploy a service from a hub neuron model: - -``` -docker run -p 8080:80 \ - -v $(pwd)/data:/data \ - --device=/dev/neuron0 \ - -e HF_TOKEN=${HF_TOKEN} \ - ghcr.io/huggingface/neuronx-tgi:latest \ - --model-id aws-neuron/Llama-2-7b-hf-neuron-budget \ - --max-batch-size 1 \ - --max-input-length 1024 \ - --max-total-tokens 2048 -``` - -### Using a standard model from the 🤗 [HuggingFace Hub](https://huggingface.co/aws-neuron) - - -We maintain a Neuron Model Cache of the most popular architecture and deployment parameters under [aws-neuron/optimum-neuron-cache](https://huggingface.co/aws-neuron/optimum-neuron-cache). - -If you just want to try the service quickly using a model that has not bee exported yet, it is thus still -possible to export it dynamically, pending some conditions: -- you must specify the export parameters when launching the service (or use default parameters), -- the model configuration must be cached. - -The snippet below shows how you can deploy a service from a hub standard model: - -``` -docker run -p 8080:80 \ - -v $(pwd)/data:/data \ - --device=/dev/neuron0 \ - -e HF_TOKEN=${HF_TOKEN} \ - -e HF_BATCH_SIZE=1 \ - -e HF_SEQUENCE_LENGTH=4096 \ - -e HF_AUTO_CAST_TYPE="fp16" \ - -e HF_NUM_CORES=2 \ - ghcr.io/huggingface/neuronx-tgi:latest \ - --model-id NousResearch/Llama-2-7b-chat-hf \ - --max-batch-size 1 \ - --max-input-length 3164 \ - --max-total-tokens 4096 -``` - -### Using a model exported to a local path - -Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/models#configuring-the-export-of-a-generative-model) locally, and deploy the service inside the shared volume: - -``` -docker run -p 8080:80 \ - -v $(pwd)/data:/data \ - --device=/dev/neuron0 \ - ghcr.io/huggingface/neuronx-tgi:latest \ - --model-id /data/ \ - ... -``` - -### Choosing service parameters - -Use the following command to list the available service parameters: - -``` -docker run ghcr.io/huggingface/neuronx-tgi --help -``` - -The configuration of an inference endpoint is always a compromise between throughput and latency: serving more requests in parallel will allow a higher throughput, but it will increase the latency. - -The neuron models have static input dimensions `[batch_size, max_length]`. 
- -This adds several restrictions to the following parameters: - -- `--max-batch-size` must be set to `batch size`, -- `--max-input-length` must be lower than `max_length`, -- `--max-total-tokens` must be set to `max_length` (it is per-request). - -Although not strictly necessary, but important for efficient prefilling: - -- `--max-batch-prefill-tokens` should be set to `batch_size` * `max-input-length`. - -### Choosing the correct batch size - -As seen in the previous paragraph, neuron model static batch size has a direct influence on the endpoint latency and throughput. - -Please refer to [text-generation-inference](https://github.com/huggingface/text-generation-inference) for optimization hints. +## Usage -Note that the main constraint is to be able to fit the model for the specified `batch_size` within the total device memory available -on your instance (16GB per neuron core, with 2 cores per device). - -All neuron models on the 🤗 [HuggingFace Hub](https://huggingface.co/aws-neuron) include the number of cores required to run them. - -## Query the service - -You can query the model using either the `/generate` or `/generate_stream` routes: - -``` -curl 127.0.0.1:8080/generate \ - -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ - -H 'Content-Type: application/json' -``` - -``` -curl 127.0.0.1:8080/generate_stream \ - -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ - -H 'Content-Type: application/json' -``` +Please refer to the official [documentation](https://huggingface.co/docs/optimum-neuron/main/en/guides/neuronx_tgi). ## Build your own image diff --git a/text-generation-inference/server/text_generation_server/generator.py b/text-generation-inference/server/text_generation_server/generator.py index c6f3bb193..0c1d07851 100644 --- a/text-generation-inference/server/text_generation_server/generator.py +++ b/text-generation-inference/server/text_generation_server/generator.py @@ -585,7 +585,7 @@ def from_pretrained(cls, model_id: str, revision: str = None): ) else: logger.info("Loading model on neuron devices (this can take a few minutes).") - model = NeuronModelForCausalLM.from_pretrained(model_id, revision=revision) + model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision) end = time.time() logger.info(f"Model successfully loaded in {end - start:.2f} s.") tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) diff --git a/text-generation-inference/server/text_generation_server/model.py b/text-generation-inference/server/text_generation_server/model.py index 726140557..c4a692c95 100644 --- a/text-generation-inference/server/text_generation_server/model.py +++ b/text-generation-inference/server/text_generation_server/model.py @@ -13,10 +13,10 @@ def get_export_kwargs_from_env(): - batch_size = os.environ.get("HF_BATCH_SIZE", None) + batch_size = os.environ.get("MAX_BATCH_SIZE", None) if batch_size is not None: batch_size = int(batch_size) - sequence_length = os.environ.get("HF_SEQUENCE_LENGTH", None) + sequence_length = os.environ.get("MAX_TOTAL_TOKENS", None) if sequence_length is not None: sequence_length = int(sequence_length) num_cores = os.environ.get("HF_NUM_CORES", None) @@ -97,9 +97,12 @@ def fetch_model( export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs) neuron_config = export_config.neuron if not is_cached(model_id, neuron_config): + hub_cache_url = 
"https://huggingface.co/aws-neuron/optimum-neuron-cache" + neuron_export_url = "https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi" error_msg = ( f"No cached version found for {model_id} with {neuron_config}." - "You can start a discussion to request it on https://huggingface.co/aws-neuron/optimum-neuron-cache." + f"You can start a discussion to request it on {hub_cache_url}" + f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}" ) raise ValueError(error_msg) logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.") diff --git a/text-generation-inference/tests/fixtures/service.py b/text-generation-inference/tests/fixtures/service.py index c5ab82ca4..c4b165e6c 100644 --- a/text-generation-inference/tests/fixtures/service.py +++ b/text-generation-inference/tests/fixtures/service.py @@ -128,7 +128,7 @@ def docker_launcher( env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN env["HF_TOKEN"] = HF_TOKEN - for var in ["HF_BATCH_SIZE", "HF_SEQUENCE_LENGTH", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]: + for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]: if var in os.environ: env[var] = os.environ[var] diff --git a/text-generation-inference/tests/integration/test_implicit_env.py b/text-generation-inference/tests/integration/test_implicit_env.py index 8d200b8fc..8110e3cb7 100644 --- a/text-generation-inference/tests/integration/test_implicit_env.py +++ b/text-generation-inference/tests/integration/test_implicit_env.py @@ -19,9 +19,7 @@ async def tgi_service(request, launcher, neuron_model_config): "MAX_BATCH_SIZE", "MAX_INPUT_LENGTH", "MAX_TOTAL_TOKEN", - "HF_BATCH_SIZE", "HF_NUM_CORES", - "HF_SEQUENCE_LENGTH", "HF_AUTO_CAST_TYPE", ]: if var in os.environ: diff --git a/text-generation-inference/tgi_env.py b/text-generation-inference/tgi_env.py index 6c18941fb..4584358ae 100755 --- a/text-generation-inference/tgi_env.py +++ b/text-generation-inference/tgi_env.py @@ -17,14 +17,12 @@ logger = logging.getLogger(__name__) tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_LENGTH"] -tgi_server_env_vars = ["HF_BATCH_SIZE", "HF_SEQUENCE_LENGTH", "HF_NUM_CORES", "HF_AUTO_CAST_TYPE"] +tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"] env_config_peering = [ ("MAX_BATCH_SIZE", "batch_size"), ("MAX_TOTAL_TOKENS", "sequence_length"), - ("HF_BATCH_SIZE", "batch_size"), ("HF_AUTO_CAST_TYPE", "auto_cast_type"), - ("HF_SEQUENCE_LENGTH", "sequence_length"), ("HF_NUM_CORES", "num_cores"), ] @@ -41,12 +39,8 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace: argv = sys.argv # All these are params passed to tgi and intercepted here parser.add_argument("--max-input-length", type=int, default=os.getenv("MAX_INPUT_LENGTH", 0)) - parser.add_argument( - "--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", os.getenv("HF_SEQUENCE_LENGTH", 0)) - ) - parser.add_argument( - "--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", os.getenv("HF_BATCH_SIZE", 0)) - ) + parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)) + parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)) parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID")) parser.add_argument("--revision", type=str, default=os.getenv("REVISION")) @@ -62,14 +56,12 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace: # 
from the order of the parser defaults, the tgi router value can override the tgi server ones if args.max_total_tokens > 0: os.environ["MAX_TOTAL_TOKENS"] = str(args.max_total_tokens) - os.environ["HF_SEQUENCE_LENGTH"] = str(args.max_total_tokens) if args.max_input_length > 0: os.environ["MAX_INPUT_LENGTH"] = str(args.max_input_length) if args.max_batch_size > 0: os.environ["MAX_BATCH_SIZE"] = str(args.max_batch_size) - os.environ["HF_BATCH_SIZE"] = str(args.max_batch_size) if args.revision: os.environ["REVISION"] = str(args.revision)