Add tensor parallel support to T5 via NxD (#697)
* add args to command

* stage

* experiment

* stage

* modify wrapper / fix for t5 cli

* 1 bug solved, 3000 to go

* yay, encoder compiled

* decoder done

* export in modeling

* inference done

* add tests

* use parallizer

* fix tests

* fix tests

* fix tests

* add doc

* fix typo

* fix typo

* improve doc

* improve doc

* improve doc layout

* apply suggestions

* fix tests

* apply suggestion

* add suggestions
JingyaHuang authored Oct 24, 2024
1 parent 0ea7285 commit 1de603e
Showing 17 changed files with 678 additions and 174 deletions.
25 changes: 10 additions & 15 deletions benchmark/text-generation-inference/performance/generate_csv.py
@@ -3,7 +3,6 @@
 import os
 
 import pandas as pd
-
 from guidellm.core import GuidanceReport, TextGenerationBenchmark
 
 
@@ -16,11 +15,7 @@ def _benchmark_rate_id(benchmark: TextGenerationBenchmark) -> str:
     :return: A string representing the benchmark rate ID.
     :rtype: str
     """
-    rate_id = (
-        f"{benchmark.mode}@{benchmark.rate:.2f} req/sec"
-        if benchmark.rate
-        else f"{benchmark.mode}"
-    )
+    rate_id = f"{benchmark.mode}@{benchmark.rate:.2f} req/sec" if benchmark.rate else f"{benchmark.mode}"
     return rate_id


@@ -38,20 +33,20 @@ def main():
     for path in paths:
         filename = os.path.basename(path)
         # Extract model_id
-        model_id, date = filename.replace(suffix, '').split('#')
+        model_id, date = filename.replace(suffix, "").split("#")
         with open(path) as f:
             report = GuidanceReport.from_json(f.read())
             for benchmark in report.benchmarks:
                 for b in benchmark.benchmarks_sorted:
                     d = {
-                        "model_id": model_id,
-                        "Date": date,
-                        "Input type": _benchmark_rate_id(b),
-                        "Requests per Second": b.completed_request_rate,
-                        "Request Latency (s)": b.request_latency,
-                        "Time-to-first-token (ms)": b.time_to_first_token,
-                        "Inter Token Latency (ms)": b.inter_token_latency,
-                        "Output Token Throughput (t/s)": b.output_token_throughput,
+                        "model_id": model_id,
+                        "Date": date,
+                        "Input type": _benchmark_rate_id(b),
+                        "Requests per Second": b.completed_request_rate,
+                        "Request Latency (s)": b.request_latency,
+                        "Time-to-first-token (ms)": b.time_to_first_token,
+                        "Inter Token Latency (ms)": b.inter_token_latency,
+                        "Output Token Throughput (t/s)": b.output_token_throughput,
                     }
                     results.append(pd.DataFrame.from_dict(d, orient="index").transpose())

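The reformatted `_benchmark_rate_id` is behavior-preserving. A quick, self-contained check of what it returns, using `SimpleNamespace` as a stand-in for the real `TextGenerationBenchmark` object (an assumption for illustration only):

from types import SimpleNamespace

def _benchmark_rate_id(benchmark) -> str:
    # Same logic as the one-liner above, minus the type annotation,
    # so the sketch runs without importing guidellm.
    return f"{benchmark.mode}@{benchmark.rate:.2f} req/sec" if benchmark.rate else f"{benchmark.mode}"

print(_benchmark_rate_id(SimpleNamespace(mode="constant", rate=2.0)))       # constant@2.00 req/sec
print(_benchmark_rate_id(SimpleNamespace(mode="synchronous", rate=None)))   # synchronous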
5 changes: 5 additions & 0 deletions docs/source/package_reference/modeling.mdx
@@ -68,6 +68,11 @@ The following Neuron model classes are available for natural language processing
 [[autodoc]] modeling.NeuronModelForCausalLM
     - forward
 
+### NeuronModelForSeq2SeqLM
+
+[[autodoc]] modeling_seq2seq.NeuronModelForSeq2SeqLM
+    - forward
+
 ## Computer Vision
 
 The following Neuron model classes are available for computer vision tasks.
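The documentation entry above covers the class this PR extends with tensor parallelism. A minimal usage sketch, assuming the checkpoint name and compile-time shape arguments (illustrative values, not taken from this diff):

from optimum.neuron import NeuronModelForSeq2SeqLM

# Export a T5 checkpoint with encoder and decoder sharded across 2 Neuron cores.
# batch_size / sequence_length / num_beams are placeholder compile-time shapes;
# pick values that match your workload.
model = NeuronModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-small",
    export=True,
    tensor_parallel_size=2,
    batch_size=1,
    sequence_length=64,
    num_beams=4,
)
model.save_pretrained("t5_neuron_tp2/")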
6 changes: 6 additions & 0 deletions optimum/commands/export/neuronx.py
@@ -112,6 +112,12 @@ def parse_args_neuronx(parser: "ArgumentParser"):
         choices=["bf16", "fp16", "tf32"],
         help='The data type to cast FP32 operations to when auto-cast mode is enabled. Can be `"bf16"`, `"fp16"` or `"tf32"`.',
     )
+    optional_group.add_argument(
+        "--tensor_parallel_size",
+        type=int,
+        default=1,
+        help="Tensor parallelism size, the number of neuron cores on which to shard the model.",
+    )
     optional_group.add_argument(
         "--dynamic-batch-size",
         action="store_true",
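The new `--tensor_parallel_size` flag plugs into the existing `optimum-cli export neuron` command. An illustrative invocation, with the model name, task, and shape flags as assumptions rather than values taken from this diff:

optimum-cli export neuron \
  --model google/flan-t5-small \
  --task text2text-generation \
  --tensor_parallel_size 2 \
  --batch_size 1 \
  --sequence_length 64 \
  --num_beams 4 \
  t5_neuron_tp2/

Note the underscores in `--tensor_parallel_size`, in contrast to the hyphenated `--dynamic-batch-size` declared just below it.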
26 changes: 25 additions & 1 deletion optimum/exporters/neuron/__main__.py
@@ -264,6 +264,7 @@ def get_submodels_and_neuron_configs(
     task: str,
     output: Path,
     library_name: str,
+    tensor_parallel_size: int = 1,
     subfolder: str = "",
     dynamic_batch_size: bool = False,
     model_name_or_path: Optional[Union[str, Path]] = None,
@@ -300,7 +301,14 @@
     elif is_encoder_decoder:
         optional_outputs = {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states}
         models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_encoder_decoder(
-            model, input_shapes, task, output, dynamic_batch_size, model_name_or_path, **optional_outputs
+            model=model,
+            input_shapes=input_shapes,
+            tensor_parallel_size=tensor_parallel_size,
+            task=task,
+            output=output,
+            dynamic_batch_size=dynamic_batch_size,
+            model_name_or_path=model_name_or_path,
+            **optional_outputs,
         )
     else:
         # TODO: Enable optional outputs for encoders
@@ -427,6 +435,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
 def _get_submodels_and_neuron_configs_for_encoder_decoder(
     model: "PreTrainedModel",
     input_shapes: Dict[str, int],
+    tensor_parallel_size: int,
     task: str,
     output: Path,
     dynamic_batch_size: bool = False,
@@ -442,15 +451,19 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder(
     models_and_neuron_configs = get_encoder_decoder_models_for_export(
         model=model,
         task=task,
+        tensor_parallel_size=tensor_parallel_size,
         dynamic_batch_size=dynamic_batch_size,
         input_shapes=input_shapes,
         output_attentions=output_attentions,
         output_hidden_states=output_hidden_states,
+        model_name_or_path=model_name_or_path,
     )
     output_model_names = {
         ENCODER_NAME: os.path.join(ENCODER_NAME, NEURON_FILE_NAME),
         DECODER_NAME: os.path.join(DECODER_NAME, NEURON_FILE_NAME),
     }
     model.config.save_pretrained(output)
+    model.generation_config.save_pretrained(output)
     maybe_save_preprocessors(model_name_or_path, output)
 
     return models_and_neuron_configs, output_model_names
@@ -475,6 +488,7 @@ def load_models_and_neuron_configs(
     lora_weight_names: Optional[Union[str, List[str]]],
     lora_adapter_names: Optional[Union[str, List[str]]],
     lora_scales: Optional[Union[float, List[float]]],
+    tensor_parallel_size: int = 1,
     controlnet_ids: Optional[Union[str, List[str]]] = None,
     output_attentions: bool = False,
     output_hidden_states: bool = False,
@@ -499,6 +513,7 @@
     models_and_neuron_configs, output_model_names = get_submodels_and_neuron_configs(
         model=model,
         input_shapes=input_shapes,
+        tensor_parallel_size=tensor_parallel_size,
         task=task,
         library_name=library_name,
         output=output,
@@ -522,6 +537,7 @@ def main_export(
     model_name_or_path: str,
     output: Union[str, Path],
     compiler_kwargs: Dict[str, Any],
+    tensor_parallel_size: int = 1,
     model: Optional[Union["PreTrainedModel", "ModelMixin"]] = None,
     task: str = "auto",
     dynamic_batch_size: bool = False,
@@ -563,6 +579,7 @@
         model_name_or_path=model_name_or_path,
         output=output,
         model=model,
+        tensor_parallel_size=tensor_parallel_size,
         task=task,
         dynamic_batch_size=dynamic_batch_size,
         cache_dir=cache_dir,
@@ -597,6 +614,12 @@
     )
 
     # Validate compiled model
+    if do_validation and tensor_parallel_size > 1:
+        # TODO: support the validation of tp models.
+        logger.warning(
+            "The validation is not yet supported for tensor parallel model, the validation will be turned off."
+        )
+        do_validation = False
     if do_validation is True:
         try:
             validate_models_outputs(
@@ -698,6 +721,7 @@ def main():
         model_name_or_path=args.model,
         output=args.output,
         compiler_kwargs=compiler_kwargs,
+        tensor_parallel_size=args.tensor_parallel_size,
         task=task,
         dynamic_batch_size=args.dynamic_batch_size,
         atol=args.atol,
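Taken together, these changes thread `tensor_parallel_size` from the CLI entry point through `load_models_and_neuron_configs` down to the per-model Neuron configs, and skip output validation when sharding is enabled. A hedged sketch of the equivalent programmatic call (the import path and argument values are assumptions based on this diff, not a verified recipe):

from optimum.exporters.neuron import main_export

# With tensor_parallel_size > 1, the T5 encoder/decoder are sharded across
# Neuron cores and, per the diff above, output validation is turned off
# with a warning.
main_export(
    model_name_or_path="google/flan-t5-small",  # illustrative checkpoint
    output="t5_neuron_tp2/",
    compiler_kwargs={},
    tensor_parallel_size=2,
    task="text2text-generation",
)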
10 changes: 10 additions & 0 deletions optimum/exporters/neuron/base.py
@@ -146,6 +146,7 @@ def __init__(
         task: str,
         compiler_type: Optional[str] = None,
         compiler_version: Optional[str] = None,
+        tensor_parallel_size: int = 1,
         batch_size: Optional[int] = None,
         text_batch_size: Optional[int] = None,
         image_batch_size: Optional[int] = None,
@@ -174,6 +175,7 @@
         self._config = config
         self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
         self.mandatory_axes = ()
+        self.tensor_parallel_size = tensor_parallel_size
         self.task = task
         self._axes: Dict[str, int] = {}
         self.dynamic_batch_size = dynamic_batch_size
@@ -227,6 +229,14 @@ def task(self, value: str):
         self._task = value
         self.mandatory_axes = self.get_mandatory_axes_for_task(self.task)
 
+    @property
+    def tensor_parallel_size(self) -> int:
+        return self._tensor_parallel_size
+
+    @tensor_parallel_size.setter
+    def tensor_parallel_size(self, value: int):
+        self._tensor_parallel_size = value
+
     def __getattr__(self, attr_name) -> Any:
         if attr_name != "_axes" and attr_name in self._axes:
             return self._axes[attr_name]
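The property/setter pair mirrors how `task` is handled in the same class, giving a single place to hook validation later. A self-contained sketch of the pattern (the class name is hypothetical, for illustration only):

class ExportConfigSketch:
    """Stand-in showing the property/setter pattern added to the Neuron config base class."""

    def __init__(self, tensor_parallel_size: int = 1):
        # This assignment goes through the setter below.
        self.tensor_parallel_size = tensor_parallel_size

    @property
    def tensor_parallel_size(self) -> int:
        return self._tensor_parallel_size

    @tensor_parallel_size.setter
    def tensor_parallel_size(self, value: int):
        self._tensor_parallel_size = value


config = ExportConfigSketch(tensor_parallel_size=2)
print(config.tensor_parallel_size)  # 2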
(Diff truncated; the remaining changed files are not shown.)
