diff --git a/.github/workflows/test_trainium_common.yml b/.github/workflows/test_trainium_common.yml
index e665aefa1..d7cbc1e84 100644
--- a/.github/workflows/test_trainium_common.yml
+++ b/.github/workflows/test_trainium_common.yml
@@ -32,6 +32,8 @@ jobs:
         run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
       - name: Set pip repository pointing to the Neuron repository
         run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+      - name: Update pip
+        run: pip install -U pip
       - name: Install Python dependencies
         run: pip install .[tests,neuronx]
       - name: Run tests on Neuron cores
diff --git a/.github/workflows/test_trainium_distributed.yml b/.github/workflows/test_trainium_distributed.yml
index 1c2ebf3e8..bd8d68162 100644
--- a/.github/workflows/test_trainium_distributed.yml
+++ b/.github/workflows/test_trainium_distributed.yml
@@ -35,5 +35,5 @@ jobs:
         run: pip install .[tests,neuronx]
       - name: Run tests on Neuron cores
         run: |
-          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/ -v --durations=0 -x --ignore tests/distributed/test_training.py
diff --git a/docs/source/guides/distributed_training.mdx b/docs/source/guides/distributed_training.mdx
index d22141a4a..d15a332a0 100644
--- a/docs/source/guides/distributed_training.mdx
+++ b/docs/source/guides/distributed_training.mdx
@@ -182,11 +182,11 @@ Just as for ZeRO-1, it is possible to wrap the optimizer class to make it lazy.
 ```python
 from torch.optim import AdamW
 from optimum.neuron import NeuronAccelerator
-from optimum.neuron.accelerate.utils import TensorParallelismPlugin
+from optimum.neuron.accelerate.utils import ModelParallelismPlugin
 from optimum.neuron.distributed import lazy_load_for_parallelism
 
 tensor_parallel_size = 8
-tp_plugin = TensorParallelismPlugin(
+mp_plugin = ModelParallelismPlugin(
     tensor_parallel_size,
     parallelize_embeddings=True,
     sequence_parallel_enabled=True,
@@ -195,7 +195,7 @@ tp_plugin = TensorParallelismPlugin(
 
 accelerator = NeuronAccelerator(
     ...
-    tp_plugin=tp_plugin,
+    mp_plugin=mp_plugin,
 )
 
 with lazy_load_for_parallelism(tensor_parallel_size=tensor_parallel_size):
diff --git a/docs/source/package_reference/distributed.mdx b/docs/source/package_reference/distributed.mdx
index f23ceb6c0..7e295d5a2 100644
--- a/docs/source/package_reference/distributed.mdx
+++ b/docs/source/package_reference/distributed.mdx
@@ -24,7 +24,7 @@ The [`~optimum.neuron.distributed.Parallelizer`] class is the base abstract clas
 
 [[autodoc]] distributed.Parallelizer
     - _parallelize
     - parallelize
-    - optimizer_for_tp
+    - optimizer_for_mp
     - save_model_checkpoint
     - load_model_checkpoint
diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py
old mode 100644
new mode 100755
index 26340a43b..620167685
--- a/examples/image-classification/run_image_classification.py
+++ b/examples/image-classification/run_image_classification.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import sys
+import warnings
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -28,6 +29,7 @@
 from torchvision.transforms import (
     CenterCrop,
     Compose,
+    Lambda,
     Normalize,
     RandomHorizontalFlip,
     RandomResizedCrop,
@@ -56,7 +58,7 @@
 logger = logging.getLogger(__name__)
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.31.0")
+check_min_version("4.35.0")
 
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
 
@@ -143,12 +145,28 @@ class ModelArguments:
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
     image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    token: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
     use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
+        },
+    )
+    trust_remote_code: bool = field(
         default=False,
         metadata={
             "help": (
-                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-                "with private models)."
+                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
+                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                "execute code present on the Hub on your local machine."
             )
         },
     )
@@ -177,6 +195,15 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
+    if model_args.use_auth_token is not None:
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
+        if model_args.token is not None:
+            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+        model_args.token = model_args.use_auth_token
+
     # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
     # information sent is the one passed as arguments along with your Python/PyTorch versions.
send_example_telemetry("run_image_classification", model_args, data_args) @@ -200,8 +227,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -230,7 +257,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, task="image-classification", - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -277,16 +304,21 @@ def compute_metrics(p): finetuning_task="image-classification", cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForImageClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -294,7 +326,8 @@ def compute_metrics(p): model_args.image_processor_name or model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Define torchvision transforms to be applied to each image. @@ -302,7 +335,11 @@ def compute_metrics(p): size = image_processor.size["shortest_edge"] else: size = (image_processor.size["height"], image_processor.size["width"]) - normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) + normalize = ( + Normalize(mean=image_processor.image_mean, std=image_processor.image_std) + if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std") + else Lambda(lambda x: x) + ) _train_transforms = Compose( [ RandomResizedCrop(size), diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index aa0e346c1..d54efc143 100755 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -25,6 +25,7 @@ import math import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -56,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -77,7 +78,7 @@ class ModelArguments: default=None, metadata={ "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." + "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." ) }, ) @@ -112,12 +113,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -135,7 +152,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " "set True will benefit LLM loading time and RAM consumption." ) }, @@ -239,6 +256,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
send_example_telemetry("run_clm", model_args, data_args) @@ -263,8 +289,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -301,7 +327,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): @@ -310,7 +336,7 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( @@ -318,7 +344,7 @@ def main(): data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) else: @@ -340,7 +366,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. @@ -350,7 +376,7 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) raw_datasets["train"] = load_dataset( @@ -358,7 +384,7 @@ def main(): data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) @@ -374,7 +400,8 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) @@ -392,7 +419,8 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) @@ -400,7 +428,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You are instantiating a new tokenizer from scratch. This is not supported by this script. 
" "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) @@ -410,21 +438,28 @@ def main(): if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, torch_dtype=torch_dtype, low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): - model = AutoModelForCausalLM.from_config(config) + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): + model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") @@ -476,17 +511,16 @@ def tokenize_function(examples): if data_args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > config.max_position_embeddings: logger.warning( - "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" - " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" - " override this default with `--block_size xxx`." + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." ) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if data_args.block_size > tokenizer.model_max_length: logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model " f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) @@ -512,7 +546,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): if not data_args.streaming: diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 083694c0e..b917291c6 100755 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -25,6 +25,7 @@ import math import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -54,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -108,12 +109,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -121,7 +138,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " "set True will benefit LLM loading time and RAM consumption." ) }, @@ -239,6 +256,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
send_example_telemetry("run_mlm", model_args, data_args) @@ -263,8 +289,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") @@ -302,7 +328,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): @@ -311,7 +337,7 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( @@ -319,7 +345,7 @@ def main(): data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) else: @@ -336,7 +362,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. @@ -346,14 +372,14 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at @@ -367,7 +393,8 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) @@ -385,7 +412,8 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) @@ -393,26 +421,33 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." 
+ "You are instantiating a new tokenizer from scratch. This is not supported by this script. " "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForMaskedLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: logger.info("Training new model from scratch") - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): - model = AutoModelForMaskedLM.from_config(config) + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): + model = AutoModelForMaskedLM.from_config(config, trust_remote_code=model_args.trust_remote_code) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. @@ -440,7 +475,7 @@ def main(): else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) @@ -525,7 +560,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): if not data_args.streaming: diff --git a/examples/multiple-choice/run_swag.py b/examples/multiple-choice/run_swag.py index cd522127a..fa8396fd0 100755 --- a/examples/multiple-choice/run_swag.py +++ b/examples/multiple-choice/run_swag.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional, Union @@ -48,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") logger = logging.getLogger(__name__) @@ -80,12 +81,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." 
+ ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -226,6 +243,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_swag", model_args, data_args) @@ -250,8 +276,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -293,7 +319,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Downloading and loading the swag dataset from the hub. @@ -301,7 +327,7 @@ def main(): "swag", "regular", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -315,23 +341,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForMultipleChoice.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # When using your own dataset or a different dataset from swag, you will probably need to change this. @@ -351,7 +383,7 @@ def main(): else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index b369571e9..c872e9a05 100755 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -50,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -80,12 +81,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. 
This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -228,6 +245,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_qa", model_args, data_args) @@ -252,8 +278,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -290,7 +316,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -309,7 +335,7 @@ def main(): data_files=data_files, field="data", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -323,23 +349,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Tokenizer check: this script requires a fast tokenizer. 
@@ -367,7 +399,7 @@ def main(): if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index fe5213a8d..abb883c0a 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import List, Optional, Tuple @@ -47,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -81,12 +82,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -155,7 +172,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_answer_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_answer_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." ) @@ -274,6 +291,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
send_example_telemetry("run_seq2seq_qa", model_args, data_args) @@ -298,8 +324,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -336,7 +362,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -354,7 +380,7 @@ def main(): data_files=data_files, field="data", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -368,23 +394,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch @@ -441,13 +473,13 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) diff --git a/examples/question-answering/trainer_seq2seq_qa.py b/examples/question-answering/trainer_seq2seq_qa.py index a4acb5ee6..6e04bf3f6 100644 --- a/examples/question-answering/trainer_seq2seq_qa.py +++ b/examples/question-answering/trainer_seq2seq_qa.py @@ -47,12 +47,13 @@ def evaluate( **gen_kwargs, ) -> Dict[str, float]: gen_kwargs = gen_kwargs.copy() - gen_kwargs["max_length"] = ( - gen_kwargs["max_length"] if gen_kwargs.get("max_length") is not None else self.args.generation_max_length - ) - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) + + # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the + # training args + if gen_kwargs.get("max_length") is None and self.args.generation_max_length is not None: + gen_kwargs["max_length"] = self.args.generation_max_length + if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: + gen_kwargs["num_beams"] = self.args.generation_num_beams self._gen_kwargs = gen_kwargs eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 4b05b3b08..5a442c075 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -53,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -100,12 +101,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -189,7 +206,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." 
) @@ -248,14 +265,14 @@ class DataTrainingArguments: }, ) source_prefix: Optional[str] = field( - default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."} + default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} ) forced_bos_token: Optional[str] = field( default=None, metadata={ "help": ( - "The token to force as the first generated token after the decoder_start_token_id." + "The token to force as the first generated token after the decoder_start_token_id. " "Useful for multilingual models like mBART where the first generated token" "needs to be the target language token (Usually it is the target language token)" ) @@ -313,6 +330,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_summarization", model_args, data_args) @@ -337,8 +363,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -387,7 +413,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -404,7 +430,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -418,23 +444,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch @@ -532,7 +564,7 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) @@ -694,7 +726,13 @@ def compute_metrics(eval_preds): results = {} if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(metric_key_prefix="eval") + if isinstance(eval_dataset, dict): + metrics = {} + for eval_ds_name, eval_ds in eval_dataset.items(): + dataset_metrics = trainer.evaluate(eval_dataset=eval_ds, metric_key_prefix=f"eval_{eval_ds_name}") + metrics.update(dataset_metrics) + else: + metrics = trainer.evaluate(metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 31d2cc67a..75b321be0 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -20,6 +20,7 @@ import os import random import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -49,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -189,12 +190,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -217,6 +234,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_glue", model_args, data_args) @@ -241,8 +267,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -282,7 +308,7 @@ def main(): "glue", data_args.task_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) elif data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. @@ -290,7 +316,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Loading a dataset from your local files. @@ -319,7 +345,7 @@ def main(): "csv", data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Loading a dataset from local json files @@ -327,7 +353,7 @@ def main(): "json", data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -362,23 +388,29 @@ def main(): finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -432,7 +464,7 @@ def main(): if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) diff --git a/examples/text-classification/run_xnli.py b/examples/text-classification/run_xnli.py index 339a649fe..4b06d2653 100755 --- a/examples/text-classification/run_xnli.py +++ b/examples/text-classification/run_xnli.py @@ -21,6 +21,7 @@ import os import random import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -49,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -153,12 +154,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. 
This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -176,6 +193,15 @@ def main(): parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_xnli", model_args) @@ -200,8 +226,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -233,7 +259,7 @@ def main(): model_args.language, split="train", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: train_dataset = load_dataset( @@ -241,7 +267,7 @@ def main(): model_args.train_language, split="train", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) label_list = train_dataset.features["label"].names @@ -251,7 +277,7 @@ def main(): model_args.language, split="validation", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) label_list = eval_dataset.features["label"].names @@ -261,7 +287,7 @@ def main(): model_args.language, split="test", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) label_list = predict_dataset.features["label"].names @@ -279,7 +305,8 @@ def main(): finetuning_task="xnli", cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, @@ -287,16 +314,21 @@ def main(): cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = 
AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py index ba33cd4a5..b8d870a23 100755 --- a/examples/token-classification/run_ner.py +++ b/examples/token-classification/run_ner.py @@ -22,6 +22,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -50,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") @@ -80,12 +81,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -218,6 +235,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
send_example_telemetry("run_ner", model_args, data_args) @@ -242,8 +268,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -280,7 +306,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -349,7 +375,8 @@ def get_label_list(labels): finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path @@ -359,7 +386,8 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, add_prefix_space=True, ) else: @@ -368,17 +396,22 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index cc1d79239..31d40b2c3 100755 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -53,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") @@ -90,12 +91,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -157,7 +174,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." ) @@ -262,6 +279,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_translation", model_args, data_args) @@ -286,8 +312,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -336,7 +362,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -353,10 +379,10 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. + # https://huggingface.co/docs/datasets/loading. 
# Load pretrained model and tokenizer # @@ -367,23 +393,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch @@ -444,7 +476,7 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. 
This will lead to loss being calculated twice and will take up more memory" ) diff --git a/optimum/neuron/__init__.py b/optimum/neuron/__init__.py index f9ceb961d..92082cc7a 100644 --- a/optimum/neuron/__init__.py +++ b/optimum/neuron/__init__.py @@ -47,12 +47,13 @@ "NeuronAccelerator", "NeuronAcceleratorState", "NeuronPartialState", + "ModelParallelismPlugin", ], "pipelines": ["pipeline"], } if TYPE_CHECKING: - from .accelerate import NeuronAccelerator, NeuronAcceleratorState, NeuronPartialState + from .accelerate import ModelParallelismPlugin, NeuronAccelerator, NeuronAcceleratorState, NeuronPartialState from .hf_argparser import NeuronHfArgumentParser from .modeling import ( NeuronModelForCausalLM, diff --git a/optimum/neuron/accelerate/__init__.py b/optimum/neuron/accelerate/__init__.py index e39649fd7..7a611f826 100644 --- a/optimum/neuron/accelerate/__init__.py +++ b/optimum/neuron/accelerate/__init__.py @@ -15,4 +15,4 @@ from .accelerator import NeuronAccelerator from .state import NeuronAcceleratorState, NeuronPartialState -from .utils.dataclasses import NeuronDistributedType +from .utils.dataclasses import ModelParallelismPlugin, NeuronDistributedType diff --git a/optimum/neuron/accelerate/accelerator.py b/optimum/neuron/accelerate/accelerator.py index cf7437175..af3f691ff 100644 --- a/optimum/neuron/accelerate/accelerator.py +++ b/optimum/neuron/accelerate/accelerator.py @@ -15,13 +15,14 @@ """Custom Accelerator class for Neuron.""" import collections +import contextlib import inspect import os import re import shutil from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union import torch from accelerate import Accelerator @@ -34,22 +35,26 @@ from ...utils import logging from ..distributed import Parallelizer, ParallelizersManager from ..utils import ( + DynamicPatch, ModelPatcher, Patcher, is_neuronx_distributed_available, is_torch_xla_available, patch_within_function, + patched_finfo, ) from ..utils.misc import args_and_kwargs_to_kwargs_only -from ..utils.require_utils import requires_neuronx_distributed +from ..utils.require_utils import requires_neuronx_distributed, requires_torch_xla from .optimizer import NeuronAcceleratedOptimizer from .scheduler import NeuronAcceleratedScheduler from .state import NeuronAcceleratorState from .utils import ( + ModelParallelismPlugin, NeuronDistributedType, NeuronFullyShardedDataParallelPlugin, - TensorParallelismPlugin, + get_tied_parameters_dict, patch_accelerate_is_tpu_available, + tie_parameters, ) from .utils.operations import _xla_gather @@ -75,10 +80,25 @@ logger = logging.get_logger(__name__) -# TODO: should we do a XLAFSDPNeuronAccelerator instead? 
+MODEL_PATCHING_SPECS = [ + ("config.layerdrop", 0), + ("no_sync", lambda: contextlib.nullcontext()), + ( + "forward", + DynamicPatch(patch_within_function(("torch.finfo", patched_finfo))), + ), +] + +NxDPPMODEL_PATCHING_SPECS = [ + ( + "forward", + DynamicPatch(patch_within_function(("torch.finfo", patched_finfo))), + ), +] + + class NeuronAccelerator(Accelerator): - # @patch_within_function(("accelerate.accelerator.AcceleratorState", NeuronAcceleratorState)) - def __init__(self, *args, tp_plugin: Optional[TensorParallelismPlugin] = None, zero_1: bool = False, **kwargs): + def __init__(self, *args, mp_plugin: Optional[ModelParallelismPlugin] = None, zero_1: bool = False, **kwargs): # Patches accelerate.utils.imports.is_tpu_available to match `is_torch_xla_available` patch_accelerate_is_tpu_available() @@ -113,18 +133,28 @@ def __init__(self, *args, tp_plugin: Optional[TensorParallelismPlugin] = None, z self.fsdp_plugin = fsdp_plugin use_neuronx_distributed_tp = os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_TP", "false") - if tp_plugin is None: + use_neuronx_distributed_pp = os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_PP", "false") + if mp_plugin is None: if use_neuronx_distributed_tp == "false": tp_size = 1 else: tp_size = int(use_neuronx_distributed_tp) - tp_plugin = TensorParallelismPlugin(tensor_parallel_size=tp_size, parallelize_embeddings=True) + if use_neuronx_distributed_pp == "false": + pp_size = 1 + else: + pp_size = int(use_neuronx_distributed_pp) + mp_plugin = ModelParallelismPlugin( + tensor_parallel_size=tp_size, parallelize_embeddings=True, pipeline_parallel_size=pp_size + ) self._model_cpu_parameters_to_xla = {} - if tp_plugin.should_parallelize: + if mp_plugin.tensor_parallel_size > 1: os.environ["ACCELERATE_USE_NEURONX_DISTRIBUTED_TP"] = "true" - patched_accelerator_state = partial(NeuronAcceleratorState, tp_plugin=tp_plugin) + if mp_plugin.pipeline_parallel_size > 1: + os.environ["ACCELERATE_USE_NEURONX_DISTRIBUTED_PP"] = "true" + + patched_accelerator_state = partial(NeuronAcceleratorState, mp_plugin=mp_plugin) with Patcher([("accelerate.accelerator.AcceleratorState", patched_accelerator_state)]): super().__init__(**full_kwargs) @@ -136,7 +166,7 @@ def __init__(self, *args, tp_plugin: Optional[TensorParallelismPlugin] = None, z if self.process_index == -1 and self.zero_1: raise ValueError("XLA ZeRO Stage 1 can only be enabled in a distributed training setting.") - if fsdp_plugin is not None and tp_plugin is not None: + if fsdp_plugin is not None and mp_plugin is not None: raise ValueError("It is not possible to both use neuronx_distributed Tensor Parallelism and XLA FSDP.") if num_steps != 1: @@ -164,7 +194,7 @@ def _prepare_data_loader_for_distributed( sampler = DistributedSampler(data_loader.dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - data_loader_for_tp = DataLoader( + distributed_dataloader = DataLoader( data_loader.dataset, batch_size=data_loader.batch_size, sampler=sampler, @@ -173,11 +203,11 @@ def _prepare_data_loader_for_distributed( pin_memory=data_loader.pin_memory, drop_last=data_loader.drop_last, ) - data_loader_for_tp._is_accelerate_prepared = True - return data_loader_for_tp + distributed_dataloader._is_accelerate_prepared = True + return distributed_dataloader def prepare_data_loader(self, data_loader: DataLoader, device_placement: Optional[bool] = None): - if self.state.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + if self.state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: from 
neuronx_distributed import parallel_layers num_replicas = parallel_layers.parallel_state.get_data_parallel_size() @@ -187,15 +217,17 @@ def prepare_data_loader(self, data_loader: DataLoader, device_placement: Optiona rank = xm.get_ordinal() if self.state.num_processes > 1: data_loader = self._prepare_data_loader_for_distributed(data_loader, num_replicas=num_replicas, rank=rank) - data_loader = MpDeviceLoader(data_loader, self.device) + # No need to wrap the dataloader if we are using pipeline parallelism. + if self.state.mp_plugin.pipeline_parallel_size == 1: + data_loader = MpDeviceLoader(data_loader, self.device) return data_loader # TODO: fix that. # return super().prepare_data_loader(data_loader, device_placement=device_placement) - def _prepare_optimizer_for_tp(self, optimizer: torch.optim.Optimizer, device_placement=None): + def _prepare_optimizer_for_mp(self, optimizer: torch.optim.Optimizer, device_placement=None): cpu_parameters_to_xla = collections.ChainMap(*self._model_cpu_parameters_to_xla.values()) if not self.zero_1: - optimizer = Parallelizer.optimizer_for_tp(optimizer, cpu_parameters_to_xla) + optimizer = Parallelizer.optimizer_for_mp(optimizer, cpu_parameters_to_xla) else: xla_parameters, _ = Parallelizer.optimizer_cpu_params_to_xla_params(optimizer, cpu_parameters_to_xla) if hasattr(optimizer, "_args_to_recreate"): @@ -234,6 +266,7 @@ def _prepare_optimizer_for_zero_1(self, optimizer: torch.optim.Optimizer, device args, kwargs = optimizer._args_to_recreate params = args[0] defaults = args_and_kwargs_to_kwargs_only(optimizer.__class__, args[1:], kwargs) + zero_1_optimizer = NeuronZero1Optimizer( params, optimizer.__class__, @@ -262,16 +295,36 @@ def _prepare_optimizer_for_zero_1(self, optimizer: torch.optim.Optimizer, device @patch_within_function(("accelerate.accelerator.AcceleratedOptimizer", NeuronAcceleratedOptimizer)) def prepare_optimizer(self, optimizer: torch.optim.Optimizer, device_placement: Optional[bool] = None): - if self.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: - optimizer = self._prepare_optimizer_for_tp(optimizer, device_placement=device_placement) + if self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + optimizer = self._prepare_optimizer_for_mp(optimizer, device_placement=device_placement) if self.zero_1: optimizer = self._prepare_optimizer_for_zero_1(optimizer, device_placement=device_placement) + # Edge case: if the optimizer was created lazily outside of the Model Parallelism and/or ZeRO-1 setting, we make + # sure to actually load the proper parameters. 
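Besides the explicit `mp_plugin` argument shown above, `NeuronAccelerator.__init__` keeps an environment-variable fallback: when no plugin is passed, `ACCELERATE_USE_NEURONX_DISTRIBUTED_TP` and `ACCELERATE_USE_NEURONX_DISTRIBUTED_PP` decide the tensor and pipeline parallel sizes, with the default string `"false"` mapping to a size of 1. A minimal sketch, assuming a Trainium host with `neuronx_distributed` installed and a proper `torchrun`/XLA launch:

```python
import os

# Read by NeuronAccelerator.__init__ when no ModelParallelismPlugin is provided.
os.environ["ACCELERATE_USE_NEURONX_DISTRIBUTED_TP"] = "8"  # tensor parallel size
os.environ["ACCELERATE_USE_NEURONX_DISTRIBUTED_PP"] = "2"  # pipeline parallel size

from optimum.neuron import NeuronAccelerator

# Builds an implicit ModelParallelismPlugin(tensor_parallel_size=8,
# parallelize_embeddings=True, pipeline_parallel_size=2) under the hood.
accelerator = NeuronAccelerator(zero_1=False)
```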
+ if hasattr(optimizer, "_args_to_recreate"): + args, kwargs = optimizer._args_to_recreate + optimizer = optimizer.__class__(*args, **kwargs) + return super().prepare_optimizer(optimizer, device_placement=device_placement) @patch_within_function(("accelerate.accelerator.AcceleratedScheduler", NeuronAcceleratedScheduler)) def prepare_scheduler(self, scheduler: "LRScheduler"): return super().prepare_scheduler(scheduler) + @staticmethod + def patch_model_for_neuron( + model: "torch.nn.Module", patching_specs: Optional[List[Tuple[str, Any]]] = None + ) -> "torch.nn.Module": + if patching_specs is None: + patching_specs = MODEL_PATCHING_SPECS + prepared_patching_specs = [] + for spec in patching_specs: + prepared_patching_specs.append((model,) + spec) + + model_patcher = ModelPatcher(prepared_patching_specs, ignore_missing_attributes=True) + model_patcher.patch() + return model + def prepare_model_for_xla_fsdp( self, model: torch.nn.Module, device_placement: Optional[bool] = None, evaluation_mode: bool = False ): @@ -342,49 +395,92 @@ def prepare_model_for_xla_fsdp( return model - def _prepare_model_for_tp( + @requires_neuronx_distributed + def _prepare_model_for_mp( self, model: torch.nn.Module, device_placement: Optional[bool] = None, evaluation_mode: bool = False ): + from neuronx_distributed.pipeline import NxDPPModel + if model in self._models or Parallelizer.was_parallelized(model): return model - cpu_ids = [id(v) for v in model.parameters()] + cpu_ids = {name: id(param) for name, param in model.named_parameters()} + tied_parameters_dict = get_tied_parameters_dict(model) + model_main_input_name = getattr(model, "main_input_name", None) # TODO: enable self.device (if needed). - model = self.state.tp_plugin.parallelize_model(model, device=None) + model = self.state.mp_plugin.parallelize_model(model, device=None) + + if model_main_input_name is not None: + setattr(model, "main_input_name", model_main_input_name) + + if isinstance(model, NxDPPModel): + model.local_module = self.patch_model_for_neuron( + model.local_module, patching_specs=NxDPPMODEL_PATCHING_SPECS + ) + model_to_cast = model.local_module + else: + model_to_cast = model + + model_to_cast = model.local_module if isinstance(model, NxDPPModel) else model if os.environ.get("XLA_USE_BF16", "0") == "1" or os.environ.get("XLA_DOWNCAST_BF16", "0") == "1": - model.to(torch.bfloat16) + model_to_cast.to(torch.bfloat16) else: - model.to(torch.float32) + model_to_cast.to(torch.float32) - def _tie_or_clone_weights_for_tp(self, output_embeddings, input_embeddings): + def _tie_or_clone_weights_for_mp(self, output_embeddings, input_embeddings): """Tie or clone module weights depending of whether we are using TorchScript or not""" output_embeddings.weight = input_embeddings.weight if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): output_embeddings.out_features = input_embeddings.num_embeddings - with ModelPatcher(patching_specs=[(model, "_tie_or_clone_weights", _tie_or_clone_weights_for_tp)]): - model.tie_weights() - move_model_to_device(model, self.device) - model.tie_weights() - self._model_cpu_parameters_to_xla[id(model)] = dict(zip(cpu_ids, model.parameters())) + if isinstance(model, NxDPPModel): + with ModelPatcher(patching_specs=[(model, "_tie_or_clone_weights", _tie_or_clone_weights_for_mp)]): + model.move_model_to_device() + tie_parameters(model, tied_parameters_dict) + xla_params = dict(model.local_named_parameters()) + self._model_cpu_parameters_to_xla[id(model)] = { + cpu_ids[name]: 
xla_params[name] for name, _ in model.local_named_parameters() + } + else: + with ModelPatcher(patching_specs=[(model, "_tie_or_clone_weights", _tie_or_clone_weights_for_mp)]): + move_model_to_device(model, self.device) + tie_parameters(model, tied_parameters_dict) + xla_params = dict(model.named_parameters()) + symmetric_diff = set(cpu_ids.keys()).symmetric_difference((xla_params.keys())) + if symmetric_diff: + raise ValueError( + f"The parameters on CPU do not match the parameters on the XLA device: {', '.join(symmetric_diff)}." + ) + + self._model_cpu_parameters_to_xla[id(model)] = { + cpu_ids[name]: xla_params[name] for name, _ in model.named_parameters() + } + device_placement = False return super().prepare_model(model, device_placement=device_placement, evaluation_mode=evaluation_mode) + @requires_torch_xla + @requires_neuronx_distributed def prepare_model( self, model: torch.nn.Module, device_placement: Optional[bool] = None, evaluation_mode: bool = False ): # If the model was already prepared, we skip. if model in self._models: return model + + model = self.patch_model_for_neuron(model) + if self.distributed_type is NeuronDistributedType.XLA_FSDP: return self.prepare_model_for_xla_fsdp( model, device_placement=device_placement, evaluation_mode=evaluation_mode ) - elif self.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: - return self._prepare_model_for_tp( + elif self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + return self._prepare_model_for_mp( model, device_placement=device_placement, evaluation_mode=evaluation_mode ) + move_model_to_device(model, xm.xla_device()) + device_placement = False return super().prepare_model(model, device_placement=device_placement, evaluation_mode=evaluation_mode) def backward_for_xla_fsdp(self, loss, **kwargs): @@ -410,11 +506,15 @@ def clip_grad_norm_for_xla_fsdp(self, parameters, max_norm, norm_type: int = 2): if parameters == list(model.parameters()): return model.clip_grad_norm_(max_norm, norm_type) + @requires_neuronx_distributed def _prepare_clip_grad_norm(self, parameters, max_norm, norm_type: int = 2): + from neuronx_distributed.pipeline import NxDPPModel + self.unscale_gradients() parameters = list(parameters) for model in self._models: - if parameters == list(model.parameters()): + model_parameters = model.local_parameters() if isinstance(model, NxDPPModel) else model.parameters() + if parameters == list(model_parameters) or self.zero_1: for opt in self._optimizers: # Under this setting, the gradient clipping will be deferred to the optimizer step. # It will happen after the gradients have been reduced and before the optimizer step. 
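The tied-weight bookkeeping used in `_prepare_model_for_mp` above, `get_tied_parameters_dict` before parallelization and `tie_parameters` after the move to the XLA device, comes down to recording duplicate parameter objects by name and re-pointing them afterwards. A simplified, CPU-only re-implementation of the detection step (assuming `torch>=2.0` for `named_parameters(remove_duplicate=False)`; the `NxDPPModel` branch of the real utilities is omitted):

```python
from typing import Dict

from torch import nn


def tied_parameters_dict(module: nn.Module) -> Dict[str, str]:
    """Map each duplicate parameter name to the first name it was registered under."""
    seen: Dict[nn.Parameter, str] = {}
    tied: Dict[str, str] = {}
    for name, param in module.named_parameters(remove_duplicate=False):
        if param in seen:
            tied[name] = seen[param]
        else:
            seen[param] = name
    return tied


class TinyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(10, 4)
        self.lm_head = nn.Linear(4, 10, bias=False)
        self.lm_head.weight = self.embed.weight  # weight tying


print(tied_parameters_dict(TinyLM()))  # {'lm_head.weight': 'embed.weight'}
```

`tie_parameters` then walks this mapping on the device copy and reassigns the surviving parameter onto the module that held the duplicate, so the tie survives both parallelization and the CPU-to-XLA move.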
@@ -423,7 +523,7 @@ def _prepare_clip_grad_norm(self, parameters, max_norm, norm_type: int = 2): def clip_grad_norm_(self, parameters, max_norm, norm_type=2): if self.distributed_type is NeuronDistributedType.XLA_FSDP: return self.clip_grad_norm_for_xla_fsdp(parameters, max_norm, norm_type=norm_type) - elif self.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM or self.zero_1: + elif self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM or self.zero_1: return self._prepare_clip_grad_norm(parameters, max_norm, norm_type=norm_type) return super().clip_grad_norm_(parameters, max_norm, norm_type=norm_type) @@ -434,7 +534,7 @@ def clip_grad_value_(self, parameters, clip_value): def _custom_save_state( self, - save_model_func: Callable[["Accelerator", "PreTrainedModel", Union[str, Path], int], Any], + save_model_func: Optional[Callable[["Accelerator", "PreTrainedModel", Union[str, Path], int], Any]], save_optimizer_func: Callable[ ["Accelerator", "torch.optim.Optimizer", "PreTrainedModel", Union[str, Path], int], Any ], @@ -475,18 +575,25 @@ def _inner(folder): xm.mark_step() # Save the models - weights = [] - for i, model in enumerate(self._models): - save_model_func(self, model, output_dir, i) + if save_model_func is not None: + for i, model in enumerate(self._models): + save_model_func(self, model, output_dir, i) # Save the optimizers - optimizers = [] - for i, opt in enumerate(self._optimizers): + if not self._optimizers and save_model_func is None: + optimizers = [None] * len(self._models) + else: + optimizers = self._optimizers + for i, opt in enumerate(optimizers): save_optimizer_func(self, opt, self._models[i], output_dir, i) # Save the lr schedulers taking care of DeepSpeed nuances schedulers = self._schedulers + # Setting those to be empty list so that `save_accelerator_state` does not redo the job. + weights = [] + optimizers = [] + # Call model loading hooks that might have been registered with # accelerator.register_model_state_hook for hook in self._save_model_state_pre_hook.values(): @@ -515,15 +622,15 @@ def save_optimizer_func(accelerator, optimizer, model, output_dir, i): save_model_func, save_optimizer_func, output_dir=output_dir, **save_model_func_kwargs ) - def save_state_for_tp(self, output_dir: Optional[str] = None, **save_model_func_kwargs): - def save_model_func(accelelerator, model, output_dir, i): - return + def save_state_for_mp(self, output_dir: Optional[str] = None, **save_model_func_kwargs): + # The model is saved at the same time as the optimizer. 
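A hypothetical call site for the renamed checkpoint path (the output directory is illustrative): once the accelerator runs under `NeuronDistributedType.MODEL_PARALLELISM`, `save_state` dispatches to `save_state_for_mp`, which leaves `save_model_func` as `None` and lets the parallelizer write the sharded model together with the optimizer.

```python
# Assumes `accelerator` is a NeuronAccelerator whose ModelParallelismPlugin has a
# tensor and/or pipeline parallel size > 1, so the sharded checkpoint is written by
# Parallelizer.save_model_checkpoint rather than by accelerate's default state saving.
accelerator.save_state("outputs/checkpoint-1000")
```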
+ save_model_func = None def save_optimizer_func(accelerator, optimizer, model, output_dir, i): - logger.info("Saving TP model and optimizer") + logger.info("Saving parallel model and optimizer") parallelizer = ParallelizersManager.parallelizer_for_model(model) parallelizer.save_model_checkpoint(model, output_dir, as_regular=False, optimizer=optimizer) - logger.info(f"TP model and optimizer saved to the directory {output_dir}") + logger.info(f"Parallel model and optimizer saved to the directory {output_dir}") return self._custom_save_state( save_model_func, save_optimizer_func, output_dir=output_dir, **save_model_func_kwargs @@ -533,8 +640,8 @@ def save_optimizer_func(accelerator, optimizer, model, output_dir, i): def save_state(self, output_dir: Optional[str] = None, **save_model_func_kwargs) -> str: if self.distributed_type is NeuronDistributedType.XLA_FSDP: return self.save_state_for_xla_fsdp(output_dir=output_dir, **save_model_func_kwargs) - elif self.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: - return self.save_state_for_tp(output_dir=output_dir, **save_model_func_kwargs) + elif self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + return self.save_state_for_mp(output_dir=output_dir, **save_model_func_kwargs) return super().save_state(output_dir=output_dir, **save_model_func_kwargs) def gather(self, tensor, out_of_graph: bool = False): diff --git a/optimum/neuron/accelerate/optimizer.py b/optimum/neuron/accelerate/optimizer.py index e55221a27..d62709179 100644 --- a/optimum/neuron/accelerate/optimizer.py +++ b/optimum/neuron/accelerate/optimizer.py @@ -14,18 +14,17 @@ # limitations under the License. """Custom AcceleratedOptimizer for Neuron.""" -from typing import TYPE_CHECKING, Optional +from typing import Optional +import torch from accelerate.optimizer import AcceleratedOptimizer from accelerate.utils import DistributedType -from ..utils import is_neuronx_distributed_available, is_torch_xla_available +from ..utils import is_torch_xla_available +from ..utils.require_utils import requires_neuronx_distributed from .utils.dataclasses import NeuronDistributedType -if TYPE_CHECKING: - import torch - if is_torch_xla_available(): import accelerate import torch_xla.core.xla_model as xm @@ -33,8 +32,29 @@ accelerate.optimizer.xm = xm -if is_neuronx_distributed_available(): - from neuronx_distributed import parallel_layers + +@requires_neuronx_distributed +def allreduce_sequence_parallel_gradients(optimizer): + """ + All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. + + Modified from megatron-lm: + https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 + """ + from neuronx_distributed.parallel_layers.mappings import reduce_from_tensor_model_parallel_region + + grads = [] + for param_group in optimizer.__getstate__()["param_groups"]: + for group, params in param_group.items(): + if group == "params": + for p in params: + if isinstance(p, torch.Tensor) and p.grad is not None: + sequence_parallel_param = getattr(p, "sequence_parallel_enabled", False) + if sequence_parallel_param: + grads.append(p.grad.data) + for grad in grads: + # sum v.s. 
average: sum + reduce_from_tensor_model_parallel_region(grad) class NeuronAcceleratedOptimizer(AcceleratedOptimizer): @@ -49,7 +69,7 @@ def __init__( self.parameters = [] self.parameter_ids = {} self.clip_grad_norm_to_perform = None - if self.accelerator_state.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + if self.accelerator_state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: self.parameters = [p for group in self.optimizer.param_groups for p in group["params"]] self.parameter_ids = {id(p) for p in self.parameters} @@ -59,11 +79,19 @@ def load_state_dict(self, state_dict): def prepare_clip_grad_norm(self, parameters, max_norm, norm_type=2): parameter_ids = {id(p) for p in parameters} - if parameter_ids == self.parameter_ids: + if parameter_ids == self.parameter_ids or isinstance(self.optimizer, ZeroRedundancyOptimizer): self.clip_grad_norm_to_perform = {"max_norm": max_norm, "norm_type": norm_type} + @requires_neuronx_distributed def step(self, closure=None): + from neuronx_distributed import parallel_layers + from neuronx_distributed.parallel_layers.grads import bucket_allreduce_gradients + if self.gradient_state.sync_gradients: + # For sequence-parallel, we have to explicitly all-reduce the layernorm gradients. + if self.accelerator_state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + allreduce_sequence_parallel_gradients(self.optimizer) + if isinstance(self.optimizer, ZeroRedundancyOptimizer): if self.clip_grad_norm_to_perform is not None: # `ZeroRedundancyOptimizer` does not allow to pass a norm type, it could be done but postponing for @@ -74,18 +102,21 @@ def step(self, closure=None): self.optimizer.grad_clipping = False optimizer_args = {"closure": closure} if closure is not None else {} self.optimizer.step(closure) + # Resetting everything. + self.optimizer.grad_clipping = False + self.clip_grad_norm_to_perform = None elif self.accelerator_state.distributed_type is DistributedType.TPU: optimizer_args = {"closure": closure} if closure is not None else {} # By default barrier=False, but making sure it's the case here since we use ParalleLoader. 
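The gradient collection in `allreduce_sequence_parallel_gradients` hinges on parameters carrying a `sequence_parallel_enabled` attribute; only their gradients are reduced across the tensor-model-parallel group. A toy illustration of the selection step alone, with the collective call left out (plain PyTorch, no Neuron dependencies; in the real flow the attribute is expected to be set by the sequence-parallel layers rather than by hand):

```python
import torch
from torch import nn

layer_norm = nn.LayerNorm(8)
linear = nn.Linear(8, 8)

# Mark the layer-norm parameters the way sequence-parallel layers are expected to.
for p in layer_norm.parameters():
    p.sequence_parallel_enabled = True

optimizer = torch.optim.SGD(list(layer_norm.parameters()) + list(linear.parameters()), lr=0.1)
linear(layer_norm(torch.randn(2, 8))).sum().backward()

flagged_grads = [
    p.grad
    for group in optimizer.param_groups
    for p in group["params"]
    if getattr(p, "sequence_parallel_enabled", False) and p.grad is not None
]
print(len(flagged_grads))  # 2: the LayerNorm weight and bias gradients would be all-reduced
```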
xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args, barrier=False) elif self.accelerator_state.distributed_type is NeuronDistributedType.XLA_FSDP: self.optimizer.step(closure) - elif self.accelerator_state.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: - xm.reduce_gradients( - self.optimizer, groups=parallel_layers.parallel_state.get_data_parallel_group(as_list=True) - ) + elif self.accelerator_state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + if parallel_layers.parallel_state.get_data_parallel_size() > 1: + bucket_allreduce_gradients(xm._fetch_gradients(self.optimizer)) if self.clip_grad_norm_to_perform is not None: parallel_layers.clip_grad_norm(self.parameters, **self.clip_grad_norm_to_perform) + self.clip_grad_norm_to_perform = None self.optimizer.step() elif self.scaler is not None: scale_before = self.scaler.get_scale() diff --git a/optimum/neuron/accelerate/state.py b/optimum/neuron/accelerate/state.py index 1ca852685..1b1fe8c6e 100644 --- a/optimum/neuron/accelerate/state.py +++ b/optimum/neuron/accelerate/state.py @@ -36,6 +36,7 @@ from ...utils import logging from ..utils import is_neuronx_distributed_available, is_torch_xla_available from .utils import NeuronDistributedType, NeuronFullyShardedDataParallelPlugin +from .utils.dataclasses import ModelParallelismPlugin if is_torch_xla_available(): @@ -189,7 +190,7 @@ def __init__(self, cpu: bool = False, **kwargs): self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0) def wait_for_everyone(self): - if self.distributed_type in [NeuronDistributedType.XLA_FSDP, NeuronDistributedType.TENSOR_PARALLELISM]: + if self.distributed_type in [NeuronDistributedType.XLA_FSDP, NeuronDistributedType.MODEL_PARALLELISM]: xm.rendezvous("accelerate.utils.wait_for_everyone") else: super().wait_for_everyone() @@ -223,7 +224,7 @@ def __init__( deepspeed_plugin=None, fsdp_plugin=None, megatron_lm_plugin=None, - tp_plugin=None, + mp_plugin=None, _from_accelerator: bool = False, **kwargs, ): @@ -262,29 +263,36 @@ def __init__( os.environ["XLA_USE_BF16"] = str(1) os.environ["XLA_DOWNCAST_BF16"] = str(0) self.downcast_bfloat = False - if os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_TP", "false") == "true": + if ( + os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_TP", "false") == "true" + or os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_PP", "false") == "true" + ): if not is_neuronx_distributed_available(): raise RuntimeError( - "Tensor parallelism requires the neuronx_distributed package. You can install it by " + "Model parallelism requires the neuronx_distributed package. You can install it by " "running: python -m pip install neuronx_distributed --extra-index-url " "https://pip.repos.neuron.amazonaws.com" ) - if tp_plugin is None: + if mp_plugin is None: raise ValueError( - "Could not initialize `neuronx_distributed` tensor parallelism because no " - "TensorParallelismPlugin was provided." - ) - if tp_plugin.should_parallelize: - parallel_state.initialize_model_parallel( - tensor_model_parallel_size=tp_plugin.tensor_parallel_size + "Could not initialize `neuronx_distributed` model parallelism because no " + "`ModelParallelismPlugin` was provided." 
) - self.distributed_type = NeuronDistributedType.TENSOR_PARALLELISM + if mp_plugin.should_parallelize: + if not parallel_state.model_parallel_is_initialized(): + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=mp_plugin.tensor_parallel_size, + pipeline_model_parallel_size=mp_plugin.pipeline_parallel_size, + ) + self.distributed_type = NeuronDistributedType.MODEL_PARALLELISM else: logger.warning( - "Tensor parallelism is requested but nothing is done because the tensor parallel size is " - "set to 1." + "Model parallelism is requested but nothing is done because the tensor parallel size and " + "the pipeline parallel size are set to 1." ) - self.tp_plugin = tp_plugin + self.mp_plugin = mp_plugin + else: + self.mp_plugin = ModelParallelismPlugin() if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true": self.distributed_type = NeuronDistributedType.XLA_FSDP if self._mixed_precision != "no": diff --git a/optimum/neuron/accelerate/utils/__init__.py b/optimum/neuron/accelerate/utils/__init__.py index 129f75c1c..211d33cf0 100644 --- a/optimum/neuron/accelerate/utils/__init__.py +++ b/optimum/neuron/accelerate/utils/__init__.py @@ -13,5 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .dataclasses import NeuronDistributedType, NeuronFullyShardedDataParallelPlugin, TensorParallelismPlugin -from .misc import patch_accelerate_is_tpu_available +from .dataclasses import ModelParallelismPlugin, NeuronDistributedType, NeuronFullyShardedDataParallelPlugin +from .misc import get_tied_parameters_dict, patch_accelerate_is_tpu_available, tie_parameters diff --git a/optimum/neuron/accelerate/utils/dataclasses.py b/optimum/neuron/accelerate/utils/dataclasses.py index d5ade238a..f4d0dc0dd 100644 --- a/optimum/neuron/accelerate/utils/dataclasses.py +++ b/optimum/neuron/accelerate/utils/dataclasses.py @@ -46,7 +46,7 @@ class NeuronDistributedType(str, enum.Enum): """ XLA_FSDP = "XLA_FSDP" - TENSOR_PARALLELISM = "TENSOR_PARALLELISM" + MODEL_PARALLELISM = "MODEL_PARALLELISM" @dataclass @@ -140,21 +140,28 @@ def load_optimizer(self, accelerator, optimizer, model, input_dir, optimizer_ind @dataclass -class TensorParallelismPlugin: +class ModelParallelismPlugin: tensor_parallel_size: int = 1 parallelize_embeddings: bool = True sequence_parallel_enabled: bool = False + pipeline_parallel_size: int = 1 + pipeline_parallel_num_microbatches: int = 1 + pipeline_parallel_use_zero1_optimizer: bool = False checkpoint_dir: Optional[Union[str, Path]] = None def __post_init__(self): if self.tensor_parallel_size < 1: raise ValueError(f"The tensor parallel size must be >= 1, but {self.tensor_parallel_size} was given here.") + if self.pipeline_parallel_size < 1: + raise ValueError( + f"The pipeline parallel size must be >= 1, but {self.pipeline_parallel_size} was given here." 
+ ) if isinstance(self.checkpoint_dir, str): self.checkpoint_dir = Path(self.checkpoint_dir) @property def should_parallelize(self): - return self.tensor_parallel_size > 1 + return self.tensor_parallel_size > 1 or self.pipeline_parallel_size > 1 def parallelize_model( self, @@ -167,6 +174,8 @@ def parallelize_model( device=device, parallelize_embeddings=self.parallelize_embeddings, sequence_parallel_enabled=self.sequence_parallel_enabled, + pipeline_parallel_num_microbatches=self.pipeline_parallel_num_microbatches, + pipeline_parallel_use_zero1_optimizer=self.pipeline_parallel_use_zero1_optimizer, checkpoint_dir=self.checkpoint_dir, ) return parallelized_model diff --git a/optimum/neuron/accelerate/utils/misc.py b/optimum/neuron/accelerate/utils/misc.py index cbea3183c..773649474 100644 --- a/optimum/neuron/accelerate/utils/misc.py +++ b/optimum/neuron/accelerate/utils/misc.py @@ -14,7 +14,18 @@ # limitations under the License. """Utilities of various sorts related to accelerate with Neuron.""" -from ...utils import is_torch_xla_available, patch_everywhere +from typing import TYPE_CHECKING, Dict, Union + +import torch + +from ...distributed.utils import named_parameters +from ...utils import is_torch_neuronx_available, is_torch_xla_available, patch_everywhere +from ...utils.require_utils import requires_neuronx_distributed + + +if TYPE_CHECKING: + if is_torch_neuronx_available(): + from neuronx_distributed.pipeline import NxDPPModel def is_tpu_available(check_device=True): @@ -26,3 +37,48 @@ def is_tpu_available(check_device=True): def patch_accelerate_is_tpu_available(): patch_everywhere("is_tpu_available", is_tpu_available, module_name_prefix="accelerate") + + +@requires_neuronx_distributed +def get_tied_parameters_dict(model: Union["torch.nn.Module", "NxDPPModel"]) -> Dict[str, str]: + from neuronx_distributed.pipeline import NxDPPModel + + unique_parameters = {} + tied_parameters = {} + if isinstance(model, NxDPPModel): + module = model.local_module + else: + module = model + for name, param in named_parameters(module, remove_duplicate=False): + if param in unique_parameters: + tied_parameter_name = unique_parameters[param] + tied_parameters[name] = tied_parameter_name + else: + unique_parameters[param] = name + return tied_parameters + + +@requires_neuronx_distributed +def tie_parameters(model: Union["torch.nn.Module", "NxDPPModel"], tied_parameters_dict: Dict[str, str]): + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model, NxDPPModel): + module = model.local_module + else: + module = model + + for param_to_tie_name, param_name in tied_parameters_dict.items(): + param_to_tie_name = param_to_tie_name.rsplit(".", maxsplit=1) + + param_to_tie_parent_module = ( + module if len(param_to_tie_name) == 1 else module.get_submodule(param_to_tie_name[0]) + ) + param_to_tie = getattr(param_to_tie_parent_module, param_to_tie_name[1]) + + param_name = param_name.rsplit(".", maxsplit=1) + parent_module = module if len(param_name) == 1 else module.get_submodule(param_name[0]) + param = getattr(parent_module, param_name[1]) + + if param_to_tie is not param: + del param_to_tie + setattr(param_to_tie_parent_module, param_to_tie_name[1], param) diff --git a/optimum/neuron/distributed/base.py b/optimum/neuron/distributed/base.py index 94d355558..8f9d65343 100644 --- a/optimum/neuron/distributed/base.py +++ b/optimum/neuron/distributed/base.py @@ -21,15 +21,16 @@ from dataclasses import asdict from pathlib import Path from tempfile import TemporaryDirectory -from typing import 
TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, Tuple, Type, Union import torch +from transformers import PreTrainedModel from transformers.utils import WEIGHTS_NAME from ...utils import logging from ..utils import is_neuronx_distributed_available, is_torch_xla_available -from ..utils.deprecate_utils import deprecate -from ..utils.require_utils import requires_neuronx_distributed +from ..utils.patching import Patcher +from ..utils.require_utils import requires_neuronx_distributed, requires_torch_xla from .parallel_layers import ( IOSequenceParallelizer, LayerNormSequenceParallelizer, @@ -40,16 +41,20 @@ TENSOR_PARALLEL_SHARDS_DIR_NAME, ParameterMetadata, WeightInformation, - initialize_linear, initialize_parallel_linear, + initialize_torch_nn_module, + linear_to_parallel_linear, load_tensor_for_weight, + named_parameters, + parameter_can_be_initialized, try_to_hf_initialize, + was_already_initialized_during_parallelization, ) if TYPE_CHECKING: - from transformers import PreTrainedModel - + if is_neuronx_distributed_available(): + from neuronx_distributed.pipeline import NxDPPModel logger = logging.get_logger() @@ -67,31 +72,64 @@ def __exit__(self, *exc): self.tmpdir.cleanup() -@deprecate( - "2.0.0", - package_name="torch", - reason="torch.nn.Module._named_members takes a `remove_duplicate` parameter starting from 2.0.0", -) -def _named_members(module, get_members_fn, prefix="", recurse=True, remove_duplicate: bool = True): - r"""Helper method for yielding various names + members of modules.""" - memo = set() - modules = module.named_modules(prefix=prefix, remove_duplicate=remove_duplicate) if recurse else [(prefix, module)] - for module_prefix, mod in modules: - members = get_members_fn(mod) - for k, v in members: - if v is None or v in memo: - continue - if remove_duplicate: - memo.add(v) - name = module_prefix + ("." if module_prefix else "") + k - yield name, v +class SequenceParallelismSpecs: + SEQUENCE_PARALLEL_LAYERNORM_PATTERNS: Optional[List[str]] = None + LAYERNORM_TYPE: LayerNormType = LayerNormType.REGULAR + SEQUENCE_COLLECTIVE_OPS_INFOS: Optional[List[SequenceCollectiveOpInfo]] = None + @abstractclassmethod + def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_parallel_enabled: bool): + """ + This method needs to be overriden. It must patch anything model-specfic to make the model compatible with + sequence parallelism. + """ + if sequence_parallel_enabled: + raise NotImplementedError( + f"No patching for the attention mechanism for sequence parallelism was implemented for {model.__class__}" + ) -def named_parameters(module: "torch.nn.Module", prefix: str = "", recurse: bool = True, remove_duplicate: bool = True): - gen = _named_members( - module, lambda mod: mod._parameters.items(), prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate - ) - yield from gen + +class PipelineParallelismSpecs: + TRASNFORMER_LAYER_CLS: Type["torch.nn.Module"] + DEFAULT_INPUT_NAMES: Tuple[str, ...] + LEAF_MODULE_CLASSES_NAMES: Optional[List[Union[str, Type["torch.nn.Module"]]]] = None + OUTPUT_LOSS_SPECS: Tuple[bool, ...] = (True, False) + + @classmethod + @requires_torch_xla + def create_pipeline_cuts(cls, model: PreTrainedModel, pipeline_parallel_size: int) -> List[str]: + """ + Creates the pipeline cuts, e.g. the name of the layers at each the cuts happen for pipeline parallelism. 
+ """ + import torch_xla.core.xla_model as xm + + num_layers = sum(1 if isinstance(mod, cls.TRASNFORMER_LAYER_CLS) else 0 for mod in model.modules()) + if num_layers % pipeline_parallel_size != 0: + raise ValueError( + f"The number of transformer layers ({num_layers}) is not divisible by the pipeline parallel size " + f"({pipeline_parallel_size})." + ) + num_layers_per_partition = num_layers // pipeline_parallel_size + layers_names = [name for (name, mod) in model.named_modules() if isinstance(mod, cls.TRASNFORMER_LAYER_CLS)] + pipeline_cuts = [ + layers_names[cut_idx] + for cut_idx in range(num_layers_per_partition - 1, num_layers - 1, num_layers_per_partition) + ] + + if xm.get_local_ordinal() == 0: + logger.info(f"Pipeline parallelism cuts: {pipeline_cuts}.") + + return pipeline_cuts + + @classmethod + def leaf_module_cls(cls) -> List[str]: + if cls.LEAF_MODULE_CLASSES_NAMES is None: + return [] + return [class_ if isinstance(class_, str) else class_.__name__ for class_ in cls.LEAF_MODULE_CLASSES_NAMES] + + @classmethod + def get_patching_specs(cls) -> List[Tuple[str, Any]]: + return [] class Parallelizer(ABC): @@ -99,9 +137,8 @@ class Parallelizer(ABC): Base abstract class that handles model parallelism. """ - SEQUENCE_PARALLEL_LAYERNORM_PATTERNS: Optional[List[str]] = None - LAYERNORM_TYPE: LayerNormType = LayerNormType.REGULAR - SEQUENCE_COLLECTIVE_OPS_INFOS: Optional[List[SequenceCollectiveOpInfo]] = None + SEQUENCE_PARALLELSIM_SPECS_CLS: Optional[Type[SequenceParallelismSpecs]] = None + PIPELINE_PARALLELISM_SPECS_CLS: Optional[Type[PipelineParallelismSpecs]] = None def __init__(self): self._validate_required_libaries_are_available() @@ -128,6 +165,76 @@ def saved_model_in_temporary_directory(cls, model: "PreTrainedModel"): finally: tmpdir.cleanup() + @classmethod + def supports_sequence_parallelism(cls) -> bool: + return cls.SEQUENCE_PARALLELSIM_SPECS_CLS is not None + + @classmethod + def supports_pipeline_parallelism(cls) -> bool: + return cls.PIPELINE_PARALLELISM_SPECS_CLS is not None + + @classmethod + @requires_neuronx_distributed + def _get_parameter_names_for_current_pipeline( + cls, model: "torch.nn.Module", remove_duplicate: bool = True + ) -> Set[str]: + """ + Retrieves the names of the parameters that will be in the current pipeline stage by using the pipeline + parallelism rank. + """ + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_size, + ) + + pp_size = get_pipeline_model_parallel_size() + pp_rank = get_pipeline_model_parallel_rank() + all_parameter_names = {n for n, _ in named_parameters(model, remove_duplicate=remove_duplicate)} + if pp_size == 1: + return all_parameter_names + + if not cls.supports_pipeline_parallelism(): + raise NotImplementedError(f"{cls} does not support pipeline parallelism.") + + cuts = cls.PIPELINE_PARALLELISM_SPECS_CLS.create_pipeline_cuts(model, pp_size) + + start_module_name = cuts[pp_rank - 1] if pp_rank >= 1 else None + end_module_name = None if pp_rank == pp_size - 1 else cuts[pp_rank] + parameter2name = {p: n for n, p in named_parameters(model, remove_duplicate=remove_duplicate)} + parameter_names = set() + should_add = False + for name, mod in model.named_modules(): + if not isinstance(mod, cls.PIPELINE_PARALLELISM_SPECS_CLS.TRASNFORMER_LAYER_CLS): + continue + # If start_module_name is None, it means we are on the first rank, we should add right from the beginning. 
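`create_pipeline_cuts` above picks, for each pipeline stage except the last, the name of the last transformer layer assigned to that stage. The arithmetic is easy to check on a toy model; the sketch below reproduces it without the `torch_xla` logging (the helper and module names are illustrative):

```python
from typing import List, Type

from torch import nn


def toy_pipeline_cuts(model: nn.Module, layer_cls: Type[nn.Module], pp_size: int) -> List[str]:
    """Mirror of the cut computation in PipelineParallelismSpecs.create_pipeline_cuts."""
    names = [name for name, mod in model.named_modules() if isinstance(mod, layer_cls)]
    num_layers = len(names)
    if num_layers % pp_size != 0:
        raise ValueError(f"{num_layers} layers are not divisible by a pipeline parallel size of {pp_size}.")
    per_partition = num_layers // pp_size
    return [names[idx] for idx in range(per_partition - 1, num_layers - 1, per_partition)]


class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)


model = nn.Sequential(*(Block() for _ in range(12)))
print(toy_pipeline_cuts(model, Block, pp_size=4))  # ['2', '5', '8']: last Block of stages 0, 1 and 2
```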
+ if start_module_name is None: + should_add = True + if should_add: + for _, param in named_parameters(mod, remove_duplicate=remove_duplicate): + # It is important to use this dictionary (built with `model.named_parameters()`) instead of using + # `mod.named_parameters()` to get the fully qualified names. + param_name = parameter2name[param] + parameter_names.add(param_name) + + # We consider the parameters inside ]start_module_name, end_module_name]. + if start_module_name == name: + should_add = True + if name == end_module_name: + break + + parameters_inside_transformer_layers = { + p + for mod in model.modules() + if isinstance(mod, cls.PIPELINE_PARALLELISM_SPECS_CLS.TRASNFORMER_LAYER_CLS) + for _, p in named_parameters(mod, remove_duplicate=remove_duplicate) + } + parameter_outside_of_transformer_layers_names = { + name + for name, param in named_parameters(model, remove_duplicate=remove_duplicate) + if param not in parameters_inside_transformer_layers + } + return parameter_names | parameter_outside_of_transformer_layers_names + @abstractclassmethod def _parallelize( cls, @@ -154,17 +261,6 @@ def _parallelize( `PreTrainedModel`: The parallelized model. """ - @classmethod - def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_parallel_enabled: bool): - """ - This method needs to be overriden. It must patch anything model-specfic to make the model compatible with - sequence parallelism. - """ - if sequence_parallel_enabled: - raise NotImplementedError( - f"No patching for the attention mechanism for sequence parallelism was implemented for {model.__class__}" - ) - @classmethod @requires_neuronx_distributed def parallelize( @@ -173,6 +269,9 @@ def parallelize( device: Optional["torch.device"] = None, parallelize_embeddings: bool = True, sequence_parallel_enabled: bool = False, + pipeline_parallel_input_names: Optional[Union[Tuple[str, ...], List[str]]] = None, + pipeline_parallel_num_microbatches: int = 1, + pipeline_parallel_use_zero1_optimizer: bool = False, checkpoint_dir: Optional[Union[str, Path]] = None, ) -> "PreTrainedModel": """ @@ -192,6 +291,11 @@ def parallelize( This can be disabled in the case when the TP size does not divide the vocabulary size. sequence_parallel_enabled (`bool`, defaults to `False`): Whether or not sequence parallelism is enabled. + pipeline_parallel_num_microbatches (`int`, defaults to 1): + The number of microbatches used for pipeline execution. + pipeline_parallel_use_zero1_optimizer (`bool`, defaults to `False`): + When zero-1 optimizer is used, set this to True, so the PP model will understand that zero-1 optimizer + will handle data parallel gradient averaging. checkpoint_dir (`Optional[Union[str, Path]]`): Path to a sharded checkpoint. If specified, the checkpoint weights will be loaded to the parallelized model. 
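For reference, a minimal usage sketch of the `parallelize` entry point with the new pipeline-parallel arguments (illustrative only, not part of this patch; it assumes the `neuronx-distributed` parallel state has already been initialized and uses a hypothetical Llama checkpoint):

```python
# Hedged sketch: assumes a Trainium environment where torch_xla and
# neuronx-distributed are installed and the TP/PP process groups were initialized.
from transformers import AutoModelForCausalLM

from optimum.neuron.distributed import ParallelizersManager

# Hypothetical checkpoint name, used here only for illustration.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Retrieve the Parallelizer associated with this model type.
parallelizer = ParallelizersManager.parallelizer_for_model(model)
model = parallelizer.parallelize(
    model,
    parallelize_embeddings=True,
    sequence_parallel_enabled=True,
    pipeline_parallel_num_microbatches=4,         # new argument documented above
    pipeline_parallel_use_zero1_optimizer=False,  # new argument documented above
)
```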
@@ -201,45 +305,61 @@ def parallelize( """ from neuronx_distributed import parallel_layers - if sequence_parallel_enabled and cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS is None: + if sequence_parallel_enabled and not cls.supports_sequence_parallelism(): raise NotImplementedError(f"Sequence parallelism is not supported for {model.__class__}.") - from neuronx_distributed.parallel_layers.parallel_state import get_tensor_model_parallel_rank + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_size, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_size, + ) + from neuronx_distributed.pipeline import NxDPPModel + + tp_size = get_tensor_model_parallel_size() + + sequence_parallel_enabled = sequence_parallel_enabled and tp_size > 1 # Parallelizing the model. # This needs to be done prior to preparing the model for sequence parallelism because modules can be overriden. - model = cls._parallelize( - model, - device=device, - parallelize_embeddings=parallelize_embeddings, - sequence_parallel_enabled=sequence_parallel_enabled, - ) + if tp_size > 1: + model = cls._parallelize( + model, + device=device, + parallelize_embeddings=parallelize_embeddings, + sequence_parallel_enabled=sequence_parallel_enabled, + ) # Preparing the model for sequence parallelism: - # 1. Transforming the LayerNorms. - layer_norm_qualified_name_patterns = ( - cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS if cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS is not None else [] - ) - layer_norm_sequence_parallelizer = LayerNormSequenceParallelizer( - sequence_parallel_enabled, layer_norm_qualified_name_patterns - ) - layer_norm_sequence_parallelizer.sequence_parallelize(model, cls.LAYERNORM_TYPE) - - # 2. Taking care of scattering / gathering on the sequence axis in the model via the IOSequenceParallelizer. - io_sequence_parallelizer = IOSequenceParallelizer( - sequence_parallel_enabled, - sequence_collective_op_infos=cls.SEQUENCE_COLLECTIVE_OPS_INFOS, - ) - io_sequence_parallelizer.sequence_parallelize(model) + sp_specs_cls = cls.SEQUENCE_PARALLELSIM_SPECS_CLS - # 3. Applying model specific patching for sequence parallelism. if sequence_parallel_enabled: - cls.patch_for_sequence_parallelism(model, sequence_parallel_enabled) + # 1. Transforming the LayerNorms. + layer_norm_qualified_name_patterns = ( + sp_specs_cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS + if sp_specs_cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS is not None + else [] + ) + layer_norm_sequence_parallelizer = LayerNormSequenceParallelizer( + sequence_parallel_enabled, layer_norm_qualified_name_patterns + ) + layer_norm_sequence_parallelizer.sequence_parallelize(model, sp_specs_cls.LAYERNORM_TYPE) + + # 2. Taking care of scattering / gathering on the sequence axis in the model via the IOSequenceParallelizer. + io_sequence_parallelizer = IOSequenceParallelizer( + sequence_parallel_enabled, + sequence_collective_op_infos=sp_specs_cls.SEQUENCE_COLLECTIVE_OPS_INFOS, + ) + io_sequence_parallelizer.sequence_parallelize(model) + + # 3. Applying model specific patching for sequence parallelism. + sp_specs_cls.patch_for_sequence_parallelism(model, sequence_parallel_enabled) - weight_map = getattr(model, "_weight_map", None) # The model was not loaded lazily, it is already ready. 
- if weight_map is None: - return model + weight_map = getattr(model, "_weight_map", {}) + + names_of_the_parameters_to_consider = cls._get_parameter_names_for_current_pipeline( + model, remove_duplicate=True + ) with torch.no_grad(): tied_weights = {} @@ -249,7 +369,10 @@ def parallelize( split = name.rsplit(".", maxsplit=1) module = model.get_submodule(split[0]) attribute_name = split[1] - current_weight = getattr(module, attribute_name) + + # Skipping the parameters that will not end-up in this pipeline rank. + if name not in names_of_the_parameters_to_consider: + continue try: weight_info = WeightInformation(weight_map[name], name, weight_map=weight_map, device=device) @@ -265,14 +388,14 @@ def parallelize( # It can be the case when weights are tied. For example between the embeddings and the LM head. new_parameter = tied_weights[parameter] elif weight_info is not None: - if getattr(current_weight, "tensor_model_parallel", False): + if getattr(parameter, "tensor_model_parallel", False): if parameter.device == torch.device("meta"): # This must either be a torch.nn.Embedding or a torch.nn.Linear that was not handled during # parallelization since those are the only classes that we initialize on the `meta` device. - num_dims = current_weight.dim() - partition_dim = getattr(current_weight, "partition_dim") + num_dims = parameter.dim() + partition_dim = getattr(parameter, "partition_dim") tp_rank = get_tensor_model_parallel_rank() - size_per_rank = current_weight.size(partition_dim) + size_per_rank = parameter.size(partition_dim) slices = [ None if idx != partition_dim @@ -291,10 +414,17 @@ def parallelize( new_parameter = torch.nn.Parameter( load_tensor_for_weight(weight_info, tensor_slices=slices).to(parameter.dtype) ) + elif parameter.device != torch.device("meta") and ( + was_already_initialized_during_parallelization(parameter) + or not parameter_can_be_initialized(model, module, attribute_name) + ): + tied_weights[parameter] = parameter + new_parameters.add(parameter) + continue else: # This means that there is no information about where to find the weights for this parameter. device = torch.device("cpu") if device is None else device - new_parameter = torch.nn.Parameter(torch.empty_like(current_weight, device=device)) + new_parameter = torch.nn.Parameter(torch.empty_like(parameter, device=device)) modules_to_initialize[module].append(attribute_name) setattr( @@ -317,23 +447,63 @@ def parallelize( # `reset_parameters()` method but we need to be careful because one of the parameters might not # need initialization. left_uninitialized = try_to_hf_initialize(model, mod, parameter_names) - if not left_uninitialized: - continue - initialize_linear(mod, left_uninitialized) - + if left_uninitialized: + initialize_torch_nn_module(mod, left_uninitialized) elif isinstance(mod, parallel_layers.layers.BaseParallelLinear): # First, we try to initialize the layer similarly as it would be done with the model. - # To do that it is necessary to change the model class to that the `model._init_weights` method - # considers this module as a `torch.nn.Linear` instance. - orig_class = mod.__class__ - mod.__class__ = torch.nn.Linear - left_uninitialized = try_to_hf_initialize(model, mod, parameter_names) - mod.__class__ = orig_class - if not left_uninitialized: - continue - initialize_parallel_linear(mod, left_uninitialized) + # To do that we initialize a `torch.nn.Linear` with the full shape, and then scatter the weights. 
+ input_is_parallel = gather_output = False + if isinstance(mod, parallel_layers.layers.RowParallelLinear): + axis = "row" + input_is_parallel = mod.input_is_parallel + else: + axis = "column" + gather_output = mod.gather_output + fake_linear_mod = torch.nn.Linear(mod.input_size, mod.output_size) + left_uninitialized = try_to_hf_initialize(model, fake_linear_mod, parameter_names) + if left_uninitialized: + initialize_parallel_linear(mod, left_uninitialized) + else: + fake_parallel_linear_mod = linear_to_parallel_linear( + fake_linear_mod, + axis, + input_is_parallel=input_is_parallel, + gather_output=gather_output, + sequence_parallel_enabled=mod.sequence_parallel_enabled, + ) + mod.weight.data = fake_parallel_linear_mod.weight.data.clone() + if mod.bias is not None: + mod.bias.data = fake_parallel_linear_mod.bias.data.clone() + del fake_linear_mod + del fake_parallel_linear_mod else: - raise ValueError(f"Do not know how to initialize a module of type {mod.__class__}") + left_uninitialized = try_to_hf_initialize(model, mod, parameter_names) + if left_uninitialized and hasattr(mod, "reset_parameters"): + initialize_torch_nn_module(mod, parameter_names) + + pp_size = get_pipeline_model_parallel_size() + if pp_size > 1: + if not cls.supports_pipeline_parallelism(): + raise NotImplementedError(f"{cls} does not support pipeline parallelism.") + + model.config.return_dict = False + model.config.use_cache = False + model.config.output_attentions = False + model.config.output_hidden_states = False + + with Patcher(cls.PIPELINE_PARALLELISM_SPECS_CLS.get_patching_specs()): + if pipeline_parallel_input_names is None: + pipeline_parallel_input_names = cls.PIPELINE_PARALLELISM_SPECS_CLS.DEFAULT_INPUT_NAMES + model = NxDPPModel( + model, + transformer_layer_cls=cls.PIPELINE_PARALLELISM_SPECS_CLS.TRASNFORMER_LAYER_CLS, + num_microbatches=pipeline_parallel_num_microbatches, + output_loss_value_spec=cls.PIPELINE_PARALLELISM_SPECS_CLS.OUTPUT_LOSS_SPECS, + input_names=pipeline_parallel_input_names, + pipeline_cuts=cls.PIPELINE_PARALLELISM_SPECS_CLS.create_pipeline_cuts(model, pp_size), + leaf_module_cls=cls.PIPELINE_PARALLELISM_SPECS_CLS.leaf_module_cls(), + use_zero1_optimizer=pipeline_parallel_use_zero1_optimizer, + ) if checkpoint_dir is not None: cls.load_model_checkpoint(model, checkpoint_dir) @@ -348,13 +518,21 @@ def deparallelize(cls, model: "PreTrainedModel") -> "PreTrainedModel": @requires_neuronx_distributed def was_parallelized(cls, model: "PreTrainedModel") -> bool: import neuronx_distributed + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_size, + get_tensor_model_parallel_size, + ) + from neuronx_distributed.pipeline import NxDPPModel + needs_parallelization_for_pp = get_pipeline_model_parallel_size() > 1 and not isinstance(model, NxDPPModel) parallel_layer_classes = ( neuronx_distributed.parallel_layers.ParallelEmbedding, neuronx_distributed.parallel_layers.ColumnParallelLinear, neuronx_distributed.parallel_layers.RowParallelLinear, ) - return any(isinstance(mod, parallel_layer_classes) for mod in model.modules()) + layers_are_parallel = any(isinstance(mod, parallel_layer_classes) for mod in model.modules()) + needs_parallelization_for_tp = get_tensor_model_parallel_size() > 1 and not layers_are_parallel + return (not needs_parallelization_for_pp) and (not needs_parallelization_for_tp) @classmethod def _check_model_was_parallelized(cls, model: "PreTrainedModel"): @@ -362,35 +540,64 @@ def _check_model_was_parallelized(cls, model:
"PreTrainedModel"): raise ValueError("The model needs to be parallelized first.") @classmethod + @requires_torch_xla def optimizer_cpu_params_to_xla_params( cls, optimizer: "torch.optim.Optimizer", orig_param_to_parallel_param_on_xla: Mapping[int, "torch.nn.Parameter"], ) -> Tuple[List[Dict[str, Any]], bool]: + import torch_xla.core.xla_model as xm + parameters_on_xla = [] need_to_create_new_optimizer = False if hasattr(optimizer, "_args_to_recreate"): args, _ = optimizer._args_to_recreate - parameters = args[0] - for param in parameters: - if isinstance(param, dict): - new_param = {k: v for k, v in param.items() if k != "params"} - params = [] - for p in param["params"]: - params.append(orig_param_to_parallel_param_on_xla[id(p)]) - new_param["params"] = params - else: - new_param = [] - for p in param: - new_param.append(orig_param_to_parallel_param_on_xla[id(p)]) + + # parameter_groups can either be an iterable of dictionaries (groups), or of parameters, in which case + # there is only one group. + parameter_groups = args[0] + parameter_groups = list(parameter_groups) + # parameter_groups cannot be empty + if isinstance(parameter_groups[0], dict): + for group in parameter_groups: + new_group = {k: v for k, v in group.items() if k != "params"} + params_on_xla = [] + for p in group["params"]: + if p.device == xm.xla_device(): + params_on_xla.append(p) + elif id(p) not in orig_param_to_parallel_param_on_xla: + # This can be the case with pipeline parallelism. + continue + else: + params_on_xla.append(orig_param_to_parallel_param_on_xla[id(p)]) + new_group["params"] = params_on_xla + parameters_on_xla.append(new_group) + else: + new_param = {} + params_on_xla = [] + for param in parameter_groups: + if param.device == xm.xla_device(): + params_on_xla.append(param) + elif id(param) not in orig_param_to_parallel_param_on_xla: + # This can be the case with pipeline parallelism. 
+ continue + else: + params_on_xla.append(orig_param_to_parallel_param_on_xla[id(param)]) + new_param["params"] = params_on_xla parameters_on_xla.append(new_param) else: for param_group in optimizer.param_groups: new_params = [] params = param_group["params"] for idx in range(len(params)): - param_on_xla = orig_param_to_parallel_param_on_xla[id(params[idx])] - if params[idx] != param_on_xla: + if params[idx].device == xm.xla_device(): + param_on_xla = params[idx] + elif id(params[idx]) not in orig_param_to_parallel_param_on_xla: + need_to_create_new_optimizer = True + continue + else: + param_on_xla = orig_param_to_parallel_param_on_xla[id(params[idx])] + if params[idx] is not param_on_xla: need_to_create_new_optimizer = True new_params.append(param_on_xla) new_group = {k: v for k, v in param_group.items() if k != "params"} @@ -399,7 +606,7 @@ def optimizer_cpu_params_to_xla_params( return parameters_on_xla, need_to_create_new_optimizer @classmethod - def optimizer_for_tp( + def optimizer_for_mp( cls, optimizer: "torch.optim.Optimizer", orig_param_to_parallel_param_on_xla: Mapping[int, "torch.nn.Parameter"], @@ -429,14 +636,14 @@ def optimizer_for_tp( ) if hasattr(optimizer, "_args_to_recreate"): args, kwargs = optimizer._args_to_recreate - optimizer_for_tp = optimizer.__class__(parallel_parameters, *args[1:], **kwargs) + optimizer_for_mp = optimizer.__class__(parallel_parameters, *args[1:], **kwargs) del optimizer elif need_to_create_new_optimizer: - optimizer_for_tp = optimizer.__class__(parallel_parameters) + optimizer_for_mp = optimizer.__class__(parallel_parameters) del optimizer else: - optimizer_for_tp = optimizer - return optimizer_for_tp + optimizer_for_mp = optimizer + return optimizer_for_mp @classmethod def _get_parameters_tp_metadata(cls, named_parameters: Dict[str, "torch.nn.Parameter"]): @@ -509,26 +716,25 @@ def save_model_checkpoint_as_regular( @requires_neuronx_distributed def save_model_checkpoint_as_sharded( cls, - model: "PreTrainedModel", + model: Union["PreTrainedModel", "NxDPPModel"], output_dir: Union[str, Path], optimizer: Optional["torch.optim.Optimizer"] = None, ): import torch_xla.core.xla_model as xm from neuronx_distributed import parallel_layers - from neuronx_distributed.parallel_layers.parallel_state import ( - get_data_parallel_rank, - get_tensor_model_parallel_rank, - ) + from neuronx_distributed.pipeline import NxDPPModel cls._check_model_was_parallelized(model) - data_parallel_rank = get_data_parallel_rank() - tensor_parallel_rank = get_tensor_model_parallel_rank() - if not isinstance(output_dir, Path): output_dir = Path(output_dir) - state_dict = {"model": model.state_dict()} + if isinstance(model, NxDPPModel): + model_state_dict = model.local_state_dict() + else: + model_state_dict = model.state_dict() + + state_dict = {"model": model_state_dict} state_dict["sharded_metadata"] = { k: asdict(v) for k, v in cls._get_parameters_tp_metadata(dict(model.named_parameters())).items() } @@ -539,12 +745,12 @@ def save_model_checkpoint_as_sharded( output_path = output_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME - if data_parallel_rank == 0 and tensor_parallel_rank == 0: + if xm.get_local_ordinal() == 0: if output_path.is_dir(): shutil.rmtree(output_path, ignore_errors=True) output_path.mkdir() xm.rendezvous("waiting before saving") - parallel_layers.save(state_dict, output_path.as_posix()) + parallel_layers.save(state_dict, output_path.as_posix(), save_xser=True) @classmethod def save_model_checkpoint( @@ -572,7 +778,10 @@ def load_model_sharded_checkpoint(cls, 
model: "PreTrainedModel", load_dir: Union if not isinstance(load_dir, Path): load_dir = Path(load_dir) neuronx_distributed.parallel_layers.load( - load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model_or_optimizer=model, sharded=True + load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, + model_or_optimizer=model, + load_xser=True, + sharded=True, ) @classmethod @@ -588,6 +797,7 @@ def load_model_checkpoint(cls, model: "PreTrainedModel", load_dir: Union[str, Pa @classmethod @requires_neuronx_distributed def load_optimizer_sharded_checkpoint(cls, optimizer: "torch.optim.Optimizer", load_dir: Union[str, Path]): + import neuronx_distributed from neuronx_distributed.optimizer import NeuronZero1Optimizer is_zero_1_optimizer = optimizer.__class__.__name__ == "NeuronAcceleratedOptimizer" and isinstance( @@ -599,10 +809,13 @@ def load_optimizer_sharded_checkpoint(cls, optimizer: "torch.optim.Optimizer", l "It is not possible to load a sharded optimizer checkpoint when using ZeRO-1 yet." ) - from neuronx_distributed.parallel_layers import load - if not isinstance(load_dir, Path): load_dir = Path(load_dir) - load( - load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model_or_optimizer=optimizer, model_key="optimizer_state_dict" + + neuronx_distributed.parallel_layers.load( + load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, + model_or_optimizer=optimizer, + model_key="optimizer_state_dict", + load_xser=True, + sharded=True, ) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index 481890eed..0bb795e31 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -14,15 +14,18 @@ # limitations under the License. """Classes related to `neuronx-distributed` to perform parallelism.""" -from typing import TYPE_CHECKING, Optional, Tuple +import warnings +from typing import TYPE_CHECKING, Any, List, Optional, Tuple import torch +from transformers.cache_utils import Cache from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoBlock, GPTNeoSelfAttention from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXAttention from transformers.models.llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, LlamaRMSNorm, + _prepare_4d_causal_attention_mask, apply_rotary_pos_emb, repeat_kv, ) @@ -32,7 +35,7 @@ MistralRMSNorm, ) -from .base import Parallelizer +from .base import Parallelizer, PipelineParallelismSpecs, SequenceParallelismSpecs from .parallel_layers import ( LayerNormType, ParallelCrossEntropy, @@ -71,7 +74,7 @@ class GPTNeoParallelCrossEntropy(ParallelCrossEntropy): LAST_LINEAR_PROJECTION_NAME = {"GPTNeoForCausalLM": "lm_head"} -class GPTNeoParallelizer(Parallelizer): +class GPTNeoSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "transformer.h.[0-9]+.ln_[1-2]", "transformer.ln_f", @@ -108,6 +111,10 @@ def _merge_heads(self, tensor, num_heads, attn_head_size): module._split_heads = _split_heads.__get__(module) module._merge_heads = _merge_heads.__get__(module) + +class GPTNeoParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = GPTNeoSequenceParallelismSpecs + @classmethod def _parallelize( cls, @@ -158,14 +165,14 @@ class GPTNeoXParallelCrossEntropy(ParallelCrossEntropy): LAST_LINEAR_PROJECTION_NAME = {"GPTNeoXForCausalLM": "embed_out"} -class GPTNeoXParallelizer(Parallelizer): +class GPTNeoXSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "gpt_neox.layers.[0-9]+.input_layernorm", 
"gpt_neox.layers.[0-9]+.post_attention_layernorm", "gpt_neox.final_layer_norm", ] SEQUENCE_COLLECTIVE_OPS_INFOS = [ - SequenceCollectiveOpInfo("scatter", torch.nn.Embedding, "output", "first"), + SequenceCollectiveOpInfo("scatter", "gpt_neox.embed_in", "output", "first"), SequenceCollectiveOpInfo("gather", torch.nn.LayerNorm, "output", "last"), ] @@ -269,6 +276,10 @@ def sequence_parallel_forward( if isinstance(module, GPTNeoXAttention): module.forward = sequence_parallel_forward.__get__(module) + +class GPTNeoXParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = GPTNeoXSequenceParallelismSpecs + @classmethod def _parallelize( cls, @@ -366,7 +377,7 @@ class LlamaParallelCrossEntropy(ParallelCrossEntropy): } -class LlamaParallelizer(Parallelizer): +class LlamaSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "model.layers.[0-9]+.input_layernorm", "model.layers.[0-9]+.post_attention_layernorm", @@ -391,13 +402,20 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral def attention_forward( self, - hidden_states: "torch.Tensor", - attention_mask: Optional["torch.Tensor"] = None, - position_ids: Optional["torch.LongTensor"] = None, - past_key_value: Optional[Tuple["torch.Tensor"]] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, - ) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and removed since `transformers` v4.37. Please make sure to " + "use `attention_mask` instead.`" + ) + if self.config.pretraining_tp > 1: key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp query_slices = self.q_proj.weight.split( @@ -439,16 +457,21 @@ def attention_forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] + if self.layer_idx is None: + raise ValueError( + "The cache structure has changed since version `transformers v4.36. If you are using " + f"{self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to " + "initialize the attention class with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) @@ -505,6 +528,29 @@ def attention_forward( if isinstance(module, LlamaAttention): module.forward = attention_forward.__get__(module) + +class LlamaPipelineParallelismSpecs(PipelineParallelismSpecs): + TRASNFORMER_LAYER_CLS = LlamaDecoderLayer + DEFAULT_INPUT_NAMES = ("input_ids", "attention_mask", "labels") + LEAF_MODULE_CLASSES_NAMES = [LlamaRMSNorm] + + @classmethod + def get_patching_specs(cls) -> List[Tuple[str, Any]]: + leaf_prepare_4d_causal_attention_mask = torch.fx._symbolic_trace._create_wrapped_func( + _prepare_4d_causal_attention_mask + ) + return [ + ( + "transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask", + leaf_prepare_4d_causal_attention_mask, + ), + ] + + +class LlamaParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = LlamaSequenceParallelismSpecs + PIPELINE_PARALLELISM_SPECS_CLS = LlamaPipelineParallelismSpecs + @classmethod def _parallelize( cls, @@ -598,7 +644,7 @@ class MistralParallelCrossEntropy(ParallelCrossEntropy): LAST_LINEAR_PROJECTION_NAME = {"MistralForCausalLM": "lm_head"} -class MistralParallelizer(Parallelizer): +class MistralSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "model.layers.[0-9]+.input_layernorm", "model.layers.[0-9]+.post_attention_layernorm", @@ -625,11 +671,16 @@ def attention_forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and removed since `transformers` v4.37. Please make sure to " + "use `attention_mask` instead.`" + ) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) @@ -653,16 +704,21 @@ def attention_forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] + if self.layer_idx is None: + raise ValueError( + "The cache structure has changed since `transformers` v4.36. If you are using " + f"{self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to " + "initialize the attention class with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) @@ -713,6 +769,10 @@ def attention_forward( if isinstance(module, MistralAttention): module.forward = attention_forward.__get__(module) + +class MistralParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = MistralSequenceParallelismSpecs + @classmethod def _parallelize( cls, diff --git a/optimum/neuron/distributed/encoder_decoder_models.py b/optimum/neuron/distributed/encoder_decoder_models.py index 4fb537330..fa29ee8b6 100644 --- a/optimum/neuron/distributed/encoder_decoder_models.py +++ b/optimum/neuron/distributed/encoder_decoder_models.py @@ -20,7 +20,7 @@ from transformers.models.t5.modeling_t5 import T5Attention, T5ForSequenceClassification, T5LayerNorm from ...utils import NormalizedConfigManager -from .base import Parallelizer +from .base import Parallelizer, SequenceParallelismSpecs from .parallel_layers import ( LayerNormType, ParallelCrossEntropy, @@ -154,7 +154,7 @@ class T5ParallelCrossEntropy(ParallelCrossEntropy): LAST_LINEAR_PROJECTION_NAME = {"T5ForConditionalGeneration": "lm_head"} -class T5Parallelizer(Parallelizer): +class T5SequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "encoder.block.[0-9]+.layer.[0-9]+.layer_norm", "encoder.final_layer_norm", @@ -316,6 +316,8 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): if isinstance(module, T5Attention): module.forward = sequence_parallel_forward.__get__(module) + +class T5Parallelizer(Parallelizer): @classmethod def _parallelize( cls, diff --git a/optimum/neuron/distributed/encoder_models.py b/optimum/neuron/distributed/encoder_models.py index 2322d7434..c8e2c617c 100644 --- a/optimum/neuron/distributed/encoder_models.py +++ b/optimum/neuron/distributed/encoder_models.py @@ -19,7 +19,7 @@ import torch from ..utils.require_utils import requires_neuronx_distributed -from .base import Parallelizer +from .base import Parallelizer, SequenceParallelismSpecs from .parallel_layers import ( ParallelCrossEntropy, ParallelEmbedding, @@ -90,7 +90,7 @@ class BertParallelCrossEntropy(ParallelCrossEntropy): } -class BertParallelizer(Parallelizer): +class BertSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "bert.embeddings.LayerNorm", "bert.encoder.layer.[0-9]+.attention.output.LayerNorm", @@ -123,6 +123,10 @@ def transpose_for_scores(self, x: "torch.Tensor") -> "torch.Tensor": module.forward, sequence_parallel_enabled ).__get__(module) + +class BertParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = BertSequenceParallelismSpecs + @classmethod def _parallelize( cls, @@ -181,7 +185,7 @@ class RobertaParallelCrossEntropy(ParallelCrossEntropy): } -class RobertaParallelizer(Parallelizer): +class 
RobertaSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "roberta.embeddings.LayerNorm", "roberta.encoder.layer.[0-9]+.attention.output.LayerNorm", @@ -214,6 +218,10 @@ def transpose_for_scores(self, x: "torch.Tensor") -> "torch.Tensor": module.forward, sequence_parallel_enabled ).__get__(module) + +class RobertaParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = RobertaSequenceParallelismSpecs + @classmethod def _parallelize( cls, diff --git a/optimum/neuron/distributed/parallel_layers.py b/optimum/neuron/distributed/parallel_layers.py index 1db914886..9f626f61d 100644 --- a/optimum/neuron/distributed/parallel_layers.py +++ b/optimum/neuron/distributed/parallel_layers.py @@ -693,6 +693,7 @@ def transform( @requires_neuronx_distributed +@torch.fx.wrap def safe_parallel_cross_entropy(*args, **kwargs): if kwargs.pop("weight", None) is not None: raise ValueError("The weight keyword argument is not supported when using parallel cross entropy") @@ -714,6 +715,7 @@ def safe_parallel_cross_entropy(*args, **kwargs): input_ = args[0] if _PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT: input_ = input_.clone() + loss = parallel_cross_entropy(input_, *args[1:], **kwargs) if reduction == "mean": diff --git a/optimum/neuron/distributed/parallelizers_manager.py b/optimum/neuron/distributed/parallelizers_manager.py index 09fb929df..9c7d92e36 100644 --- a/optimum/neuron/distributed/parallelizers_manager.py +++ b/optimum/neuron/distributed/parallelizers_manager.py @@ -19,6 +19,7 @@ from transformers import PreTrainedModel +from ..utils.require_utils import requires_neuronx_distributed from .base import Parallelizer @@ -69,7 +70,12 @@ def get_supported_model_types(cls) -> List[str]: return list(cls._MODEL_TYPE_TO_PARALLEL_MODEL_CLASS.keys()) @classmethod + @requires_neuronx_distributed def _get_model_type(cls, model_type_or_model: Union[str, PreTrainedModel]) -> str: + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model_type_or_model, NxDPPModel): + model_type_or_model = model_type_or_model.original_torch_module if isinstance(model_type_or_model, PreTrainedModel): model_type = model_type_or_model.config.model_type else: diff --git a/optimum/neuron/distributed/utils.py b/optimum/neuron/distributed/utils.py index 7093818a6..66118b108 100644 --- a/optimum/neuron/distributed/utils.py +++ b/optimum/neuron/distributed/utils.py @@ -15,6 +15,7 @@ """Utilities for performing parallelism with `neuronx_distributed`""" import contextlib +import copy import functools import itertools import json @@ -28,21 +29,49 @@ from transformers.utils import is_peft_available from ..utils import DynamicPatch, Patcher +from ..utils.deprecate_utils import deprecate from ..utils.import_utils import is_neuronx_distributed_available from ..utils.misc import download_checkpoints_in_cache from ..utils.require_utils import requires_neuronx_distributed, requires_safetensors, requires_torch_xla +if is_neuronx_distributed_available(): + from neuronx_distributed.parallel_layers import layers + if TYPE_CHECKING: from transformers import PreTrainedModel - if is_neuronx_distributed_available(): - from neuronx_distributed.parallel_layers import layers - TENSOR_PARALLEL_SHARDS_DIR_NAME = "tensor_parallel_shards" +@deprecate( + "2.0.0", + package_name="torch", + reason="torch.nn.Module._named_members takes a `remove_duplicate` parameter starting from 2.0.0", +) +def _named_members(module, get_members_fn, prefix="", recurse=True, remove_duplicate: bool = True): + r"""Helper 
method for yielding various names + members of modules.""" + memo = set() + modules = module.named_modules(prefix=prefix, remove_duplicate=remove_duplicate) if recurse else [(prefix, module)] + for module_prefix, mod in modules: + members = get_members_fn(mod) + for k, v in members: + if v is None or v in memo: + continue + if remove_duplicate: + memo.add(v) + name = module_prefix + ("." if module_prefix else "") + k + yield name, v + + +def named_parameters(module: "torch.nn.Module", prefix: str = "", recurse: bool = True, remove_duplicate: bool = True): + gen = _named_members( + module, lambda mod: mod._parameters.items(), prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate + ) + yield from gen + + @dataclass class WeightInformation: """ @@ -140,6 +169,14 @@ def _validate_weight_info_device_matches_specified_device(device: "torch.device" ) +def mark_parameter_init_status_during_parallelization(parameter: "torch.nn.Parameter", initialized: bool): + setattr(parameter, "_was_initialized_during_parallelization", initialized) + + +def was_already_initialized_during_parallelization(parameter: "torch.nn.Parameter") -> bool: + return getattr(parameter, "_was_initialized_during_parallelization", False) + + @requires_neuronx_distributed def embedding_to_parallel_embedding( embedding_layer: "torch.nn.Embedding", @@ -217,10 +254,14 @@ def embedding_to_parallel_embedding( ), ) parallel_embedding_layer.weight.copy_(weight_data) - else: + mark_parameter_init_status_during_parallelization(parallel_embedding_layer.weight, True) + elif embedding_layer.weight.device != torch.device("meta"): parallel_embedding_layer.weight.copy_( embedding_layer.weight[tp_rank * row_size : (tp_rank + 1) * row_size, :] ) + mark_parameter_init_status_during_parallelization(parallel_embedding_layer.weight, True) + else: + mark_parameter_init_status_during_parallelization(parallel_embedding_layer.weight, False) if lm_head_layer is not None: parallel_lm_head_layer = linear_to_parallel_linear( @@ -334,19 +375,25 @@ def linear_to_parallel_linear( ), ) parallel_linear_layer.weight.copy_(weight_data) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, True) elif linear_layer.weight.device != torch.device("meta"): parallel_linear_layer.weight.copy_( linear_layer.weight[:, tp_rank * col_size : (tp_rank + 1) * col_size] ) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, True) else: - raise ValueError("Could not find data for the linear layer to parellelize.") + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, False) if linear_layer.bias is not None: if linear_layer_bias_weight_info is not None: bias_weight_data = load_tensor_for_weight(linear_layer_bias_weight_info) parallel_linear_layer.bias.copy_(bias_weight_data) - else: + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, True) + elif linear_layer.bias.device != torch.device("meta"): parallel_linear_layer.bias.copy_(linear_layer.bias) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, True) + else: + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, False) else: if embedding_weight_to_tie is not None: @@ -360,12 +407,14 @@ def linear_to_parallel_linear( ), ) parallel_linear_layer.weight.copy_(weight_data) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, True) elif linear_layer.weight.device != torch.device("meta"): parallel_linear_layer.weight.copy_( 
linear_layer.weight[tp_rank * row_size : (tp_rank + 1) * row_size, :] ) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, True) else: - raise ValueError("Could not find data for the linear layer to parellelize.") + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, False) if linear_layer.bias is not None: if linear_layer_bias_weight_info is not None: @@ -383,13 +432,17 @@ def linear_to_parallel_linear( tensor_slices=tensor_slices, ) parallel_linear_layer.bias.copy_(bias_weight_data) - else: + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, True) + elif linear_layer.bias.device != torch.device("meta"): if gather_output: parallel_linear_layer.bias.copy_(linear_layer.bias) else: parallel_linear_layer.bias.copy_( linear_layer.bias[tp_rank * row_size : (tp_rank + 1) * row_size] ) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, True) + else: + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, False) return parallel_linear_layer @@ -451,13 +504,15 @@ def gqa_key_value_slicing_when_tp_size_greater_than_num_key_value_heads( ), ) sliced_linear_layer.weight.copy_(weight_data) + mark_parameter_init_status_during_parallelization(sliced_linear_layer.weight, True) elif linear_layer.weight.device != torch.device("meta"): sliced_linear_layer.weight.copy_( linear_layer.weight[key_value_head_index * head_dim : (key_value_head_index + 1) * head_dim, :] ) + mark_parameter_init_status_during_parallelization(sliced_linear_layer.weight, True) else: - raise ValueError("Could not find data for the linear layer to slice.") + mark_parameter_init_status_during_parallelization(sliced_linear_layer.weight, False) if linear_layer.bias is not None: if linear_layer_bias_weight_info is not None: @@ -466,10 +521,14 @@ def gqa_key_value_slicing_when_tp_size_greater_than_num_key_value_heads( tensor_slices=((key_value_head_index * head_dim, (key_value_head_index + 1) * head_dim),), ) sliced_linear_layer.bias.copy_(bias_weight_data) - else: + mark_parameter_init_status_during_parallelization(sliced_linear_layer.bias, True) + elif sliced_linear_layer.bias.device != torch.device("meta"): sliced_linear_layer.bias.copy_( linear_layer.bias[key_value_head_index * head_dim : (key_value_head_index + 1) * head_dim] ) + mark_parameter_init_status_during_parallelization(sliced_linear_layer.bias, True) + else: + mark_parameter_init_status_during_parallelization(sliced_linear_layer.bias, False) return sliced_linear_layer @@ -490,31 +549,47 @@ def try_to_hf_initialize(model: "PreTrainedModel", mod: torch.nn.Module, paramet """ cached_params_data = {name: param.data.clone() for name, param in mod.named_parameters()} model._init_weights(mod) + + dummy_mod = copy.deepcopy(mod) + for name in parameter_names: + getattr(dummy_mod, name).random_() + model._init_weights(dummy_mod) + left_uninitialized = [] with torch.no_grad(): for name in parameter_names: - if torch.all(cached_params_data[name] == getattr(mod, name).data): - left_uninitialized.append(name) + # The parameter was left unchanged. + if torch.all(getattr(mod, name).data == cached_params_data[name]): + # There are two possible reasons: + # 1. The model cannot initialize the module that owns the parameter. + # 2. The parameter already had the proper value. + + # We check if a dummy copy of the module, filled with random values is modified to know if the model + # can initialize the module. 
+ dummy_param_was_changed = torch.all(getattr(dummy_mod, name).data == getattr(mod, name).data) + if not dummy_param_was_changed: + left_uninitialized.append(name) + for name, cached_data in cached_params_data.items(): if name not in parameter_names: param = getattr(mod, name) param.data = cached_data + return left_uninitialized -def initialize_linear(mod: torch.nn.Linear, parameter_names: List[str]): +def initialize_torch_nn_module(mod: torch.nn.Module, parameter_names: List[str]): """ Initializes the parameters in `parameter_names` of a `torch.nn.Linear` module. """ - cached_parameters = [mod.weight.data] - if mod.bias is not None: - cached_parameters.append(mod.bias.data) + if not hasattr(mod, "reset_parameters"): + raise ValueError(f"{mod} does not have a `reset_parameters` method.") + cached_parameters = {name: param.data.clone() for name, param in mod.named_parameters()} mod.reset_parameters() with torch.no_grad(): - if "weight" not in parameter_names: - mod.weight.data = cached_parameters[0] - if mod.bias is not None and "bias" not in parameter_names: - mod.bias.data = cached_parameters[1] + for name, param in mod.named_parameters(): + if param is not None and name not in parameter_names: + param.data = cached_parameters[name] def initialize_parallel_linear(mod: "layers.BaseParallelLinear", parameter_names: List[str]): @@ -531,9 +606,18 @@ def initialize_parallel_linear(mod: "layers.BaseParallelLinear", parameter_names mod._init_bias() +def parameter_can_be_initialized(model: torch.nn.Module, parent_module: torch.nn.Module, parameter_name: str) -> bool: + clone = copy.deepcopy(parent_module) + left_uninitialized = try_to_hf_initialize(model, clone, [parameter_name]) + is_parallel_linear = isinstance(parent_module, layers.BaseParallelLinear) + return ( + hasattr(parent_module, "reset_parameters") or is_parallel_linear or (parameter_name not in left_uninitialized) + ) + + @classmethod @requires_torch_xla -def from_pretrained_for_tp( +def from_pretrained_for_mp( cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, @@ -672,8 +756,8 @@ def from_pretrained_for_tp( if not sharing_same_suffix_as_name: continue names_of_weights_not_in_model.add(name) - longest_sharing_parameter_name = max(sharing_same_suffix_as_name, key=lambda s: len(s)) - prefixes.add(longest_sharing_parameter_name.replace(name, "")) + shortest_sharing_parameter_name = min(sharing_same_suffix_as_name, key=lambda s: len(s)) + prefixes.add(shortest_sharing_parameter_name.replace(name, "")) else: weight_map_for_model[name] = filename if names_of_weights_not_in_model: @@ -703,7 +787,7 @@ def from_pretrained_for_tp( @contextlib.contextmanager -def lazy_load_for_parallelism(tensor_parallel_size: int = 1): +def lazy_load_for_parallelism(tensor_parallel_size: int = 1, pipeline_parallel_size: int = 1): """ Context manager that makes the loading of a model lazy for model parallelism: @@ -711,11 +795,15 @@ def lazy_load_for_parallelism(tensor_parallel_size: int = 1): instantiate. - Every `torch.nn.Embedding` is also put on the `torch.device("meta")` device. - No state dict is actually loaded, instead a weight map is created and attached to the model. For more - information, read the [`optimum.neuron.distributed.utils.from_pretrained_for_tp`] docstring. + information, read the [`optimum.neuron.distributed.utils.from_pretrained_for_mp`] docstring. + + If both `tensor_parallel_size` and `pipeline_parallel_size` are set to 1, no lazy loading is performed. 
Args: tensor_parallel_size (`int`, defaults to 1): - The parallel size considered for tensor parallel size. If set to 1, no lazy loading is performed. + The tensor parallel size considered. + pipeline_parallel_size (`int`, defaults to 1): + The pipeline parallel size considered. """ def meta_init(init_fn): @@ -731,9 +819,9 @@ def wrapper(*args, **kwargs): patching_specs = [ ("torch.nn.Embedding.__init__", meta_init_patch), ("torch.nn.Linear.__init__", meta_init_patch), - ("transformers.modeling_utils.PreTrainedModel.from_pretrained", from_pretrained_for_tp), + ("transformers.modeling_utils.PreTrainedModel.from_pretrained", from_pretrained_for_mp), ] - if tensor_parallel_size > 1: + if tensor_parallel_size > 1 or pipeline_parallel_size > 1: patcher = Patcher(patching_specs=patching_specs) else: patcher = contextlib.nullcontext() @@ -753,6 +841,21 @@ def make_optimizer_constructor_lazy(optimizer_cls: Type["torch.optim.Optimizer"] def optimizer_constructor(*args, **kwargs): optimizer_with_no_parameters = optimizer_cls([torch.nn.Parameter(torch.empty(1))], *args[1:], **kwargs) + # It is necessary to make sure that what's holding the parameters is not an iterator, otherwise it can lead to + # unexpected behaviour since each entry will be evaluated at iteration time. There are 2 possibilities: + # 1. args[0] holds the parameters + # 2. args[0] holds a list of parameter groups + parameters_or_parameter_groups = args[0] + if not isinstance(parameters_or_parameter_groups, list): + parameters_or_parameter_groups = list(parameters_or_parameter_groups) + if isinstance(parameters_or_parameter_groups[0], dict): + # It means that parameter groups were provided. We iterate over each group and make sure that the + # `"params"` entry is not an iterator. + for group in parameters_or_parameter_groups: + if not isinstance(group["params"], list): + group["params"] = list(group["params"]) + + args = (parameters_or_parameter_groups,) + args[1:] optimizer_with_no_parameters._args_to_recreate = (args, kwargs) return optimizer_with_no_parameters diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py index e464d0f02..c066ae797 100755 --- a/optimum/neuron/trainers.py +++ b/optimum/neuron/trainers.py @@ -14,11 +14,14 @@ # limitations under the License. 
"""Defines Trainer subclasses to perform training on AWS Neuron instances.""" -import contextlib import copy import glob +import math import os import random +import shutil +import sys +import time import warnings from pathlib import Path from tempfile import TemporaryDirectory @@ -26,41 +29,59 @@ import numpy as np import torch +from accelerate import __version__ as accelerate_version from packaging import version from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, TrainingArguments +from transformers.debug_utils import DebugOption, DebugUnderflowOverflow +from transformers.integrations import hp_params from transformers.modeling_utils import unwrap_model +from transformers.pytorch_utils import is_torch_less_than_1_11 from transformers.trainer import ( OPTIMIZER_NAME, SCHEDULER_NAME, TRAINER_STATE_NAME, TRAINING_ARGS_NAME, ) +from transformers.trainer_callback import TrainerState from transformers.trainer_pt_utils import ( + IterableDatasetShard, + find_batch_size, + get_dataloader_sampler, + nested_concat, + nested_numpify, reissue_pt_warnings, ) -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, EvalLoopOutput, has_length -from transformers.utils import WEIGHTS_NAME, is_sagemaker_mp_enabled +from transformers.trainer_utils import ( + PREFIX_CHECKPOINT_DIR, + EvalLoopOutput, + EvalPrediction, + HPSearchBackend, + TrainOutput, + denumpify_detensorize, + has_length, + speed_metrics, +) +from transformers.training_args import ParallelMode +from transformers.utils import WEIGHTS_NAME, is_apex_available, is_sagemaker_mp_enabled from ..utils import check_if_transformers_greater, logging from .accelerate import NeuronAccelerator, NeuronDistributedType -from .distributed import ParallelizersManager +from .distributed import Parallelizer, ParallelizersManager from .distributed.utils import make_optimizer_constructor_lazy from .trainer_callback import NeuronCacheCallback from .utils import ( - DynamicPatch, - ModelPatcher, Patcher, is_torch_xla_available, patch_within_function, ) from .utils.cache_utils import get_neuron_cache_path, set_neuron_cache_path +from .utils.require_utils import requires_neuronx_distributed from .utils.training_utils import ( TRANSFORMERS_MIN_VERSION_USE_ACCELERATE, get_model_param_count, is_precompilation, is_topology_supported, patch_generation_mixin_to_neuron_generation_mixin, - patched_finfo, prepare_environment_for_neuron, set_neuron_cc_optlevel_for_model, skip_first_batches, @@ -68,8 +89,15 @@ ) +if is_apex_available(): + from apex import amp + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + if is_torch_xla_available(): import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met if is_sagemaker_mp_enabled(): from smdistributed.modelparallel import __version__ as SMP_VERSION @@ -79,7 +107,6 @@ else: IS_SAGEMAKER_MP_POST_1_10 = False - logger = logging.get_logger("transformers.trainer") KEEP_HF_HUB_PROGRESS_BARS = os.environ.get("KEEP_HF_HUB_PROGRESS_BARS") @@ -94,16 +121,6 @@ _TCP_STORE_PORT = 5000 -MODEL_PATCHING_SPECS = [ - ("config.layerdrop", 0), - ("no_sync", lambda: contextlib.nullcontext()), - ( - "forward", - DynamicPatch(patch_within_function(("torch.finfo", patched_finfo))), - ), -] - - if os.environ.get("TORCHELASTIC_RUN_ID"): import torch_xla.distributed.xla_backend as xbn @@ -178,7 +195,7 @@ def __init__(self, *args, **kwargs): logger.setLevel(logging.INFO) push = self.args.local_rank <= 0 and not is_precompilation() and not self.args.skip_cache_push - fetch = 
self.args.local_rank <= 0 or self.args.tp_plugin.should_parallelize + fetch = self.args.local_rank <= 0 or self.args.mp_plugin.should_parallelize callback = NeuronCacheCallback( tmp_neuron_cache=_TMP_NEURON_CACHE_PATH, @@ -196,11 +213,8 @@ def __init__(self, *args, **kwargs): set_neuron_cc_optlevel_for_model(self.model, optlevel=self.args.neuron_cc_optlevel) @property - def tp_enabled(self): - return ( - check_if_transformers_greater("4.30.0") - and self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM - ) + def mp_enabled(self): + return self.accelerator.distributed_type is NeuronDistributedType.MODEL_PARALLELISM def prepare_args_for_precompilation(self, args: "TrainingArguments"): if args.num_train_epochs != 1: @@ -221,7 +235,7 @@ def create_accelerator_and_postprocess(self): self.accelerator = NeuronAccelerator( deepspeed_plugin=self.args.deepspeed_plugin, gradient_accumulation_steps=self.args.gradient_accumulation_steps, - tp_plugin=self.args.tp_plugin, + mp_plugin=self.args.mp_plugin, zero_1=self.args.zero_1, ) @@ -246,12 +260,9 @@ def create_accelerator_and_postprocess(self): ds_plugin.hf_ds_config.trainer_config_process(self.args) def _wrap_model(self, model, training=True, dataloader=None): - patching_specs = [] - for spec in MODEL_PATCHING_SPECS: - patching_specs.append((model,) + spec) - - with ModelPatcher(patching_specs, ignore_missing_attributes=True): - return super()._wrap_model(model, training=training, dataloader=dataloader) + return super()._wrap_model( + self.accelerator.patch_model_for_neuron(model), training=training, dataloader=dataloader + ) # TODO: make this cleaner. def trigger_on_step_middle_for_neuron_cache_callback(self, model: "PreTrainedModel"): @@ -269,7 +280,7 @@ def trigger_on_step_middle_for_neuron_cache_callback(self, model: "PreTrainedMod callback.on_step_middle(self.args, self.state, self.control, **kwargs) def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: - if self.tp_enabled: + if self.mp_enabled: if self.train_dataset is None or not has_length(self.train_dataset): return None @@ -285,7 +296,7 @@ def _get_eval_sampler(self, eval_dataset: torch.utils.data.Dataset) -> Optional[ @staticmethod def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: optimizer_cls, optimizer_kwargs = transformers_get_optimizer_cls_and_kwargs(args) - lazy_load = args.tp_plugin.should_parallelize or args.zero_1 + lazy_load = args.mp_plugin.should_parallelize or args.zero_1 if check_if_transformers_greater("4.30.0") and lazy_load: optimizer_cls = make_optimizer_constructor_lazy(optimizer_cls) return optimizer_cls, optimizer_kwargs @@ -294,11 +305,47 @@ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: def create_optimizer(self): return super().create_optimizer() + def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]: + # When pipeline parallelism is enabled, we should not put any tensor on device. + # It is handled by the NxDPPModel class. 
+ if self.args.mp_plugin.pipeline_parallel_size > 1: + return data + return super()._prepare_input(data) + def compute_loss(self, model, inputs, return_outputs: bool = False): self.state.last_inputs = inputs self.trigger_on_step_middle_for_neuron_cache_callback(model) + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model, NxDPPModel): + inputs = self._prepare_inputs(inputs) + loss = model.run_train(**inputs) + return loss + return super().compute_loss(model, inputs, return_outputs=return_outputs) + def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model, NxDPPModel): + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_size, + ) + + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs) + + if get_pipeline_model_parallel_rank() != get_pipeline_model_parallel_size() - 1: + use_bf16 = os.environ.get("XLA_USE_BF16", False) or os.environ.get("XLA_DOWNCAST_BF16", False) + dtype = torch.bfloat16 if use_bf16 else torch.float32 + loss = torch.tensor(0, dtype=dtype).to(xm.xla_device()) + else: + loss = loss.detach() + return loss / self.args.gradient_accumulation_steps + return super().training_step(model, inputs) + + @requires_neuronx_distributed def prediction_step( self, model: torch.nn.Module, @@ -306,21 +353,21 @@ def prediction_step( prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + from neuronx_distributed.pipeline import NxDPPModel + self.state.last_inputs = inputs self.trigger_on_step_middle_for_neuron_cache_callback(model) - return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - @patch_within_function(("transformers.trainer.get_model_param_count", get_model_param_count)) - def _inner_training_loop( - self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None - ): - return super()._inner_training_loop( - batch_size=batch_size, - args=args, - resume_from_checkpoint=resume_from_checkpoint, - trial=trial, - ignore_keys_for_eval=ignore_keys_for_eval, - ) + if isinstance(model, NxDPPModel): + if not prediction_loss_only: + raise ValueError("Only the prediction loss can be returned when doing pipeline parallelism.") + loss = model.run_eval(**inputs) + if loss is None: + use_bf16 = os.environ.get("XLA_USE_BF16", False) or os.environ.get("XLA_DOWNCAST_BF16", False) + dtype = torch.bfloat16 if use_bf16 else torch.float32 + loss = torch.tensor(0, dtype=dtype).to(xm.xla_device()) + return (loss, None, None) + return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): if self.control.should_log: @@ -328,20 +375,36 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for xm.mark_step() - if self.args.tp_plugin.tensor_parallel_size > 1: + if self.args.mp_plugin.should_parallelize: from neuronx_distributed.parallel_layers.parallel_state import ( get_data_parallel_group, get_data_parallel_size, + get_pipeline_model_parallel_group, + get_pipeline_model_parallel_size, ) dp_size = get_data_parallel_size() + pp_size = get_pipeline_model_parallel_size() tr_loss_div = tr_loss / dp_size - tr_loss_scalar = xm.all_reduce( - 
xm.REDUCE_SUM, - tr_loss_div, - groups=get_data_parallel_group(as_list=True), - ) - tr_loss_scalar = tr_loss_scalar.detach().item() + + if pp_size > 1: + tr_loss_div = xm.all_reduce( + xm.REDUCE_SUM, tr_loss_div, groups=get_data_parallel_group(as_list=True) + ) + tr_loss_div = xm.all_reduce( + xm.REDUCE_SUM, + tr_loss_div, + groups=get_pipeline_model_parallel_group(as_list=True), + ) + xm.mark_step() + tr_loss_scalar = tr_loss_div.item() + else: + tr_loss_scalar = xm.all_reduce( + xm.REDUCE_SUM, + tr_loss_div, + groups=get_data_parallel_group(as_list=True), + ) + tr_loss_scalar = tr_loss_scalar.detach().item() else: # all_gather + mean() to get average loss over all processes tr_loss_scalar = self._nested_gather(tr_loss).mean().item() @@ -395,20 +458,20 @@ def _save_xla(self, output_dir: Optional[str] = None): # Save a trained model and configuration using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` xm.rendezvous("saving_checkpoint") - if self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + if self.accelerator.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: logger.info("Model parallelism is enabled, only saving the model sharded state dict.") + # TODO: how to handle pp? if isinstance(self.model, PreTrainedModel): from neuronx_distributed.parallel_layers.parallel_state import get_tensor_model_parallel_size config = copy.deepcopy(self.model.config) - if self.args.tp_plugin.parallelize_embeddings: + if self.args.mp_plugin.parallelize_embeddings: config.vocab_size = config.vocab_size * get_tensor_model_parallel_size() config.save_pretrained(output_dir) - parallelizer = ParallelizersManager.parallelizer_for_model(self.model) # This mark_step is needed to avoid hang issues. xm.mark_step() - parallelizer.save_model_checkpoint(self.model, output_dir, as_sharded=True, optimizer=self.optimizer) + Parallelizer.save_model_checkpoint(self.model, output_dir, as_sharded=True, optimizer=self.optimizer) else: safe_save_function_patcher = Patcher( [("transformers.modeling_utils.safe_save_file", torch_xla_safe_save_file)] @@ -468,8 +531,9 @@ def _save_checkpoint(self, model, trial, metrics=None): self.save_model(output_dir, _internal_call=True) # The optimizer state is saved in the shard alongside with the model parameters when doing TP. - if self.accelerator.distributed_type is not NeuronDistributedType.TENSOR_PARALLELISM: + if self.accelerator.distributed_type is not NeuronDistributedType.MODEL_PARALLELISM: xm.rendezvous("saving_optimizer_states") + # TODO: how to handle pp? xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) with warnings.catch_warnings(record=True) as caught_warnings: @@ -523,9 +587,10 @@ def _save_checkpoint(self, model, trial, metrics=None): def _load_from_checkpoint(self, resume_from_checkpoint, model=None): # It has been handled during model parallelization. - if self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + # TODO: how to handle pp? 
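The loss-reduction pattern added to `_maybe_log_save_evaluate` above can be summarized with a small, hedged sketch (assuming `torch_xla` and `neuronx_distributed` are installed and model parallelism has been initialized; the helper name is illustrative, not part of this PR). Only the last pipeline stage returns a real loss from `training_step`, while the other ranks contribute zeros, so summing over the pipeline-parallel group after averaging over the data-parallel ranks recovers the full value:

```python
import torch
import torch_xla.core.xla_model as xm
from neuronx_distributed.parallel_layers.parallel_state import (
    get_data_parallel_group,
    get_data_parallel_size,
    get_pipeline_model_parallel_group,
    get_pipeline_model_parallel_size,
)


def reduce_loss_for_logging(tr_loss: torch.Tensor) -> float:
    """Illustrative reduction of the accumulated training loss before logging it."""
    # Average over data-parallel ranks: divide locally, then sum across the DP group.
    loss = tr_loss / get_data_parallel_size()
    loss = xm.all_reduce(xm.REDUCE_SUM, loss, groups=get_data_parallel_group(as_list=True))
    # With pipeline parallelism, non-final stages hold a zero loss, so a sum over the
    # PP group simply gathers the real value from the last stage.
    if get_pipeline_model_parallel_size() > 1:
        loss = xm.all_reduce(xm.REDUCE_SUM, loss, groups=get_pipeline_model_parallel_group(as_list=True))
    xm.mark_step()  # materialize the lazy XLA graph before reading the scalar
    return loss.detach().item()
```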
+ if self.accelerator.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: return - super()._load_from_checkpoint(self, resume_from_checkpoint, model=model) + super()._load_from_checkpoint(resume_from_checkpoint, model=model) def _load_optimizer_and_scheduler_for_xla_fsdp(self, checkpoint): checkpoint_file_exists = ( @@ -549,7 +614,7 @@ def _load_optimizer_and_scheduler(self, checkpoint): return if self.accelerator.distributed_type is NeuronDistributedType.XLA_FSDP: return self._load_optimizer_and_scheduler_for_xla_fsdp(checkpoint) - elif self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + elif self.accelerator.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: lr_scheduler_state = torch.load(os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu") xm.send_cpu_data_to_device(lr_scheduler_state, self.args.device) self.lr_scheduler.load_state_dict(lr_scheduler_state) @@ -559,18 +624,461 @@ def _load_optimizer_and_scheduler(self, checkpoint): else: return super()._load_optimizer_and_scheduler(checkpoint) - @patch_within_function(("transformers.trainer.skip_first_batches", skip_first_batches)) + @requires_neuronx_distributed def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): - return super()._inner_training_loop( - batch_size=batch_size, - args=args, - resume_from_checkpoint=resume_from_checkpoint, - trial=trial, - ignore_keys_for_eval=ignore_keys_for_eval, + from neuronx_distributed.pipeline import NxDPPModel + + self.accelerator.free_memory() + self._train_batch_size = batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") + # Data loader and number of training steps + train_dataloader = self.get_train_dataloader() + + # Setting up training control variables: + # number of training epochs: num_train_epochs + # number of training steps per epoch: num_update_steps_per_epoch + # total number of training steps to execute: max_steps + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size + + len_dataloader = None + num_train_tokens = None + if has_length(train_dataloader): + len_dataloader = len(train_dataloader) + num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps + num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) + num_examples = self.num_examples(train_dataloader) + if args.max_steps > 0: + max_steps = args.max_steps + num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( + args.max_steps % num_update_steps_per_epoch > 0 + ) + # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's + # the best we can do. + num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = ( + self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + ) + else: + max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) + num_train_epochs = math.ceil(args.num_train_epochs) + num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs + elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size + max_steps = args.max_steps + # Setting a very large number of epochs so we go as many times as necessary over the iterator. 
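+ # (With `sys.maxsize` epochs, the training loop below is effectively bounded only by `max_steps`.)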
+ num_train_epochs = sys.maxsize + num_update_steps_per_epoch = max_steps + num_examples = total_train_batch_size * args.max_steps + num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + else: + raise ValueError( + "args.max_steps must be set to a positive value if dataloader does not have a length, was" + f" {args.max_steps}" + ) + + if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: + if self.args.n_gpu > 1: + # nn.DataParallel(model) replicates the model, creating new variables and module + # references registered here no longer work on other gpus, breaking the module + raise ValueError( + "Currently --debug underflow_overflow is not supported under DP. Please use DDP" + " (torch.distributed.launch)." + ) + else: + debug_overflow = DebugUnderflowOverflow(self.model) # noqa + + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled + + # We need to reset the scheduler, as its parameters may be different on subsequent calls + if self._created_lr_scheduler: + self.lr_scheduler = None + self._created_lr_scheduler = False + + if not delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + self.state = TrainerState() + self.state.is_hyper_param_search = trial is not None + + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + + # Activate gradient checkpointing if needed + if args.gradient_checkpointing: + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) + + model = self._wrap_model(self.model_wrapped) + + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. 
+ model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + if isinstance(model, NxDPPModel): + self.model = model + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + # Check if saved optimizer or scheduler states exist + self._load_optimizer_and_scheduler(resume_from_checkpoint) + + # important: at this point: + # self.model is the Transformers Model + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. + + # Train! + logger.info("***** Running training *****") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") + + self.state.epoch = 0 + start_time = time.time() + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + steps_trained_progress_bar = None + + # Check if continuing training from a checkpoint + if resume_from_checkpoint is not None and os.path.isfile( + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) + ): + self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + epochs_trained = self.state.global_step // num_update_steps_per_epoch + if not args.ignore_data_skip: + steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) + steps_trained_in_current_epoch *= args.gradient_accumulation_steps + else: + steps_trained_in_current_epoch = 0 + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") + logger.info(f" Continuing training from epoch {epochs_trained}") + logger.info(f" Continuing training from global step {self.state.global_step}") + if not args.ignore_data_skip: + logger.info( + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." + ) + + # Update the references + self.callback_handler.model = self.model + self.callback_handler.optimizer = self.optimizer + self.callback_handler.lr_scheduler = self.lr_scheduler + self.callback_handler.train_dataloader = train_dataloader + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. 
+ self.state.trial_name = self.hp_name(self._trial) + if trial is not None: + assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial + self.state.trial_params = hp_params(assignments) + else: + self.state.trial_params = None + # This should be the same if the state has been saved but in case the training arguments changed, it's safer + # to set this after the load. + self.state.max_steps = max_steps + self.state.num_train_epochs = num_train_epochs + self.state.is_local_process_zero = self.is_local_process_zero() + self.state.is_world_process_zero = self.is_world_process_zero() + + # tr_loss is a tensor to avoid synchronization of TPUs through .item() + tr_loss = torch.tensor(0.0).to(args.device) + # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses + self._total_loss_scalar = 0.0 + self._globalstep_last_logged = self.state.global_step + + # It should be equivalent but prefer to use the `zero_grad` method from the optimizer when doing pipeline + # parallelism. + if isinstance(model, NxDPPModel): + self.optimizer.zero_grad() + else: + model.zero_grad() + + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. + if not args.ignore_data_skip: + for epoch in range(epochs_trained): + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [torch.utils.data.RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) + if is_torch_less_than_1_11 or not is_random_sampler: + # We just need to begin an iteration to create the randomization of the sampler. + for _ in train_dataloader: + break + else: + # Otherwise we need to call the whooooole sampler cause there is some random operation added + # AT THE VERY END! + sampler = sampler if sampler is not None else [] + _ = list(sampler) + + total_batched_samples = 0 + for epoch in range(epochs_trained, num_train_epochs): + epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) + + # Reset the past mems state at the beginning of each epoch if necessary. 
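+ # (`args.past_index` marks which model output, e.g. XLNet/Transformer-XL `mems`, is fed back as past state on the next step.)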
+ if args.past_index >= 0: + self._past = None + + steps_in_epoch = ( + len(epoch_iterator) + if len_dataloader is not None + else args.max_steps * args.gradient_accumulation_steps + ) + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + + if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + + step = -1 + for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + + # Skip past any already trained steps if resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(1) + if steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + continue + elif steps_trained_progress_bar is not None: + steps_trained_progress_bar.close() + steps_trained_progress_bar = None + + if step % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) + + with self.accelerator.accumulate(model): + tr_loss_step = self.training_step(model, inputs) + + if ( + args.logging_nan_inf_filter + and not is_torch_xla_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): + # if loss is nan or inf simply add the average of previous logged losses + tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) + else: + tr_loss += tr_loss_step + + self.current_flos += float(self.floating_point_ops(inputs)) + + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) + + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or + # last step in epoch but step is always smaller than gradient_accumulation_steps + is_last_step_and_steps_less_than_grad_acc + ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. 
+ if is_last_step_and_steps_less_than_grad_acc or ( + version.parse(accelerate_version) <= version.parse("0.20.3") + ): + self.accelerator.gradient_state._set_sync_gradients(True) + + # Gradient clipping + if args.max_grad_norm is not None and args.max_grad_norm > 0: + # deepspeed does its own clipping + + if is_sagemaker_mp_enabled() and args.fp16: + self.optimizer.clip_master_grads(args.max_grad_norm) + elif self.use_apex: + # Revert to normal clipping otherwise, handling Apex or full precision + torch.nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + self.accelerator.clip_grad_norm_( + model.parameters(), + args.max_grad_norm, + ) + + # Optimizer step + self.optimizer.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() + + # It should be equivalent but prefer to use the `zero_grad` method from the optimizer when doing + # pipeline parallelism. + if isinstance(model, NxDPPModel): + self.optimizer.zero_grad() + else: + model.zero_grad() + + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) + else: + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) + + if self.control.should_epoch_stop or self.control.should_training_stop: + break + if step < 0: + logger.warning( + "There seems to be not a single sample in your epoch_iterator, stopping training at step" + f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" + f" num_steps ({max_steps}) higher than the number of available samples." + ) + self.control.should_training_stop = True + + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) + + if DebugOption.TPU_METRICS_DEBUG in self.args.debug: + if is_torch_xla_available(): + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + else: + logger.warning( + "You enabled PyTorch/XLA debug metrics but you don't have a TPU " + "configured. Check your training configuration if this is unexpected." + ) + if self.control.should_training_stop: + break + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of training + delattr(self, "_past") + + logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") + if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: + # Wait for everyone to get here so we are sure the model has been saved by process 0. 
+ if is_torch_xla_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: + torch.distributed.barrier() + elif is_sagemaker_mp_enabled(): + smp.barrier() + + self._load_best_model() + + # add remaining tr_loss + self._total_loss_scalar += tr_loss.item() + train_loss = self._total_loss_scalar / self.state.global_step + + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, ) + self.store_flos() + metrics["total_flos"] = self.state.total_flos + metrics["train_loss"] = train_loss + + self.is_in_train = False + + self._memory_tracker.stop_and_update_metrics(metrics) + + self.log(metrics) + + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + + return TrainOutput(self.state.global_step, train_loss, metrics) + + @requires_neuronx_distributed def evaluation_loop( self, dataloader: torch.utils.data.DataLoader, @@ -579,19 +1087,233 @@ def evaluation_loop( ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + from neuronx_distributed.parallel_layers.parallel_state import get_data_parallel_size + from neuronx_distributed.pipeline import NxDPPModel + # This will prepare the model if it was not prepared before. # This is needed for example for TP when we performing only evaluation (no training): # 1. The model needs to be loaded if it was lazy loaded. # 2. The model needs to be parallelized. 
- self.accelerator.prepare_model(self.model) - - return super().evaluation_loop( - dataloader, - description, - prediction_loss_only=prediction_loss_only, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) + model = self.accelerator.prepare_model(self.model) + + args = self.args + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only + + is_nxdppmodel = isinstance(model, NxDPPModel) + if not is_nxdppmodel: + model = self._wrap_model(model, training=False, dataloader=dataloader) + + if len(self.accelerator._models) == 0 and model is self.model: + model = ( + self.accelerator.prepare(model) + if self.is_deepspeed_enabled + else self.accelerator.prepare_model(model, evaluation_mode=True) + ) + + if self.is_fsdp_enabled: + self.model = model + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called + # while ``train`` is running, cast it to the right dtype first and then put on device + if not self.is_in_train and not is_nxdppmodel: + if args.fp16_full_eval: + model = model.to(dtype=torch.float16, device=args.device) + elif args.bf16_full_eval: + model = model.to(dtype=torch.bfloat16, device=args.device) + + batch_size = self.args.eval_batch_size + + logger.info(f"***** Running {description} *****") + dp_size = get_data_parallel_size() + logger.info(f" Num data parallel workers = {dp_size}") + if has_length(dataloader): + num_examples = self.num_examples(dataloader) + total_num_examples = num_examples * dp_size + logger.info(f" Per data parallel worker num examples = {num_examples}") + logger.info(f" Total num examples = {total_num_examples}") + else: + logger.info(" Num examples: Unknown") + logger.info(f" Batch size = {batch_size}") + + if not is_nxdppmodel: + model.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = getattr(dataloader, "dataset", None) + + if args.past_index >= 0: + self._past = None + + # Initialize containers + # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) + losses_host = None + preds_host = None + labels_host = None + inputs_host = None + + # losses/preds/labels on CPU (final containers) + all_losses = None + all_preds = None + all_labels = None + all_inputs = None + # Will be useful when we have an iterable dataset so don't know its length. + + observed_num_examples = 0 + # Main evaluation loop + for step, inputs in enumerate(dataloader): + # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + # For batch samplers, batch_size is not known by the dataloader in advance. + if batch_size is None: + batch_size = observed_batch_size + + if is_nxdppmodel and observed_batch_size % model.num_microbatches != 0: + if xm.get_local_ordinal() == 0: + logger.warning( + "Skipping the evaluation step because the pipeline number of microbatches " + f"({model.num_microbatches}) does not divide the batch size ({observed_batch_size})." 
+ ) + continue + + # Prediction step + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + main_input_name = getattr(model, "main_input_name", "input_ids") + inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None + + xm.mark_step() + + # Update containers on host + if loss is not None: + losses = self.accelerator.gather_for_metrics((loss.repeat(batch_size))) + losses_host = losses if losses_host is None else nested_concat(losses_host, losses, padding_index=-100) + if labels is not None: + labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) + if inputs_decode is not None: + inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) + inputs_decode = self.accelerator.gather_for_metrics((inputs_decode)) + inputs_host = ( + inputs_decode + if inputs_host is None + else nested_concat(inputs_host, inputs_decode, padding_index=-100) + ) + if logits is not None: + logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) + if self.preprocess_logits_for_metrics is not None: + logits = self.preprocess_logits_for_metrics(logits, labels) + logits = self.accelerator.gather_for_metrics((logits)) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + + if labels is not None: + labels = self.accelerator.gather_for_metrics((labels)) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if ( + args.eval_accumulation_steps is not None + and (step + 1) % args.eval_accumulation_steps == 0 + and (self.accelerator.sync_gradients or version.parse(accelerate_version) > version.parse("0.20.3")) + ): + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode + if all_inputs is None + else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = ( + labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + ) + + # Set back to None to begin a new accumulation + losses_host, preds_host, inputs_host, labels_host = None, None, None, None + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) + if 
labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + # Number of samples + if has_length(eval_dataset): + num_samples = len(eval_dataset) + # The instance check is weird and does not actually check for the type, but whether the dataset has the right + # methods. Therefore we need to make sure it also has the attribute. + elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0: + num_samples = eval_dataset.num_examples + else: + if has_length(dataloader): + num_samples = self.num_examples(dataloader) + else: # both len(dataloader.dataset) and len(dataloader) fail + num_samples = observed_num_examples + if num_samples == 0 and observed_num_examples > 0: + num_samples = observed_num_examples + + # Metrics! + if self.compute_metrics is not None and all_preds is not None and all_labels is not None: + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) + ) + else: + metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if all_losses is not None: + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() + if hasattr(self, "jit_compilation_time"): + metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) class NeuronTrainer(AugmentTrainerForNeuronMixin, Trainer): diff --git a/optimum/neuron/training_args.py b/optimum/neuron/training_args.py index 415148057..33c6a60ff 100644 --- a/optimum/neuron/training_args.py +++ b/optimum/neuron/training_args.py @@ -36,7 +36,7 @@ from ..utils import check_if_transformers_greater, logging from .accelerate import NeuronAcceleratorState, NeuronPartialState -from .accelerate.utils import TensorParallelismPlugin, patch_accelerate_is_tpu_available +from .accelerate.utils import ModelParallelismPlugin, patch_accelerate_is_tpu_available from .utils import is_accelerate_available, is_torch_xla_available from .utils.training_utils import TRANSFORMERS_MIN_VERSION_FOR_XLA_FSDP @@ -80,6 +80,14 @@ class NeuronTrainingArgumentsMixin: "help": "Specify the level of optimization the Neuron compiler should perform.", }, ) + pipeline_parallel_size: int = field( + default=1, + metadata={"help": "The number of pipeline parallel replicas."}, + ) + pipeline_parallel_num_microbatches: int = field( + default=-1, + metadata={"help": "The number of microbatches used for pipeline execution."}, + ) def __post_init__(self): # Patches accelerate.utils.imports.is_tpu_available to match `is_torch_xla_available` @@ -120,10 +128,27 @@ def __post_init__(self): checkpoint = get_last_checkpoint(self.output_dir) resume_from_checkpoint = checkpoint - self.tp_plugin = TensorParallelismPlugin( + if self.pipeline_parallel_size > 1: + if self.pipeline_parallel_num_microbatches == -1: + self.pipeline_parallel_num_microbatches = self.per_device_train_batch_size + if self.per_device_train_batch_size % 
self.pipeline_parallel_num_microbatches != 0: + raise ValueError( + f"The number of pipeline microbatches ({self.pipeline_parallel_num_microbatches}) divide the total " + f"per-device train batch size ({self.per_device_train_batch_size})." + ) + if self.per_device_eval_batch_size % self.pipeline_parallel_num_microbatches != 0: + raise ValueError( + f"The number of pipeline microbatches ({self.pipeline_parallel_num_microbatches}) divide the total " + f"per-device eval batch size ({self.per_device_eval_batch_size})." + ) + + self.mp_plugin = ModelParallelismPlugin( self.tensor_parallel_size, - not self.disable_embedding_parallelization, + parallelize_embeddings=not self.disable_embedding_parallelization, sequence_parallel_enabled=not self.disable_sequence_parallel, + pipeline_parallel_size=self.pipeline_parallel_size, + pipeline_parallel_num_microbatches=self.pipeline_parallel_num_microbatches, + pipeline_parallel_use_zero1_optimizer=self.zero_1, checkpoint_dir=resume_from_checkpoint, ) super().__post_init__() @@ -228,13 +253,13 @@ def _setup_devices(self) -> "torch.device": @property def place_model_on_device(self): - return not self.tp_plugin.should_parallelize and super().place_model_on_device + return not self.mp_plugin.should_parallelize and super().place_model_on_device @property def world_size(self): divisor = 1 - if self.tp_plugin.should_parallelize: - divisor = self.tp_plugin.tensor_parallel_size + if self.mp_plugin.should_parallelize: + divisor = self.mp_plugin.tensor_parallel_size * self.mp_plugin.pipeline_parallel_size return super().world_size // divisor diff --git a/optimum/neuron/utils/cache_utils.py b/optimum/neuron/utils/cache_utils.py index 698dde5e0..d68aa4642 100644 --- a/optimum/neuron/utils/cache_utils.py +++ b/optimum/neuron/utils/cache_utils.py @@ -21,7 +21,6 @@ import os import re import shutil -import subprocess import tempfile from dataclasses import InitVar, asdict, dataclass, field from pathlib import Path @@ -33,9 +32,9 @@ from huggingface_hub import ( CommitOperationAdd, HfApi, - HfFolder, RepoUrl, create_repo, + get_token, hf_hub_download, whoami, ) @@ -45,8 +44,8 @@ from ...utils import logging from ...utils.logging import warn_once -from .constant import NEURON_BINARIES_PATH from .misc import is_main_worker, string_to_bool +from .require_utils import requires_neuronx_distributed from .version_utils import get_neuronxcc_version @@ -137,7 +136,7 @@ def is_private_repo(repo_id: str) -> bool: if _DISABLE_IS_PRIVATE_REPO_CHECK: return False try: - HfApi().model_info(repo_id=repo_id, token=HfFolder.get_token()) + HfApi().model_info(repo_id=repo_id, token=get_token()) private_to_user = False except RepositoryNotFoundError: private_to_user = True @@ -260,15 +259,12 @@ def set_neuron_cache_path(neuron_cache_path: Union[str, Path], ignore_no_cache: def get_num_neuron_cores() -> int: - path = os.environ["PATH"] - if NEURON_BINARIES_PATH not in path: - path = f"{NEURON_BINARIES_PATH}:{path}" - os.environ["PATH"] = path - proc = subprocess.Popen(["neuron-ls", "-j"], stdout=subprocess.PIPE) - stdout, _ = proc.communicate() - stdout = stdout.decode("utf-8") - json_stdout = json.loads(stdout) - return sum(neuron_device_info["nc_count"] for neuron_device_info in json_stdout) + neuron_devices_path = Path("/sys/class/neuron_device/") + if not neuron_devices_path.is_dir(): + num_cores = 0 + else: + num_cores = len(list(neuron_devices_path.iterdir())) * 2 + return num_cores def get_num_neuron_cores_used() -> int: @@ -656,6 +652,9 @@ class NeuronHash: tensor_parallel_size: 
Union[int, _UnspecifiedHashAttribute] = field( default_factory=_UnspecifiedHashAttribute.with_args(min_optimum_neuron_version="0.0.8", default=1) ) + pipeline_parallel_size: Union[int, _UnspecifiedHashAttribute] = field( + default_factory=_UnspecifiedHashAttribute.with_args(min_optimum_neuron_version="0.0.17", default=1) + ) _model_name_or_path: Optional[str] = None _is_private: Optional[bool] = None _model_type: Optional[str] = None @@ -739,11 +738,19 @@ def compute_sha512_hash(self, *buffers: bytes) -> str: hash_.update(buffer) return hash_.hexdigest() + @requires_neuronx_distributed def compute_hash(self, model: Optional["PreTrainedModel"] = None) -> Tuple[str, str]: if self._hash.is_empty: if model is None: raise ValueError("A model must be specified the first time the hash is computed.") - model_hash = self.compute_sha512_hash(self.state_dict_to_bytes(model.state_dict())) + + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model, NxDPPModel): + state_dict = model.local_state_dict() + else: + state_dict = model.state_dict() + model_hash = self.compute_sha512_hash(self.state_dict_to_bytes(state_dict)) hash_dict = asdict(self) hash_dict["model"] = model_hash @@ -756,6 +763,9 @@ def compute_hash(self, model: Optional["PreTrainedModel"] = None) -> Tuple[str, self._insert_potential_unspecified_hash_attribute( "tensor_parallel_size", self.tensor_parallel_size, hash_dict ) + self._insert_potential_unspecified_hash_attribute( + "pipeline_parallel_size", self.tensor_parallel_size, hash_dict + ) self._insert_potential_unspecified_hash_attribute("fsdp", self.fsdp, hash_dict) hash_dict["data_type"] = str(hash_dict["data_type"]).split(".")[1] @@ -817,7 +827,7 @@ def get_cached_model_on_the_hub(neuron_hash: NeuronHash) -> Optional[CachedModel else: revision = "main" try: - repo_filenames = HfApi().list_repo_files(repo_id, revision=revision, token=HfFolder.get_token()) + repo_filenames = HfApi().list_repo_files(repo_id, revision=revision, token=get_token()) except Exception: continue model_files_on_the_hub = [] @@ -974,7 +984,7 @@ def push_to_cache_on_hub( path_in_repo = Path().joinpath(*path_in_repo.parts[1:]) path_in_repo = neuron_hash.cache_path / path_in_repo - repo_filenames = HfApi().list_repo_files(cache_repo_id, token=HfFolder.get_token()) + repo_filenames = HfApi().list_repo_files(cache_repo_id, token=get_token()) path_in_repo_str = path_in_repo.as_posix() if local_cache_dir_or_file.is_dir(): exists = any(filename.startswith(path_in_repo_str) for filename in repo_filenames) diff --git a/optimum/neuron/utils/patching.py b/optimum/neuron/utils/patching.py index b806997dd..3311352a0 100644 --- a/optimum/neuron/utils/patching.py +++ b/optimum/neuron/utils/patching.py @@ -37,20 +37,36 @@ def __init__( self.patching_specs = self.process_patching_specs( patching_specs, ignore_missing_attributes=ignore_missing_attributes ) + self.already_patched = False @abstractmethod def process_patching_specs( self, patching_specs: Optional[List[Tuple[Any, Any]]] = None, ignore_missing_attributes: bool = False - ) -> List[Tuple[Any, str, Any, Any]]: + ) -> List[Tuple[Any, str, Any, Any, bool]]: pass - def __enter__(self): - for module, attribute_name, _, patch in self.patching_specs: + def patch(self): + if self.already_patched: + return + for module, attribute_name, _, patch, _ in self.patching_specs: setattr(module, attribute_name, patch) + self.already_patched = True + + def restore(self): + if not self.already_patched: + return + for module, attribute_name, orig, _, 
should_delete_attribute_at_restore in self.patching_specs: + if should_delete_attribute_at_restore: + delattr(module, attribute_name) + else: + setattr(module, attribute_name, orig) + self.already_patched = False + + def __enter__(self): + return self.patch() def __exit__(self, exc_type, exc_value, traceback): - for module, attribute_name, _, patch in self.patching_specs: - setattr(module, attribute_name, patch) + return self.restore() class DynamicPatch: @@ -103,7 +119,7 @@ def process_patching_specs( ) if isinstance(patch, DynamicPatch): patch = patch(attribute) - proccessed_patching_specs.append((module, attribute_name, attribute, patch)) + proccessed_patching_specs.append((module, attribute_name, attribute, patch, not module_has_attr)) return proccessed_patching_specs @@ -144,7 +160,7 @@ def process_patching_specs( if inspect.ismethod(attribute): patch = patch.__get__(model) - proccessed_patching_specs.append((module, attribute_name, attribute, patch)) + proccessed_patching_specs.append((module, attribute_name, attribute, patch, not module_has_attr)) return proccessed_patching_specs diff --git a/optimum/neuron/utils/runner.py b/optimum/neuron/utils/runner.py index e6790f98b..d738c6f67 100644 --- a/optimum/neuron/utils/runner.py +++ b/optimum/neuron/utils/runner.py @@ -28,7 +28,7 @@ import requests from huggingface_hub import ( HfApi, - HfFolder, + get_token, snapshot_download, ) from transformers import AutoConfig @@ -172,7 +172,7 @@ class ExampleRunner: ], }, "image-classification": { - "dataset_name": "beans", + "dataset_name": "mnist", "extra_command_line_arguments": [ "--remove_unused_columns false", "--ignore_mismatched_sizes", @@ -304,7 +304,7 @@ def install_requirements(self, requirements_filename: Union[str, Path]): self._installed_requirements = True def check_user_logged_in_and_cache_repo_is_set(self): - token = HfFolder.get_token() + token = get_token() if not token: raise RuntimeError( "You need to log in the Hugging Face Hub otherwise you will not be able to push anything. " @@ -333,7 +333,7 @@ def download_model_repo_and_override_config( if not config_overrides: return model_name_or_path - filenames = HfApi().list_repo_files(repo_id=model_name_or_path, token=HfFolder.get_token()) + filenames = HfApi().list_repo_files(repo_id=model_name_or_path, token=get_token()) safetensors_model_file_pattern = re.compile(r"\w+(-[0-9]*-of-[0-9]*)?\.safetensors") allow_patterns = ["*.json", "*.txt"] if any(re.match(safetensors_model_file_pattern, filename) for filename in filenames): @@ -380,6 +380,7 @@ def run( save_total_limit: int = -1, learning_rate: float = 1e-4, tensor_parallel_size: int = 1, + pipeline_parallel_size: int = 1, disable_embedding_parallelization: bool = False, zero_1: bool = False, output_dir: Optional[Union[Path, str]] = None, @@ -417,9 +418,14 @@ def run( self.install_requirements(script_path.parent / "requirements.txt") def compute_max_train_samples( - max_steps: int, num_cores: int, tensor_parallel_size: int, per_device_train_batch_size: int + max_steps: int, + num_cores: int, + tensor_parallel_size: int, + pipeline_parallel_size: int, + per_device_train_batch_size: int, ) -> int: - total_batch_size = (num_cores // tensor_parallel_size) * per_device_train_batch_size + number_of_cores_per_replicas = tensor_parallel_size * pipeline_parallel_size + total_batch_size = (num_cores // number_of_cores_per_replicas) * per_device_train_batch_size total_num_samples = max_steps * total_batch_size # Adding 10% more examples just to make sure. 
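+ # Illustrative numbers: num_cores=32, tensor_parallel_size=8 and pipeline_parallel_size=4 give
+ # 32 // (8 * 4) = 1 data-parallel replica; with per_device_train_batch_size=2 and max_steps=100,
+ # total_num_samples is 100 * 2 = 200, padded below to 220.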
return int(total_num_samples * 1.1) @@ -442,7 +448,9 @@ def compute_max_train_samples( if max_steps is not None: cmd.append(f"--max_steps {max_steps}") max_steps_idx = len(cmd) - 1 - max_train_samples = compute_max_train_samples(max_steps, num_cores, tensor_parallel_size, train_batch_size) + max_train_samples = compute_max_train_samples( + max_steps, num_cores, tensor_parallel_size, pipeline_parallel_size, train_batch_size + ) cmd.append(f"--max_train_samples {max_train_samples}") cmd.append("--do_train") @@ -469,6 +477,8 @@ def compute_max_train_samples( # Parallelism if tensor_parallel_size > 1: cmd.append(f"--tensor_parallel_size {tensor_parallel_size}") + if pipeline_parallel_size > 1: + cmd.append(f"--pipeline_parallel_size {pipeline_parallel_size}") if disable_embedding_parallelization: cmd.append("--disable_embedding_parallelization") if zero_1: diff --git a/optimum/neuron/utils/training_utils.py b/optimum/neuron/utils/training_utils.py index b08f6e6d9..113096237 100644 --- a/optimum/neuron/utils/training_utils.py +++ b/optimum/neuron/utils/training_utils.py @@ -286,7 +286,7 @@ def set_neuron_cc_optlevel_for_model(model: "PreTrainedModel", optlevel: str = " neuron_cc_flags = os.environ.get("NEURON_CC_FLAGS", "") match_ = re.search(r"-O[123]", neuron_cc_flags) if match_: - neuron_cc_flags = neuron_cc_flags[: match_.start(0)] + f"{optlevel}" + neuron_cc_flags[match_.end(1) + 1 :] + neuron_cc_flags = neuron_cc_flags[: match_.start(0)] + f"{optlevel}" + neuron_cc_flags[match_.end(0) + 1 :] else: neuron_cc_flags += f"{optlevel} " os.environ["NEURON_CC_FLAGS"] = neuron_cc_flags diff --git a/tests/cli/test_neuron_cache_cli.py b/tests/cli/test_neuron_cache_cli.py index 67f6dca1b..8b9a7640b 100644 --- a/tests/cli/test_neuron_cache_cli.py +++ b/tests/cli/test_neuron_cache_cli.py @@ -14,14 +14,17 @@ # limitations under the License. import os +import random +import string import subprocess +from pathlib import Path from tempfile import TemporaryDirectory from unittest import TestCase import torch from huggingface_hub import HfApi, create_repo, delete_repo from huggingface_hub.utils import RepositoryNotFoundError -from transformers import BertConfig, BertModel +from transformers import BertConfig, BertModel, BertTokenizer from transformers.testing_utils import is_staging_test from optimum.neuron.utils.cache_utils import ( @@ -39,6 +42,12 @@ from ..utils import StagingTestMixin +# Taken from https://pynative.com/python-generate-random-string/ +def get_random_string(length: int) -> str: + letters = string.ascii_lowercase + return "".join(random.choice(letters) for i in range(length)) + + @is_trainium_test @is_staging_test class TestNeuronCacheCLI(StagingTestMixin, TestCase): @@ -54,7 +63,6 @@ def setUp(self): def tearDown(self): super().tearDown() os.environ["HF_HOME"] = self._hf_home - try: delete_repo(self.default_repo_id, repo_type="model") except RepositoryNotFoundError: @@ -126,65 +134,86 @@ def test_optimum_neuron_cache_set(self): ) def test_optimum_neuron_cache_add(self): - os.environ["CUSTOM_CACHE_REPO"] = self.CUSTOM_CACHE_REPO - # TODO: activate those later. - # Without any sequence length, it should fail. 
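Stepping back to the `set_neuron_cc_optlevel_for_model` fix in `optimum/neuron/utils/training_utils.py` above, a small sketch shows why `match_.end(0)` is the right offset. It assumes `optlevel` holds the full flag (e.g. `-O2`), which is consistent with the `else` branch that appends it directly:

```python
import re

neuron_cc_flags = "--retry_failed_compilation -O1"
optlevel = "-O2"  # assumed format; the slice only relies on it being the replacement text

match_ = re.search(r"-O[123]", neuron_cc_flags)
assert match_ is not None
# The pattern has no capturing group, so `match_.end(1)` raises IndexError("no such group").
# `match_.end(0)` is the end of the whole "-O1" match, which is what the slice needs.
neuron_cc_flags = neuron_cc_flags[: match_.start(0)] + optlevel + neuron_cc_flags[match_.end(0) + 1 :]
print(neuron_cc_flags)  # --retry_failed_compilation -O2
```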
- # command = ( - # "optimum-cli neuron cache add -m bert-base-uncased --task text-classification --train_batch_size 16 " - # "--precision bf16 --num_cores 2" - # ).split() - # p = subprocess.Popen(command, stderr=PIPE) - # _, stderr = p.communicate() - # stderr = stderr.decode("utf-8") - # self.assertIn("either sequence_length or encoder_sequence and decoder_sequence_length", stderr) - - # Without both encoder and decoder sequence lengths, it should fail. - # command = ( - # "optimum-cli neuron cache add -m t5-small --task translation --train_batch_size 16 --precision bf16 " - # "--num_cores 2 --encoder_sequence_length 512" - # ).split() - # p = subprocess.Popen(command, stderr=PIPE) - # _, stderr = p.communicate() - # stderr = stderr.decode("utf-8") - # self.assertIn("Both the encoder_sequence and decoder_sequence_length", stderr) - - bert_model_name = "__DUMMY_OPTIMUM_USER__/tiny-random-BertModel-neuron" - - # With wrong precision value, it should fail. - command = ( - f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " - "--precision wrong --num_cores 2 --sequence_length 128" - ).split() - p = subprocess.Popen(command) - returncode = p.wait() - self.assertNotEqual(returncode, 0) - - # With wrong num_cores value, it should fail. - command = ( - f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " - "--precision bf16 --num_cores 999 --sequence_length 128" - ).split() - p = subprocess.Popen(command) - returncode = p.wait() - self.assertNotEqual(returncode, 0) - - # Non seq2seq model. - command = ( - f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " - "--precision bf16 --num_cores 2 --sequence_length 128" - ).split() - p = subprocess.Popen(command) - returncode = p.wait() - self.assertEqual(returncode, 0) - - # seq2seq model. - command = ( - f"optimum-cli neuron cache add -m {bert_model_name} --task translation --train_batch_size 1 --precision bf16 " - "--num_cores 2 --encoder_sequence_length 12 --decoder_sequence_length 12" - ).split() - p = subprocess.Popen(command) - returncode = p.wait() - self.assertEqual(returncode, 0) + with TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + os.environ["CUSTOM_CACHE_REPO"] = self.CUSTOM_CACHE_REPO + # TODO: activate those later. + # Without any sequence length, it should fail. + # command = ( + # "optimum-cli neuron cache add -m bert-base-uncased --task text-classification --train_batch_size 16 " + # "--precision bf16 --num_cores 2" + # ).split() + # p = subprocess.Popen(command, stderr=PIPE) + # _, stderr = p.communicate() + # stderr = stderr.decode("utf-8") + # self.assertIn("either sequence_length or encoder_sequence and decoder_sequence_length", stderr) + + # Without both encoder and decoder sequence lengths, it should fail. + # command = ( + # "optimum-cli neuron cache add -m t5-small --task translation --train_batch_size 16 --precision bf16 " + # "--num_cores 2 --encoder_sequence_length 512" + # ).split() + # p = subprocess.Popen(command, stderr=PIPE) + # _, stderr = p.communicate() + # stderr = stderr.decode("utf-8") + # self.assertIn("Both the encoder_sequence and decoder_sequence_length", stderr) + + # Create dummy BERT model. 
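+ # A deliberately tiny configuration (2 layers, 2 heads, 100-token vocab) keeps the dummy model cheap to create for these CLI runs.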
+ bert_model_name = tmpdir / "bert_model" + config = BertConfig() + + config.num_hidden_layers = 2 + config.num_attention_heads = 2 + config.vocab_size = 100 + + with open(tmpdir / "vocab.txt", "w") as fp: + fp.write("\n".join(get_random_string(random.randint(10, 20)))) + + tokenizer = BertTokenizer(tmpdir / "vocab.txt") + tokenizer.save_pretrained(bert_model_name) + + model = BertModel(config) + model.save_pretrained(bert_model_name) + + env = dict(os.environ) + env["OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK"] = "1" + + # With wrong precision value, it should fail. + command = ( + f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " + "--precision wrong --num_cores 2 --sequence_length 128" + ).split() + p = subprocess.Popen(command, env=env) + returncode = p.wait() + self.assertNotEqual(returncode, 0) + + # With wrong num_cores value, it should fail. + command = ( + f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " + "--precision bf16 --num_cores 999 --sequence_length 128" + ).split() + p = subprocess.Popen(command, env=env) + returncode = p.wait() + self.assertNotEqual(returncode, 0) + + # Non seq2seq model. + command = ( + f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " + "--precision bf16 --num_cores 2 --sequence_length 128" + ).split() + p = subprocess.Popen(command, env=env) + returncode = p.wait() + self.assertEqual(returncode, 0) + + # seq2seq model. + command = ( + f"optimum-cli neuron cache add -m {bert_model_name} --task translation --train_batch_size 1 --precision bf16 " + "--num_cores 2 --encoder_sequence_length 12 --decoder_sequence_length 12" + ).split() + p = subprocess.Popen(command, env=env) + returncode = p.wait() + self.assertEqual(returncode, 0) def test_optimum_neuron_cache_list(self): with TemporaryDirectory() as tmpdirname: diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py new file mode 100644 index 000000000..6efd9aa3a --- /dev/null +++ b/tests/distributed/conftest.py @@ -0,0 +1,43 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +# This hook is run before the default pytest_runtest_call +@pytest.hookimpl(tryfirst=True) +def pytest_runtest_call(item): + # We want to use our own launching function for distributed tests + if getattr(item.cls, "is_dist_test", False): + dist_test_class = item.cls() + dist_test_class(item._request) + item.runtest = lambda: True # Dummy function so test is not run twice + + +# We allow DistributedTest to reuse distributed environments. When the last +# test for a class is run, we want to make sure those distributed environments +# are destroyed. 
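+#
+# Illustrative usage (the class and test below are hypothetical, not part of this PR): a test opts in by
+# subclassing `DistributedTest` from tests/distributed/distributed.py, which sets `is_dist_test = True`
+# and lets the class declare its parallel layout, e.g.:
+#
+#     class TestMyParallelFeature(DistributedTest):
+#         world_size = 2
+#         tp_size = 2
+#         pp_size = 1
+#
+#         def test_something(self):
+#             ...
+#
+# `pytest_runtest_call` above then launches `world_size` Neuron processes for the test instead of
+# running it inline in the pytest process.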
+def pytest_runtest_teardown(item, nextitem): + if getattr(item.cls, "reuse_dist_env", False) and not nextitem: + dist_test_class = item.cls() + for num_procs, pool in dist_test_class._pool_cache.items(): + dist_test_class._close_pool(pool, num_procs, force=True) + + +@pytest.hookimpl(tryfirst=True) +def pytest_fixture_setup(fixturedef, request): + if getattr(fixturedef.func, "is_dist_fixture", False): + dist_fixture_class = fixturedef.func() + dist_fixture_class(request) diff --git a/tests/distributed/distributed.py b/tests/distributed/distributed.py new file mode 100644 index 000000000..690140cd1 --- /dev/null +++ b/tests/distributed/distributed.py @@ -0,0 +1,353 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Defines classes to enable running tests in a distributed setting.""" + +# The following code is copied and adapted from the DeepSpeed repo: +# https://github.com/microsoft/DeepSpeed/blob/master/tests/unit/common.py + +import inspect +import multiprocessing +import os +import socket +import time +import uuid +from abc import ABC, abstractmethod +from typing import List, Union + +import psutil +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from _pytest.fixtures import FixtureLookupError +from _pytest.outcomes import Skipped + +from optimum.neuron.utils.cache_utils import get_num_neuron_cores +from optimum.neuron.utils.import_utils import is_neuronx_distributed_available, is_torch_xla_available + + +if is_torch_xla_available(): + import torch_xla.distributed.xla_backend as xbn + +if is_neuronx_distributed_available(): + import neuronx_distributed + +TEST_TIMEOUT = 600 + + +def is_neuron_environment_available() -> bool: + return get_num_neuron_cores() > 0 + + +def get_xdist_worker_id(): + xdist_worker = os.environ.get("PYTEST_XDIST_WORKER", None) + if xdist_worker is not None: + xdist_worker_id = xdist_worker.replace("gw", "") + return int(xdist_worker_id) + return None + + +def get_master_port(base_port=29500, port_range_size=1000): + xdist_worker_id = get_xdist_worker_id() + if xdist_worker_id is not None: + # Make xdist workers use different port ranges to avoid race conditions + base_port += port_range_size * xdist_worker_id + + # Select first open port in range + port = base_port + max_port = base_port + port_range_size + sock = socket.socket() + while port < max_port: + try: + sock.bind(("", port)) + sock.close() + return str(port) + except OSError: + port += 1 + raise IOError("no free ports") + + +class DistributedExec(ABC): + """ + Base class for distributed execution of functions/methods. Contains common + methods needed for DistributedTest and DistributedFixture (not included in this file). 
+ """ + + world_size: Union[int, List[int]] = 2 + tp_size: int = 1 + pp_size: int = 1 + backend: str = "xla" + init_distributed: bool = True + set_dist_env: bool = True + requires_neuron_environment: bool = True + reuse_dist_env: bool = False + _pool_cache = {} + exec_timeout: int = TEST_TIMEOUT + + @abstractmethod + def run(self): + ... + + def __call__(self, request=None): + self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) + world_size = self.world_size + if self.requires_neuron_environment and not is_neuron_environment_available(): + pytest.skip("Only supported in a Neuron environment.") + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + self._launch_procs(procs, self.tp_size, self.pp_size) + + def _get_fixture_kwargs(self, request, func): + if not request: + return {} + # Grab fixture / parametrize kwargs from pytest request object + fixture_kwargs = {} + params = inspect.getfullargspec(func).args + params.remove("self") + for p in params: + try: + fixture_kwargs[p] = request.getfixturevalue(p) + except FixtureLookupError: + pass # test methods can have kwargs that are not fixtures + return fixture_kwargs + + def _launch_procs(self, num_procs, tp_size, pp_size): + if not is_torch_xla_available() or not is_neuronx_distributed_available(): + raise RuntimeError( + "The `torch_xla` and `neuronx_distributed` packages are required to run a distributed test." + ) + + # Verify we have enough accelerator devices to run this test + num_cores = get_num_neuron_cores() + if 0 < num_cores < num_procs: + pytest.skip( + f"Skipping test because not enough Neuron cores are available: {num_procs} required, {num_cores} " + "available." + ) + + # Set start method to `forkserver` (or `fork`) + mp.set_start_method("forkserver", force=True) + os.environ["TORCHELASTIC_RUN_ID"] = str(uuid.uuid4()) + + # Create process pool or use cached one + master_port = None + if self.reuse_dist_env: + if num_procs not in self._pool_cache: + self._pool_cache[num_procs] = mp.Pool(processes=num_procs) + master_port = get_master_port() + pool = self._pool_cache[num_procs] + else: + pool = mp.Pool(processes=num_procs) + master_port = get_master_port() + + # Run the test + args = [(local_rank, num_procs, master_port, tp_size, pp_size) for local_rank in range(num_procs)] + skip_msgs_async = pool.starmap_async(self._dist_run, args) + + skip_msgs = "" # Otherwise the linter complains. + try: + skip_msgs = skip_msgs_async.get(self.exec_timeout) + except mp.TimeoutError: + # Shortcut to exit pytest in the case of a hanged test. 
This + # usually means an environment error and the rest of tests will + # hang (causing super long unit test runtimes) + pytest.exit("Test hanged, exiting", returncode=0) + except Exception as e: + self._close_pool(pool, num_procs) + self._terminate_xrt_server() + raise e + finally: + # Tear down distributed environment and close process pools + self._close_pool(pool, num_procs) + self._terminate_xrt_server() + + # If we skipped a test, propagate that to this process + if any(skip_msgs): + assert len(set(skip_msgs)) == 1, "Multiple different skip messages received" + pytest.skip(skip_msgs[0]) + + def _dist_run(self, local_rank, num_procs, master_port, tp_size, pp_size): + skip_msg = "" + if not dist.is_initialized(): + """Initializes communication and executes the user function.""" + if self.set_dist_env: + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(master_port) + # Unit tests do not support multi-node so local_rank == global rank + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["RANK"] = str(local_rank) + os.environ["LOCAL_SIZE"] = str(num_procs) + os.environ["WORLD_SIZE"] = str(num_procs) + os.environ["LOCAL_WORLD_SIZE"] = str(num_procs) + # Unit tests do not support multi-node so there is only one group in our case + os.environ["GROUP_RANK"] = "0" + + if self.init_distributed: + dist.init_process_group(backend=self.backend, rank=local_rank, world_size=num_procs) + if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): + raise AssertionError("Failed to initialize torch.distributed process group using XLA backend.") + + # Intializing NxD. + neuronx_distributed.parallel_layers.parallel_state.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=pp_size, + ) + try: + self.run(**self._fixture_kwargs) + except BaseException as e: + if isinstance(e, Skipped): + skip_msg = e.msg + else: + raise e + + return skip_msg + + def _dist_destroy(self): + if (dist is not None) and dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() + + def _close_pool(self, pool, num_procs, force=False): + if force or not self.reuse_dist_env: + try: + _ = pool.starmap(self._dist_destroy, [() for _ in range(num_procs)]) + pool.close() + pool.join() + except ValueError: + pass + + def _terminate_xrt_server(self): + xrt_server_str = "torch_neuronx.distributed._xrt_run_server" + startmethod = mp.get_start_method(allow_none=True) + # Rules: + # - `startmethod is None`: the XRT server tracks pytest's PID. + # - `startmethod="spawn"`: the parent process of the pool's processes is pytest, so the XRT server tracks + # pytest's PID. + # - `startmethod="fork"`: same as `startmethod="spawn"`. + # - `startmethod="forkserver"`: the parent process of the pool's processes is the forkserver, so the XRT server tracks + # the forkserver's PID. + if startmethod == "forkserver": + target_pid = multiprocessing.forkserver._forkserver._forkserver_pid + else: + target_pid = os.getpid() + + for p in psutil.process_iter(): + try: + if "python3" in p.name() and len(p.cmdline()) == 7: + cmdline = p.cmdline() + if cmdline[2] == xrt_server_str and cmdline[-1] == str(target_pid): + p.terminate() + except psutil.ZombieProcess: + continue + + +class DistributedTest(DistributedExec): + """ + Implementation for running pytest with distributed execution. + """ + + is_dist_test = True + + def early_skip(self, fixtures_kwargs): + """ + Override to enable early test skipping (before processes creation). 
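+
+        Subclasses receive the resolved fixture kwargs and may call `pytest.skip` from here;
+        see `TestModelParallelization.early_skip` in tests/distributed/test_model_parallelization.py
+        for a concrete use.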
+        """
+        pass
+
+    # Temporary directory that is shared among test methods in a class
+    @pytest.fixture(autouse=True, scope="class")
+    def class_tmpdir(self, tmpdir_factory):
+        fn = tmpdir_factory.mktemp(self.__class__.__name__)
+        return fn
+
+    def run(self, **fixture_kwargs):
+        self._current_test(**fixture_kwargs)
+
+    def __call__(self, request):
+        self._current_test = self._get_current_test_func(request)
+        self._fixture_kwargs = self._get_fixture_kwargs(request, self._current_test)
+
+        if self.requires_neuron_environment and not is_neuron_environment_available():
+            pytest.skip("Only supported in a Neuron environment.")
+
+        self.early_skip(self._fixture_kwargs)
+
+        world_size = tp_size = pp_size = parallel_sizes = None
+
+        # Catch world_size, tp_size or pp_size override via pytest mark.
+        def try_to_override_via_pytest_mark(mark, name):
+            if mark.name == name:
+                return mark.args[0]
+            return None
+
+        for mark in getattr(request.function, "pytestmark", []):
+            world_size = try_to_override_via_pytest_mark(mark, "world_size")
+            tp_size = try_to_override_via_pytest_mark(mark, "tp_size")
+            pp_size = try_to_override_via_pytest_mark(mark, "pp_size")
+            parallel_sizes = try_to_override_via_pytest_mark(mark, "parallel_size")
+
+        # Catch world_size, tp_size or pp_size override via fixture.
+        def try_to_override_via_fixture(name, current_value):
+            if name in self._fixture_kwargs:
+                if current_value is not None:
+                    raise ValueError(f"It is not possible to override {name} both via pytest.mark and fixtures.")
+                return self._fixture_kwargs[name]
+            return None
+
+        world_size = try_to_override_via_fixture("world_size", world_size)
+        tp_size = try_to_override_via_fixture("tp_size", tp_size)
+        pp_size = try_to_override_via_fixture("pp_size", pp_size)
+        parallel_sizes = try_to_override_via_fixture("parallel_sizes", parallel_sizes)
+
+        if parallel_sizes is not None:
+            if not all(size is None for size in [world_size, tp_size, pp_size]):
+                raise ValueError("Specify either `parallel_sizes` or the individual sizes (world_size, tp_size, pp_size), not both.")
+            world_size, tp_size, pp_size = parallel_sizes
+
+        if world_size is None:
+            world_size = self.world_size
+        if tp_size is None:
+            tp_size = self.tp_size
+        if pp_size is None:
+            pp_size = self.pp_size
+
+        sizes = [world_size, tp_size, pp_size]
+        if all(isinstance(size, int) for size in sizes):
+            world_size = [world_size]
+            tp_size = [tp_size]
+            pp_size = [pp_size]
+        else:
+            lengths = [len(size) for size in sizes if not isinstance(size, int)]
+            if len(set(lengths)) != 1:
+                raise ValueError(
+                    "When providing multiple values for either world_size, tp_size or pp_size, you must provide the "
+                    f"same number of values. Here: {', '.join(map(str, lengths))}."
+ ) + if not all(isinstance(size, (tuple, list)) for size in sizes): + length = lengths[0] + world_size = [world_size] * length if isinstance(world_size, int) else world_size + tp_size = [tp_size] * length if isinstance(tp_size, int) else tp_size + pp_size = [pp_size] * length if isinstance(pp_size, int) else pp_size + + for sizes in zip(world_size, tp_size, pp_size): + self._launch_procs(*sizes) + time.sleep(0.5) + + def _get_current_test_func(self, request): + # DistributedTest subclasses may have multiple test methods + func_name = request.function.__name__ + return getattr(self, func_name) diff --git a/tests/distributed/model_parallel_test_template.txt b/tests/distributed/model_parallel_test_template.txt deleted file mode 100644 index ad6f8e530..000000000 --- a/tests/distributed/model_parallel_test_template.txt +++ /dev/null @@ -1,157 +0,0 @@ -# This is a template file for testing model parallelization. - -import os -from contextlib import nullcontext -from inspect import signature - -import torch -import neuronx_distributed -from neuronx_distributed import parallel_layers -from neuronx_distributed.utils.model_utils import move_model_to_device -import torch_xla.core.xla_model as xm - -from transformers import AutoConfig, AutoTokenizer, {model_class} -from transformers.trainer_utils import set_seed - -import optimum -from optimum.neuron.utils.training_utils import set_neuron_cc_optlevel_for_model -from optimum.neuron.distributed import ParallelizersManager, lazy_load_for_parallelism - -from utils import gather_along_dim, generate_dummy_labels, create_static_seed_patcher - - -if os.environ.get("TORCHELASTIC_RUN_ID"): - import torch_xla.distributed.xla_backend as xbn - - if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): - torch.distributed.init_process_group(backend="xla") - -SEED = 42 - -from_config = os.environ["from_config"] == "true" -lazy_load = os.environ["lazy_load"] == "true" -is_parallel = os.environ["is_parallel"] == "true" -config_overwrite = os.environ.get("config_overwrite", "") -parallelize_embeddings = is_parallel and os.environ["parallelize_embeddings"] == "true" -sequence_parallel_enabled = os.environ["sequence_parallel_enabled"] == "true" -computing_loss_is_supported = os.environ["computing_loss_is_supported"] == "true" - -# This is required to prevent `parallel_cross_entropy` to mutate the logits (which would make them not comparable). 
-if is_parallel and parallelize_embeddings: - optimum.neuron.distributed.parallel_layers._PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT = True - -# Initialize TP -if is_parallel: - neuronx_distributed.parallel_layers.parallel_state.initialize_model_parallel(tensor_model_parallel_size={tp_size}) - - -config = AutoConfig.from_pretrained("{model_name_or_path}") -config_overwrite = config_overwrite.split(",") -for overwrite_info in config_overwrite: - if overwrite_info == "": - continue - attr_name, attr_value = overwrite_info.split("=") - attr_type = type(getattr(config, attr_name)) - setattr(config, attr_name, attr_type(attr_value)) - -if getattr(config, "problem_type", None) is None: - config.problem_type = "single_label_classification" - -if xm.get_ordinal() == 0: - print(config) - -preprocessor = AutoTokenizer.from_pretrained("{model_name_or_path}") - -inputs = preprocessor("This is a test to check that TP is working.", return_tensors="pt") - -if sequence_parallel_enabled: - for name, tensor in inputs.items(): - if tensor.shape[1] % {tp_size} != 0: - tensor = torch.nn.functional.pad( - tensor, pad=(0, tensor.shape[1] % {tp_size}), value=1, - ) - inputs[name] = tensor - -def load_model_with_seed(seed: int, from_config: bool): - set_seed(seed) - if from_config: - model = {model_class}(config) - else: - tp_size = {tp_size} if is_parallel else 1 - ctx = lazy_load_for_parallelism(tensor_parallel_size=tp_size) if lazy_load else nullcontext() - with ctx: - model = {model_class}.from_pretrained("{model_name_or_path}", config=config, ignore_mismatched_sizes=True) - return model - -static_seed_patcher = create_static_seed_patcher({model_class}, SEED) -with static_seed_patcher: - model = load_model_with_seed(SEED, from_config) - - set_neuron_cc_optlevel_for_model(model) - - vocab_size = getattr(model.config, "vocab_size", None) - - if is_parallel: - model = ParallelizersManager.parallelizer_for_model(model).parallelize( - model, - parallelize_embeddings=parallelize_embeddings, - sequence_parallel_enabled=sequence_parallel_enabled, - ) - filename = "parallel.bin" - else: - filename = "original.bin" - -move_model_to_device(model, "xla") -model = model.eval() - -xla_inputs = dict() -sig = signature(model.forward) -for k, v in inputs.items(): - if k not in sig.parameters: - continue - xla_inputs[k] = v.to("xla") - decoder_input_name = "decoder_" + k - if model.config.is_encoder_decoder and decoder_input_name in sig.parameters: - xla_inputs[decoder_input_name] = v.to("xla") - -# We take the shape of the first input to "predict" the shape of the labels. -# Might not work for every tasks. -shape = list(xla_inputs.values())[0].shape - -if computing_loss_is_supported: - xla_inputs.update(generate_dummy_labels(model, shape, vocab_size=vocab_size, device="xla", seed=SEED)) - -model_outputs = model(**xla_inputs, return_dict=True) -xm.mark_step() - -axis_to_gather = dict() -axis_to_gather["default"] = -1 -axis_to_gather["past_key_values"] = 1 - -def gather_output(output, gather_dim): - if isinstance(output, (tuple, list, set)): - output_type = type(output) - gathered_output = [] - for t in output: - gathered_output.append(gather_output(t, gather_dim)) - result = output_type(gathered_output) - else: - result = gather_along_dim(output, gather_dim) - return result - -if is_parallel: - # Because of parallelism (embeddings and sequence parallelism), some outputs need to be gathered. - # Since it is not possible to generically know which one, we save both the "regular" output and the gathered - # version of it. 
We then compare both of them to the original output and fail if both do not match. - gathered_model_outputs = dict() - for name, output in model_outputs.items(): - gathered_model_outputs[name] = output - if name == "loss" or output is None: - gathered_output = output - else: - gathered_output = gather_output(output, axis_to_gather.get(name, axis_to_gather["default"])) - gathered_output_name = "gathered_" + name - gathered_model_outputs[gathered_output_name] = gathered_output - model_outputs = gathered_model_outputs - -xm.save(model_outputs, "{output_path}" + "/" + filename) diff --git a/tests/distributed/test_common.py b/tests/distributed/test_common.py new file mode 100644 index 000000000..4cc99a741 --- /dev/null +++ b/tests/distributed/test_common.py @@ -0,0 +1,415 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""General tests related to distributed training.""" + +from pathlib import Path +from typing import TYPE_CHECKING, Dict + +import pytest +import safetensors +import torch +from transformers import LlamaForCausalLM + +from optimum.neuron.accelerate.optimizer import NeuronAcceleratedOptimizer +from optimum.neuron.accelerate.utils.dataclasses import NeuronDistributedType +from optimum.neuron.distributed.utils import ( + TENSOR_PARALLEL_SHARDS_DIR_NAME, + make_optimizer_constructor_lazy, +) +from optimum.neuron.utils.import_utils import ( + is_neuronx_distributed_available, + is_torch_xla_available, +) +from optimum.neuron.utils.testing_utils import is_trainium_test + +from .distributed import DistributedTest +from .utils import create_accelerator_for_mp, create_static_seed_patcher, get_model, get_model_inputs + + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + +if is_neuronx_distributed_available(): + from neuronx_distributed.parallel_layers.parallel_state import ( + get_data_parallel_rank, + get_pipeline_model_parallel_rank, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + ) + from neuronx_distributed.parallel_layers.utils import move_all_tensor_to_cpu + from neuronx_distributed.pipeline import NxDPPModel + from neuronx_distributed.utils.model_utils import move_model_to_device + +if TYPE_CHECKING: + from transformers import PreTrainedModel + +MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-random" + + +def get_tiny_llama_model( + tp_size: int = 1, + pp_size: int = 1, + lazy_load: bool = False, + from_config: bool = False, + use_static_seed_patcher: bool = False, + add_random_noise: bool = False, +) -> "PreTrainedModel": + return get_model( + LlamaForCausalLM, + MODEL_NAME, + tp_size=tp_size, + pp_size=pp_size, + lazy_load=lazy_load, + from_config=from_config, + use_static_seed_patcher=use_static_seed_patcher, + add_random_noise=add_random_noise, + ) + + +def get_optimizer(model: torch.nn.Module, lazy: bool = False, with_groups: bool = True) -> torch.optim.Optimizer: + adam_cls = torch.optim.AdamW + if lazy: + adam_cls = 
make_optimizer_constructor_lazy(adam_cls) + + if with_groups: + groups = [ + {"params": (p for idx, p in enumerate(model.parameters()) if idx % 2 == 0), "lr": 1e-2}, + {"params": (p for idx, p in enumerate(model.parameters()) if idx % 2 == 1), "lr": 1e-6}, + ] + else: + groups = model.parameters() + + return adam_cls(groups) + + +def move_params_to_cpu(parameters): + parameters = list(parameters) + xm.mark_step() + # `move_all_tensor_to_cpu` only selects `torch.Tensor`, so we need to move the parameters' data. + cpu_params = move_all_tensor_to_cpu([p.data for p in parameters]) + return cpu_params + + +@is_trainium_test +class TestCommonDistributed(DistributedTest): + # TODO: enable dp=4,tp=pp=2 when working on the multi-node training PR. + @pytest.fixture( + scope="class", + params=[[2, 1, 1], [2, 2, 1], [2, 1, 2]], + ids=["dp=2", "tp=2", "pp=2"], + ) + def parallel_sizes(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["no_lazy_load", "lazy_load"]) + def lazy_load(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["from_pretrained", "from_config"]) + def from_config(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["no_lazy_optimizer", "lazy_optimizer"]) + def lazy_optimizer(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["without_groups", "with_groups"]) + def with_groups(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["no_zero_1", "zero_1"]) + def zero_1(self, request): + return request.param + + @pytest.fixture(scope="class", params=[1, 12], ids=["no_grad_acc", "grad_acc=12"]) + def gradient_accumulation_steps(self, request): + return request.param + + @pytest.fixture(scope="class", params=[None, 0.01], ids=["no_clip_grad_norm", "clip_grad_norm"]) + def max_grad_norm(self, request): + return request.param + + def test_optimizer_parameters_match_model_parameters( + self, lazy_load, lazy_optimizer, with_groups, zero_1, parallel_sizes + ): + num_workers, tp_size, pp_size = parallel_sizes + dp_size = num_workers // (tp_size * pp_size) + if dp_size == 1 and zero_1: + pytest.skip("zero_1 needs to be tested only for dp_size > 1") + + model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, lazy_load=lazy_load) + optimizer = get_optimizer(model, lazy_optimizer, with_groups) + + accelerator = create_accelerator_for_mp(tp_size, pp_size, zero_1=zero_1) + if tp_size > 1 or pp_size > 1: + assert accelerator.state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM + + model = accelerator.prepare(model) + + # Under DDP only setting, the optimizer needs to be created after the model has been moved. 
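+        # (Presumably because preparing the model moves it to the XLA device and can re-create
+        # its parameter tensors, an optimizer built beforehand would otherwise reference
+        # parameters the prepared model no longer owns.)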
+ if tp_size == 1 and pp_size == 1: + optimizer = get_optimizer(model, lazy_optimizer, with_groups) + + optimizer = accelerator.prepare(optimizer) + + assert isinstance(optimizer, NeuronAcceleratedOptimizer) + + if isinstance(model, NxDPPModel): + model_parameters = set(model.local_parameters()) + else: + model_parameters = set(model.parameters()) + optimizer_parameters = {p for group in optimizer.param_groups for p in group["params"]} + + assert model_parameters == optimizer_parameters + + def test_optimizer_step(self, zero_1, gradient_accumulation_steps, max_grad_norm, parallel_sizes): + num_workers, tp_size, pp_size = parallel_sizes + dp_size = num_workers // (tp_size * pp_size) + if dp_size == 1 and zero_1: + pytest.skip("zero_1 needs to be tested only for dp_size > 1") + + # TODO: investigate that with the AWS team to find a solution. + if dp_size > 1 and zero_1 and max_grad_norm is not None: + pytest.skip("Gradient clipping seems to not work properly with ZeRO-1.") + + model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, use_static_seed_patcher=True) + + if tp_size == pp_size == 1: + move_model_to_device(model, xm.xla_device()) + + optimizer = get_optimizer(model, with_groups=False) + + accelerator = create_accelerator_for_mp( + tp_size, pp_size, zero_1=zero_1, gradient_accumulation_steps=gradient_accumulation_steps + ) + + model, optimizer = accelerator.prepare(model, optimizer) + assert isinstance(optimizer, NeuronAcceleratedOptimizer) + + inputs = get_model_inputs(model, MODEL_NAME) + + def move_grads_to_cpu(parameters): + grads = [p.grad for p in parameters] + grads = move_all_tensor_to_cpu(grads) + return grads + + if pp_size == 1: + inputs = {k: v.to(xm.xla_device()) for k, v in inputs.items()} + + current_parameters = move_params_to_cpu( + model.local_parameters() if isinstance(model, NxDPPModel) else model.parameters() + ) + + for step in range(int(1.5 * gradient_accumulation_steps)): + is_optimizer_update_step = (step + 1) % gradient_accumulation_steps == 0 + with accelerator.accumulate(model): + if pp_size > 1: + orig_parameters = current_parameters + loss = model.run_train(**inputs) + xm.mark_step() + + if max_grad_norm is not None: + accelerator.clip_grad_norm_(model.local_parameters(), max_norm=max_grad_norm, norm_type=2) + + # Checking that at least some of the parameters have a gradient. + grads_on_cpu = move_grads_to_cpu(model.local_parameters()) + assert any(torch.all(grad != 0) for grad in grads_on_cpu) + + optimizer.step() + + # Checking only after an actual optimizer step that the norm has been clipped because it happens + # during the optimizer step in some cases. + if is_optimizer_update_step and max_grad_norm is not None: + grads_on_cpu = move_grads_to_cpu(model.local_parameters()) + norms = [torch.linalg.vector_norm(grad, 2) for grad in grads_on_cpu] + total_norm = torch.linalg.vector_norm(torch.stack(norms), 2) + assert total_norm <= max_grad_norm + + optimizer.zero_grad() + + grads_on_cpu = move_grads_to_cpu(model.local_parameters()) + if is_optimizer_update_step: + # At this point, no parameter should have a gradient. 
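+                        # (The wrapped optimizer is expected to zero gradients in place rather
+                        # than set them to None, hence the equality check against zero below.)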
+ assert all(torch.all(grad == 0) for grad in grads_on_cpu) + + current_parameters = move_params_to_cpu(model.local_parameters()) + else: + orig_parameters = current_parameters + outputs = model(**inputs) + loss = outputs["loss"] + xm.mark_step() + loss.backward() + + if max_grad_norm is not None: + accelerator.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm, norm_type=2) + + # Checking that at least some of the parameters have a gradient. + grads_on_cpu = move_grads_to_cpu(model.parameters()) + assert any(torch.all(grad != 0) for grad in grads_on_cpu) + + optimizer.step() + + # Checking only after an actual optimizer step that the norm has been clipped because it happens + # during the optimizer step in some cases. + if is_optimizer_update_step and max_grad_norm is not None: + grads_on_cpu = move_grads_to_cpu(model.parameters()) + norms = [torch.linalg.vector_norm(grad, 2) for grad in grads_on_cpu] + total_norm = torch.linalg.vector_norm(torch.stack(norms), 2) + assert total_norm <= max_grad_norm + + optimizer.zero_grad() + + # At this point, no parameter should have a gradient. + if is_optimizer_update_step: + grads_on_cpu = move_grads_to_cpu(model.parameters()) + assert all(torch.all(grad == 0) for grad in grads_on_cpu) + + current_parameters = move_params_to_cpu(model.parameters()) + + if is_optimizer_update_step: + assert any(torch.any(p1 != p2) for (p1, p2) in zip(orig_parameters, current_parameters)) + else: + assert all(torch.all(p1 == p2) for (p1, p2) in zip(orig_parameters, current_parameters)) + + def test_lazy_load(self, from_config, parallel_sizes): + _, tp_size, pp_size = parallel_sizes + + if from_config and (tp_size > 1 or pp_size > 1): + pytest.skip("It is not easy to compare parameters value in this case because of initialization.") + + model = get_tiny_llama_model( + tp_size=1, pp_size=1, lazy_load=False, from_config=from_config, use_static_seed_patcher=True + ) + + orig_parameters: Dict[str, torch.nn.Parameter] = dict(model.named_parameters()) + + accelerator = create_accelerator_for_mp(tp_size, pp_size) + lazy_model = get_tiny_llama_model( + tp_size=tp_size, pp_size=pp_size, lazy_load=True, from_config=from_config, use_static_seed_patcher=True + ) + static_seed_patcher = create_static_seed_patcher(model.__class__, 42) + with static_seed_patcher: + lazy_model = accelerator.prepare(lazy_model) + + if pp_size > 1: + named_parameters = dict(lazy_model.local_named_parameters()) + else: + named_parameters = dict(lazy_model.named_parameters()) + + xm.mark_step() + + for name, param in named_parameters.items(): + orig = orig_parameters[name] + if orig.shape != param.shape: + if orig.dim() == 1: + gather_dim = 0 + elif orig.dim() == 2: + gather_dim = 1 if orig.shape[0] == param.shape[0] else 0 + else: + raise ValueError(f"The case where the weight as a rank of {orig.dim()} is not supported.") + gathered = [torch.empty(param.shape) for _ in range(tp_size)] + torch.distributed.all_gather(gathered, param, group=get_tensor_model_parallel_group()) + gathered_param = torch.cat(gathered, dim=gather_dim) + else: + gathered_param = param + + orig = orig.to("cpu") + gathered_param = gathered_param.to("cpu") + xm.mark_step() + + print(f"Comparing parameter named {name}") + torch.testing.assert_close(orig, gathered_param) + + def test_save_model_and_load_model(self, parallel_sizes, tmpdir, monkeypatch): + _, tp_size, pp_size = parallel_sizes + dp_rank = get_data_parallel_rank() + tp_rank = get_tensor_model_parallel_rank() + pp_rank = get_pipeline_model_parallel_rank() + + tmpdir 
= Path(tmpdir) + + model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, lazy_load=False, add_random_noise=True) + + accelerator = create_accelerator_for_mp(tp_size, pp_size) + model = accelerator.prepare(model) + accelerator.save_state(tmpdir.as_posix()) + accelerator.state._reset_state(reset_partial_state=True) + del accelerator + + if pp_size > 1: + # We need to disable `NxDPPModel._set_distributed` since it is already done during the creation of the + # first model, otherwise creating new `NxDPPModel`s will fail. + monkeypatch.setattr(NxDPPModel, "_set_distributed", lambda _: _) + + tmpdir_content = [path.name for path in tmpdir.glob("**/*")] + pytorch_checkpoint_exists = "pytorch_model.bin" in tmpdir_content + safetensors_checkpoint_exists = "model.safetensors" in tmpdir_content + + if tp_size > 1 or pp_size > 1: + ref_data_file_name = f"tp_rank_{tp_rank:02d}_pp_rank_{pp_rank:02d}" + tensors_directory = f"{ref_data_file_name}.tensors" + assert not pytorch_checkpoint_exists + assert not safetensors_checkpoint_exists + assert TENSOR_PARALLEL_SHARDS_DIR_NAME in tmpdir_content + assert ref_data_file_name in tmpdir_content + assert tensors_directory in tmpdir_content + else: + assert pytorch_checkpoint_exists or safetensors_checkpoint_exists + + # Making sure that we end-up with a different model when starting over. + new_model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, lazy_load=False, add_random_noise=True) + new_accelerator = create_accelerator_for_mp(tp_size, pp_size) + new_model = new_accelerator.prepare(new_model) + new_accelerator.state._reset_state(reset_partial_state=True) + del new_accelerator + + if pp_size == 1: + model_parameters = move_params_to_cpu(model.parameters()) + new_model_parameters = move_params_to_cpu(new_model.parameters()) + else: + model_parameters = move_params_to_cpu(model.local_parameters()) + new_model_parameters = move_params_to_cpu(new_model.local_parameters()) + + assert any( + torch.all(p1 == 0.0) or torch.all(p1 == 1.0) or torch.all(p1 != p2) + for p1, p2 in zip(model_parameters, new_model_parameters) + ) + + # Checking that when providing a checkpoint, we end-up with the same model as the original. + new_model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, lazy_load=False, add_random_noise=True) + new_accelerator = create_accelerator_for_mp(tp_size, pp_size, checkpoint_dir=tmpdir) + new_model = new_accelerator.prepare(new_model) + + # If there is no model parallelism, the checkpoint weights will not be loaded automatically since we do not + # call parallelize, so we do it manually. 
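+        # For reference, the layouts asserted above are roughly:
+        #   - tp_size == pp_size == 1: a single `pytorch_model.bin` or `model.safetensors` file;
+        #   - model parallelism: a `TENSOR_PARALLEL_SHARDS_DIR_NAME` directory containing a
+        #     `tp_rank_XX_pp_rank_XX` file and matching `.tensors` directory per model-parallel rank.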
+ if tp_size == pp_size == 1: + if pytorch_checkpoint_exists: + filename = "pytorch_model.bin" + checkpoint_path = tmpdir / filename + new_model.load_state_dict(torch.load(checkpoint_path)) + else: + filename = "model.safetensors" + checkpoint_path = tmpdir / filename + new_model.load_state_dict(safetensors.torch.load_file(checkpoint_path)) + + if pp_size == 1: + model_parameters = move_params_to_cpu(model.parameters()) + new_model_parameters = move_params_to_cpu(new_model.parameters()) + else: + model_parameters = move_params_to_cpu(model.local_parameters()) + new_model_parameters = move_params_to_cpu(new_model.local_parameters()) + + if dp_rank == 0: + assert all(torch.all(p1 == p2) for p1, p2 in zip(model_parameters, new_model_parameters)) diff --git a/tests/distributed/test_model_parallelization.py b/tests/distributed/test_model_parallelization.py index 2d64e4b28..a7097dc4c 100644 --- a/tests/distributed/test_model_parallelization.py +++ b/tests/distributed/test_model_parallelization.py @@ -14,49 +14,65 @@ # limitations under the License. """Tests validating that models can be parallelized correctly.""" -import os -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union -from unittest import TestCase +from typing import TYPE_CHECKING, List, Optional, Type, Union import pytest import torch -from parameterized import parameterized +import torch.utils._pytree as pytree +from transformers import LlamaForCausalLM +from transformers.models.auto.configuration_auto import CONFIG_MAPPING from transformers.models.auto.modeling_auto import ( - MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_BACKBONE_MAPPING_NAMES, - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_CTC_MAPPING_NAMES, - MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, - MODEL_FOR_MASKED_LM_MAPPING_NAMES, - MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES, - MODEL_FOR_PRETRAINING_MAPPING_NAMES, - MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, + MODEL_FOR_BACKBONE_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_CTC_MAPPING, + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, + MODEL_FOR_MASKED_LM_MAPPING, + MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + MODEL_FOR_PRETRAINING_MAPPING, + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING, ) +import optimum +from optimum.neuron.accelerate.accelerator import NeuronAccelerator +from optimum.neuron.distributed.parallelizers_manager import ParallelizersManager from optimum.neuron.utils.cache_utils import ( get_num_neuron_cores, - set_neuron_cache_path, ) -from optimum.neuron.utils.import_utils import is_neuronx_available -from optimum.neuron.utils.runner import run_command_with_realtime_output +from 
optimum.neuron.utils.import_utils import ( + is_neuronx_available, + is_neuronx_distributed_available, + is_torch_xla_available, +) +from optimum.neuron.utils.testing_utils import is_trainium_test +from optimum.neuron.utils.training_utils import set_neuron_cc_optlevel_for_model + +from .distributed import DistributedTest +from .utils import SEED, create_accelerator_for_mp, get_model, get_model_inputs -from ..test_utils import is_trainium_test -from ..utils import TrainiumTestMixin +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + +if is_neuronx_distributed_available(): + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_rank, + get_tensor_model_parallel_group, + get_tensor_model_parallel_size, + ) + from neuronx_distributed.parallel_layers.utils import move_all_tensor_to_cpu + from neuronx_distributed.utils.model_utils import move_model_to_device if TYPE_CHECKING: - from transformers import PretrainedConfig + from transformers import PreTrainedModel TEMPLATE_FILE_NAME = "model_parallel_test_template.txt" @@ -71,46 +87,47 @@ ] -def _generate_supported_model_class_names( - model_name: Type["PretrainedConfig"], +def _generate_supported_model_classes( + model_type: str, supported_tasks: Optional[Union[str, List[str]]] = None, -) -> List[str]: +) -> List[Type["PreTrainedModel"]]: task_mapping = { # TODO: enable that when base models are supported. - # "default": MODEL_MAPPING_NAMES, - "pretraining": MODEL_FOR_PRETRAINING_MAPPING_NAMES, - "next-sentence-prediction": MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES, - "masked-lm": MODEL_FOR_MASKED_LM_MAPPING_NAMES, - "causal-lm": MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - "seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - "speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, + # "default": MODEL_MAPPING, + "pretraining": MODEL_FOR_PRETRAINING_MAPPING, + "next-sentence-prediction": MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + "masked-lm": MODEL_FOR_MASKED_LM_MAPPING, + "causal-lm": MODEL_FOR_CAUSAL_LM_MAPPING, + "seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + "speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, # Those architectures are more painful to deal with because the input is different. 
- # "multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, - "document-question-answering": MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, - "question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, - "sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, - "masked-image-modeling": MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, - "image-classification": MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - "zero-shot-image-classification": MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES, - "ctc": MODEL_FOR_CTC_MAPPING_NAMES, - "audio-classification": MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, - "semantic-segmentation": MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, - "backbone": MODEL_FOR_BACKBONE_MAPPING_NAMES, + # "multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + "document-question-answering": MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + "question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING, + "sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + "masked-image-modeling": MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, + "image-classification": MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + "zero-shot-image-classification": MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING, + "ctc": MODEL_FOR_CTC_MAPPING, + "audio-classification": MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, + "semantic-segmentation": MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, + "backbone": MODEL_FOR_BACKBONE_MAPPING, } if supported_tasks is None: - supported_tasks = task_mapping.keys() + supported_tasks = list(task_mapping.keys()) if isinstance(supported_tasks, str): supported_tasks = [supported_tasks] - model_class_names = [] + model_classes = [] for task in supported_tasks: - class_name = task_mapping[task].get(model_name, None) - if class_name is not None and class_name not in CLASSES_TO_IGNORE: - model_class_names.append(class_name) + config_class = CONFIG_MAPPING[model_type] + model_class = task_mapping[task].get(config_class, None) + if model_class is not None and model_class not in CLASSES_TO_IGNORE: + model_classes.append(model_class) - return list(set(model_class_names)) + return list(set(model_classes)) MODEL_TYPES_TO_TEST = [ @@ -125,10 +142,13 @@ def _generate_supported_model_class_names( ), ( "gpt_neox", - "hf-tiny-model-private/tiny-random-GPTNeoXModel", - {"num_hidden_layers": "2", "intermediate_size": "36"}, + "michaelbenayoun/gpt-neox-tiny-4layers-random", + {"num_hidden_layers": "2"}, + ), + ( + "llama", + "michaelbenayoun/llama-2-tiny-16layers-random", ), - ("llama", "yujiepan/llama-2-tiny-3layers-random", {"num_hidden_layers": "2"}), ( "t5", "hf-internal-testing/tiny-random-T5Model", @@ -141,390 +161,305 @@ def _generate_supported_model_class_names( for entry in MODEL_TYPES_TO_TEST: if len(entry) == 2: model_type, model_name_or_path = entry - config_overwrite = {} + config_overwrite = None else: model_type, model_name_or_path, config_overwrite = entry - for model_class_name in _generate_supported_model_class_names(model_type): - MODELS_TO_TEST.append((model_class_name, model_name_or_path, config_overwrite)) + for model_class in _generate_supported_model_classes(model_type): + entry = (model_type, model_class, model_name_or_path, config_overwrite) + if entry not in MODELS_TO_TEST: + MODELS_TO_TEST.append(entry) + + +MODEL_CLASSES_TO_IGNORE = [ + "BertForPreTraining", # There is a compilation issue, and testing 
TP for BertForPretraining is not really important. + # TODO + # GPTNeo's attention mechanism is broken in transformers==4.36.2, this should be re-enabled once there is a release + # containing this PR: https://github.com/huggingface/transformers/pull/28533 + "GPTNeoForSequenceClassification", + "GPTNeoForTokenClassification", + "GPTNeoForQuestionAnswering", + "GPTNeoForCausalLM", +] -# When doing from pretrained + lazy loading, it is not always easy to initiliazed the remaining weights in a similar -# fashion than in the regular model. So we do not check for them under this specific setting. It does not mean that -# parallelization does not work for them, only that some weights cannot be initialized exactly the same way. -MODEL_CLASSES_TO_IGNORE_ON_LAZY_LOAD_FOR_FROM_PRETRAINED = [ - "T5ForQuestionAnswering", -] +LLAMA_GQA_VARIANTS_TO_TEST = { + "MHA-setup": ( + 8, + 2, + 1, + { + "num_hidden_layers": "2", + "num_attention_heads": "8", + "num_key_value_heads": "8", + }, + ), + "num_key_value_heads > tp_size": ( + 8, + 2, + 1, + { + "num_hidden_layers": "2", + "num_attention_heads": "8", + "num_key_value_heads": "4", + }, + ), + "num_key_value_heads = tp_size": ( + 8, + 8, + 1, + { + "num_hidden_layers": "2", + "hidden_size": "32", + "num_attention_heads": "16", + "num_key_value_heads": "8", + }, + ), + "num_key_value_heads < tp_size": ( + 8, + 8, + 1, + { + "num_hidden_layers": "2", + "hidden_size": "32", + "num_attention_heads": "16", + "num_key_value_heads": "2", + }, + ), + "MQA-setup": ( + 8, + 8, + 1, + { + "num_hidden_layers": "2", + "hidden_size": "32", + "num_attention_heads": "16", + "num_key_value_heads": "1", + }, + ), +} +# LLAMA_V2_MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-32kv-heads-random" +LLAMA_V2_MODEL_NAME = "anushehchaudry/llama-2-tiny-random" +# LLAMA_V2_MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-random" +# LLAMA_V2_MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-32kv-heads-random" @is_trainium_test -class ModelParallelizationTestCase(TrainiumTestMixin, TestCase): +class TestModelParallelization(DistributedTest): OUTPUTS_TO_IGNORE = { # It might not match in the sequence parallel setting because of mistmatched shapes. # Since these outputs are not needed during training, we do not want to perform an expensive gather for them. 
"encoder_last_hidden_state", } - def _check_output(self, name: str, original_output, output, lazy_load: bool): + @pytest.fixture(scope="class", params=[[2, 2, 1], [2, 1, 2], [16, 2, 2]], ids=["tp=2", "pp=2", "dp=4,tp=pp=2"]) + def parallel_sizes(self, request): + return request.param + + @pytest.fixture(scope="class", params=MODELS_TO_TEST, ids=[specs[1].__name__ for specs in MODELS_TO_TEST]) + def model_specs(self, request): + return request.param + + def early_skip(self, fixtures_kwargs): + pp_size = fixtures_kwargs.get("pp_size", None) + parallel_sizes = fixtures_kwargs.get("parallel_sizes", None) + if pp_size is None and parallel_sizes is not None: + pp_size = parallel_sizes[-1] + model_specs = fixtures_kwargs.get("model_specs", None) + + if pp_size > 1 and model_specs is not None: + model_type = model_specs[0] + manager = ParallelizersManager.parallelizer_for_model(model_type) + if not manager.supports_pipeline_parallelism(): + pytest.skip(f"Pipeline parallelism is not supported for {model_class.__name__}.") + + return super().early_skip(fixtures_kwargs) + + def _check_output(self, name: str, original_output, output): assert type(original_output) is type(output) if isinstance(original_output, (tuple, list, set)): for idx, orig_output in enumerate(original_output): new_name = f"{name}.{idx}" - self._check_output(new_name, orig_output, output[idx], lazy_load) + self._check_output(new_name, orig_output, output[idx]) elif isinstance(original_output, dict): for output_name in original_output: new_name = f"{name}.{output_name}" - self._check_output(new_name, original_output[name], output[name], lazy_load) + self._check_output(new_name, original_output[name], output[name]) elif isinstance(original_output, torch.Tensor): - print(f"Original {name}:\nShape: {original_output.shape}\nValue: {original_output}") - print(f"Parallel {name}:\nShape: {output.shape}\nValue: {output}") + xm.master_print(f"Comparing output named {name}") + tp_size = get_tensor_model_parallel_size() + if original_output.shape != output.shape: + gather_dim = min( + idx for idx in range(original_output.dim()) if original_output.shape[idx] != output.shape[idx] + ) + output = output.to(xm.xla_device()) + gathered = [torch.empty_like(output) for _ in range(tp_size)] + torch.distributed.all_gather(gathered, output, group=get_tensor_model_parallel_group()) + gathered_output = torch.cat(gathered, dim=gather_dim) + xm.mark_step() + output = gathered_output.to("cpu") torch.testing.assert_close(original_output, output) else: assert original_output == output, f"Output named {name} do not match." - def _test_model_parallel( + def _parallel_model_matches_original_model( self, - tp_size: int, - model_class_name: str, - model_name_or_path: str, - from_config: bool, - with_lazy_load: bool, - parallelize_embeddings: bool, - sequence_parallel_enabled: bool, - num_neuron_cores: int = NUM_NEURON_CORES_AVAILABLE, - run_test_in_parallel: bool = False, - overwrite_model_config: Optional[Dict[str, str]] = None, + model_class, + model_name_or_path, + config_overwrite, + parallel_sizes, + from_pretrained, + lazy_load, + sequence_parallel_enabled, + parallelize_embeddings, ): - if "GPTNeoX" in model_class_name: - self.skipTest("GPTNeoX test is flaky, needs to be fixed.") - - if num_neuron_cores < tp_size: - raise ValueError( - "The number of Neuron cores available is lower than the TP size, failing since the test might not be " - "testing what is expected." 
- ) - - if run_test_in_parallel and (NUM_NEURON_CORES_AVAILABLE // num_neuron_cores) < 2: - raise ValueError( - "The test cannot be run in parallel because there is not enough Neuron cores available to preserve the " - f"number of Neuron cores requested ({NUM_NEURON_CORES_AVAILABLE} cores available and {num_neuron_cores} " - "were requested)" - ) - - template_content = None - current_directory = Path(__file__).parent.resolve() - template_file_path = current_directory / TEMPLATE_FILE_NAME - with open(template_file_path, "r") as fp: - template_content = fp.read() - - specialization_env = { - "from_config": "true" if from_config else "false", - "lazy_load": "true" if with_lazy_load else "false", - "parallelize_embeddings": "true" if parallelize_embeddings else "false", - "sequence_parallel_enabled": "true" if sequence_parallel_enabled else "false", - "computing_loss_is_supported": "true", - **os.environ, - } - - # Updating the Python path to be able to use `tests/distributed/utils.py`. - python_path = specialization_env.get("PYTHONPATH", "") - python_path = f"{current_directory}:{python_path}" - specialization_env["PYTHONPATH"] = python_path - - if overwrite_model_config is not None: - specialization_env["config_overwrite"] = ",".join( - f"{key}={value}" for key, value in overwrite_model_config.items() - ) - - with TemporaryDirectory() as tmpdirname: - specialization_data = { - "model_class": model_class_name, - "model_name_or_path": model_name_or_path, - "parallelize_embeddings": "True" if parallelize_embeddings else "False", - "tp_size": tp_size, - "output_path": tmpdirname, - } - specialized_content = template_content.format(**specialization_data) - with open(f"{tmpdirname}/code.py", "w") as fp: - fp.write(specialized_content) - - cmd = ["torchrun", f"--nproc_per_node={num_neuron_cores}", f"{tmpdirname}/code.py"] - - # When running the test in parallel, we need 2 rendez-vous endpoints: one for the script running the - # original model and one for the script running the parallel model. - rdzv_endpoint_host = "localhost" - rdzv_endpoint_port = 29400 - - orig_neuron_cc_flags = os.environ.get("NEURON_CC_FLAGS", "") - set_neuron_cache_path(tmpdirname) - neuron_cc_flags = os.environ["NEURON_CC_FLAGS"] - os.environ["NEURON_CC_FLAGS"] = orig_neuron_cc_flags - - # Original model. - env = {"is_parallel": "false", **specialization_env, "NEURON_CC_FLAGS": neuron_cc_flags} - if run_test_in_parallel: - # Setting the rendez-vous endpoint for the original model process. - cmd.insert(1, f"--rdzv_endpoint={rdzv_endpoint_host}:{rdzv_endpoint_port}") - env["NEURON_RT_VISIBLE_CORES"] = f"0-{num_neuron_cores - 1}" - - # When running tests in parallel, synchronization is done after both processes started. - if not run_test_in_parallel: - p_original_returncode, stdout = run_command_with_realtime_output(cmd, env=env) - else: - p_original = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) + if model_class.__name__ in MODEL_CLASSES_TO_IGNORE: + pytest.skip(f"Skipping test for {model_class.__name__} since it is buggy or a special case.") + + world_size, tp_size, pp_size = parallel_sizes + dp_size = world_size // (tp_size * pp_size) + pp_rank = get_pipeline_model_parallel_rank() + + orig_model = get_model( + model_class, + model_name_or_path, + from_config=not from_pretrained, + config_overwrite=config_overwrite, + use_static_seed_patcher=True, + ) + orig_model = NeuronAccelerator.patch_model_for_neuron(orig_model) - # Parallel model. 
- env = {"is_parallel": "true", **specialization_env, "NEURON_CC_FLAGS": neuron_cc_flags} - if run_test_in_parallel: - # Updating the rendez-vous endpoint for the parallel model process. - cmd[1] = f"--rdzv_endpoint={rdzv_endpoint_host}:{rdzv_endpoint_port + 1}" - env["NEURON_RT_VISIBLE_CORES"] = f"{num_neuron_cores}-{2 * num_neuron_cores - 1}" + set_neuron_cc_optlevel_for_model(orig_model) - p_parallel = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) + move_model_to_device(orig_model, xm.xla_device()) + orig_model = orig_model.eval() - stdout, _ = p_original.communicate() - p_original_returncode = p_original.returncode - stdout = stdout.decode("utf-8") - full_output = f"Original model standard output:\n{stdout}" - print(full_output) + manager = ParallelizersManager.parallelizer_for_model(orig_model) - stdout, _ = p_parallel.communicate() - p_parallel_returncode = p_parallel.returncode - stdout = stdout.decode("utf-8") - full_output = f"Parallel model standard output:\n{stdout}" - print(full_output) + if pp_size > 1 and not manager.supports_pipeline_parallelism(): + pytest.skip(f"Pipeline parallelism is not supported for {model_class.__name__}.") - else: - p_parallel_returncode, stdout = run_command_with_realtime_output(cmd, env=env) - - assert p_original_returncode == 0 - assert p_parallel_returncode == 0 - - temporary_dir = Path(tmpdirname) - original_model_outputs = torch.load(temporary_dir / "original.bin") - parallel_model_outputs = torch.load(temporary_dir / "parallel.bin") - - if ( - not from_config - and with_lazy_load - and model_class_name in MODEL_CLASSES_TO_IGNORE_ON_LAZY_LOAD_FOR_FROM_PRETRAINED - ): - self.skipTest( - f"Cannot compare outputs for {model_class_name} when doing from_pretrained + lazy loading." - ) + if sequence_parallel_enabled and not manager.supports_sequence_parallelism(): + pytest.skip(f"Sequence parallelism is not supported for {model_class.__name__}.") - for name, t in original_model_outputs.items(): - if name in self.OUTPUTS_TO_IGNORE: - continue - print(f"Testing that {name} match.") - regular_parallel_outputs_error_msg = None - gathered_parallel_outputs_error_msg = None - try: - self._check_output(name, t, parallel_model_outputs[name], with_lazy_load) - except AssertionError as e: - regular_parallel_outputs_error_msg = str(e) - if regular_parallel_outputs_error_msg is not None: - print("Regular output did not match, testing with the gathered output...") - try: - self._check_output(name, t, parallel_model_outputs[f"gathered_{name}"], with_lazy_load) - except AssertionError as e: - gathered_parallel_outputs_error_msg = str(e) - if regular_parallel_outputs_error_msg is not None and gathered_parallel_outputs_error_msg is not None: - msg = ( - "Output did not matched.\nTest with non-gathered parallel outputs error:\n" - f"{regular_parallel_outputs_error_msg}\nTest with gathered parallel outputs error:\n" - f"{gathered_parallel_outputs_error_msg}" - ) - raise AssertionError(msg) - print("Ok!") - - @parameterized.expand(MODELS_TO_TEST) - def test_model_parallel_from_config_no_lazy_load( - self, model_class_name: str, model_name_or_path: str, config_overwrite: Dict[str, str] - ): - # In this test, we: - # 1. Test parallelism when initializing from a config. - # 2. Do not enable embedding parallelization => while behaviour could differ between a model initialized - # lazily or not, the risk is minimal. This feature is tested on the next test with lazy loading. - # 3. 
Do not enable sequence parallelism => this feature should not depend on whether the model is initialized - # lazily or not. - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name=model_class_name, - model_name_or_path=model_name_or_path, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config=config_overwrite, + pad_to_multiple_of = None if not sequence_parallel_enabled else tp_size + inputs = get_model_inputs( + orig_model, model_name_or_path, batch_size=dp_size, pad_to_multiple_of=pad_to_multiple_of ) - @parameterized.expand(MODELS_TO_TEST) - def test_model_parallel_from_config_lazy_load( - self, model_class_name: str, model_name_or_path: str, config_overwrite: Dict[str, str] - ): - # In this test, we: - # 1. Test parallelism when initializing lazily from a config. - # 2. Enable embedding parallelization. - # 3. Enable sequence parallelism. - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name=model_class_name, - model_name_or_path=model_name_or_path, - from_config=True, - with_lazy_load=True, - parallelize_embeddings=True, - sequence_parallel_enabled=True, - overwrite_model_config=config_overwrite, + xla_inputs = {k: v.to(xm.xla_device()) for k, v in inputs.items()} + xm.mark_step() + + with torch.no_grad(): + orig_model_outputs = orig_model(**xla_inputs) + + xm.mark_step() + + # The parallel model needs to be defined after the forward pass of the first model because there is a + # global monkey patching of the `torch.nn.CrossEntropyLoss` class when doing sequence parallelism. + model = get_model( + model_class, + model_name_or_path, + tp_size=tp_size, + pp_size=pp_size, + lazy_load=lazy_load, + from_config=not from_pretrained, + config_overwrite=config_overwrite, + use_static_seed_patcher=True, + ) + + accelerator = create_accelerator_for_mp( + tp_size, + pp_size, + parallelize_embeddings=parallelize_embeddings, + sequence_parallel_enabled=sequence_parallel_enabled, ) - @parameterized.expand(MODELS_TO_TEST) - def test_model_parallel_from_pretrained_no_lazy_load( - self, model_class_name: str, model_name_or_path: str, config_overwrite: Dict[str, str] + from .utils import create_static_seed_patcher + + static_seed_patcher = create_static_seed_patcher(model.__class__, SEED) + with static_seed_patcher: + model = accelerator.prepare(model) + + xm.mark_step() + + model = accelerator.patch_model_for_neuron(model) + with torch.no_grad(): + if pp_size == 1: + model = model.eval() + model_outputs = model(**xla_inputs) + else: + loss = model.run_eval(**inputs) + model_outputs = {"loss": loss} + + xm.mark_step() + + outputs_to_consider = [ + output_name for output_name in orig_model_outputs if output_name not in self.OUTPUTS_TO_IGNORE + ] + + if pp_size > 1: + outputs_to_consider = ["loss"] + + outputs_to_check = [ + (orig_model_outputs[output_name], model_outputs[output_name]) for output_name in outputs_to_consider + ] + outputs_to_check = pytree.tree_map(move_all_tensor_to_cpu, outputs_to_check) + + for output_name, outputs in zip(outputs_to_consider, outputs_to_check): + if all(output is None for output in outputs): + continue + if pp_size == 1 or pp_rank == pp_size - 1: + self._check_output(output_name, outputs[0], outputs[1]) + + def test_parallel_model_matches_original_model_from_pretrained_with_parallel_embeddings_and_sequence_parallel( + self, + model_specs, + parallel_sizes, + monkeypatch, ): - # 
In this test, we: - # 1. Test parallelism when initializing from pretrained weights. - # 2. Do not enable embedding parallelization => while behaviour could differ between a model initialized - # lazily or not, the risk is minimal. This feature is tested on the next test with lazy loading. - # 3. Do not enable sequence parallelism => this feature should not depend on whether the model is initialized - # lazily or not. - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name=model_class_name, - model_name_or_path=model_name_or_path, - from_config=False, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config=config_overwrite, + _, model_class, model_name_or_path, config_overwrite = model_specs + monkeypatch.setattr( + optimum.neuron.distributed.parallel_layers, "_PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT", True + ) + return self._parallel_model_matches_original_model( + model_class, model_name_or_path, config_overwrite, parallel_sizes, True, True, True, True ) - @parameterized.expand(MODELS_TO_TEST) - def test_model_parallel_from_pretrained_lazy_load( - self, model_class_name: str, model_name_or_path: str, config_overwrite: Dict[str, str] + @pytest.mark.skip("Model parallelism from config is not fully supported yet.") + def test_parallel_model_matches_original_model_from_config( + self, + model_specs, + parallel_sizes, + monkeypatch, ): - # In this test, we: - # 1. Test parallelism when initializing lazily from pretrained weights. - # 2. Enable embedding parallelization. - # 3. Enable sequence parallelism. - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name=model_class_name, - model_name_or_path=model_name_or_path, - from_config=False, - with_lazy_load=True, - parallelize_embeddings=True, - sequence_parallel_enabled=True, - overwrite_model_config=config_overwrite, + _, model_class, model_name_or_path, config_overwrite = model_specs + monkeypatch.setattr( + optimum.neuron.distributed.parallel_layers, "_PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT", True + ) + return self._parallel_model_matches_original_model( + model_class, model_name_or_path, config_overwrite, parallel_sizes, False, True, False, False ) @pytest.mark.skipif( NUM_NEURON_CORES_AVAILABLE < 32, reason=f"This test requires 32 Neuron cores, but only {NUM_NEURON_CORES_AVAILABLE} are available", ) - def test_llama_v2_gqa_variants(self): - llama_v2_model_name = "anushehchaudry/llama-2-tiny-random" - # MHA setup - # TP size = 2, num_attention_heads = 8, num_key_value_heads = 8 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "num_attention_heads": "8", - "num_key_value_heads": "8", - }, - ) - - # GQA setup with num_key_value_heads > tp_size. 
- # TP size = 2, num_attention_heads = 8, num_key_value_heads = 4 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "num_attention_heads": "8", - "num_key_value_heads": "4", - }, - ) - - # GQA setup with num_key_value_heads = tp_size. - # TP size = 8, num_attention_heads = 16, num_key_value_heads = 8 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=8, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "hidden_size": "32", - "num_attention_heads": "16", - "num_key_value_heads": "8", - }, - ) - - # GQA setup with num_key_value_heads < tp_size. - # TP size = 8, num_attention_heads = 16, num_key_value_heads = 2 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=8, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "hidden_size": "32", - "num_attention_heads": "16", - "num_key_value_heads": "2", - }, + @pytest.mark.parametrize( + "world_size,tp_size,pp_size,config_overwrite", + LLAMA_GQA_VARIANTS_TO_TEST.values(), + ids=LLAMA_GQA_VARIANTS_TO_TEST.keys(), + ) + def test_llama_v2_gqa_variants(self, world_size, tp_size, pp_size, config_overwrite, monkeypatch): + monkeypatch.setattr( + optimum.neuron.distributed.parallel_layers, "_PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT", True ) - - # MQA setup - # TP size = 8, num_attention_heads = 16, num_key_value_heads = 1 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=8, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "hidden_size": "32", - "num_attention_heads": "16", - "num_key_value_heads": "1", - }, + return self._parallel_model_matches_original_model( + LlamaForCausalLM, + LLAMA_V2_MODEL_NAME, + config_overwrite, + (world_size, tp_size, pp_size), + False, + False, + False, + False, ) diff --git a/tests/distributed/test_training.py b/tests/distributed/test_training.py index f0bfc7351..9067495c3 100644 --- a/tests/distributed/test_training.py +++ b/tests/distributed/test_training.py @@ -14,118 +14,154 @@ # limitations under the License. 
"""Tests related to training with `neuronx_distributed`.""" -import os +import json from pathlib import Path -from tempfile import TemporaryDirectory -from unittest import TestCase -from huggingface_hub import HfFolder +import pytest +from datasets import load_dataset +from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer -from optimum.neuron.utils.cache_utils import ( - delete_custom_cache_repo_name_from_hf_home, - load_custom_cache_repo_name_from_hf_home, - set_custom_cache_repo_name_in_hf_home, -) -from optimum.neuron.utils.runner import ExampleRunner +from optimum.neuron.training_args import NeuronTrainingArguments from optimum.neuron.utils.testing_utils import is_trainium_test +from .distributed import DistributedTest -_TINY_BERT_MODEL_NAME = "hf-internal-testing/tiny-random-bert" + +MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-random" @is_trainium_test -class DistributedTrainingTestCase(TestCase): +class TestDistributedTraining(DistributedTest): CACHE_REPO_NAME = "optimum-internal-testing/optimum-neuron-cache-for-testing" - @classmethod - def setUpClass(cls): - orig_token = HfFolder.get_token() - orig_cache_repo = load_custom_cache_repo_name_from_hf_home() - ci_token = os.environ.get("HF_TOKEN_OPTIMUM_NEURON_CI", None) - if ci_token is not None: - HfFolder.save_token(ci_token) - set_custom_cache_repo_name_in_hf_home(cls.CACHE_REPO_NAME) - cls._token = orig_token - cls._cache_repo = orig_cache_repo - cls._env = dict(os.environ) - - @classmethod - def tearDownClass(cls): - os.environ = cls._env - if cls._token is not None: - HfFolder.save_token(cls._token) - if cls._cache_repo is not None: - set_custom_cache_repo_name_in_hf_home(cls._cache_repo) - else: - delete_custom_cache_repo_name_from_hf_home() - - def test_tp_save_and_resume_from_checkpoint(self): - num_cores = 8 - precision = "bf16" - tensor_parallel_size = 2 + @pytest.fixture( + scope="class", + params=[[2, 1, 1], [2, 2, 1], [2, 1, 2]], + ids=["dp=2", "tp=2", "pp=2"], + ) + def parallel_sizes(self, request): + return request.param + + def test_save_and_resume_from_checkpoint(self, parallel_sizes, tmpdir): + from optimum.neuron.trainers import NeuronTrainer + + tmpdir = Path(tmpdir) + _, tp_size, pp_size = parallel_sizes train_batch_size = 2 eval_batch_size = 2 - sequence_length = 16 max_steps = 10 - save_steps = 2 do_eval = True + max_train_samples = 100 max_eval_samples = 16 - with TemporaryDirectory() as tmpdirname: - output_dir = Path(tmpdirname) + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + tokenizer.pad_token = tokenizer.eos_token - runner = ExampleRunner(_TINY_BERT_MODEL_NAME, "text-classification") - - first_output_dir = output_dir / "first_run" - returncode, _ = runner.run( - num_cores, - precision, - train_batch_size, - eval_batch_size=eval_batch_size, - sequence_length=sequence_length, - tensor_parallel_size=tensor_parallel_size, + def create_training_args(output_dir, resume_from_checkpoint=None, max_steps=max_steps): + if isinstance(output_dir, Path): + output_dir = output_dir.as_posix() + if isinstance(resume_from_checkpoint, Path): + resume_from_checkpoint = resume_from_checkpoint.as_posix() + args = NeuronTrainingArguments( + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + bf16=True, + per_device_train_batch_size=train_batch_size, + per_device_eval_batch_size=eval_batch_size, max_steps=max_steps, - save_steps=save_steps, + logging_steps=1, + save_steps=2, do_eval=do_eval, - max_eval_samples=max_eval_samples, - output_dir=first_output_dir, - 
print_outputs=True, + output_dir=output_dir, + resume_from_checkpoint=resume_from_checkpoint, + skip_cache_push=True, ) - assert returncode == 0, "First run failed." - - # Case 1: Resuming from checkpoint by specifying a checkpoint directory. - second_output_dir = output_dir / "second_run" - returncode, _ = runner.run( - num_cores, - precision, - train_batch_size, - eval_batch_size=eval_batch_size, - sequence_length=sequence_length, - tensor_parallel_size=tensor_parallel_size, - max_steps=max_steps, - save_steps=save_steps, - do_eval=do_eval, - max_eval_samples=max_eval_samples, - output_dir=second_output_dir, - resume_from_checkpoint=first_output_dir / "checkpoint-4", - print_outputs=True, + return args + + def create_model(): + config = AutoConfig.from_pretrained(MODEL_NAME) + config.num_hidden_layers = 2 * max(1, pp_size) + config.num_attention_heads = 2 + config.num_key_value_heads = 2 + config.problem_type = "single_label_classification" + # config.use_cache = False + model = AutoModelForSequenceClassification.from_pretrained( + MODEL_NAME, config=config, ignore_mismatched_sizes=True ) - assert returncode == 0, "Second run failed." - - # Case 2: Resuming from checkpoint by specifying a boolean, in this case it should look inside the output - # directory. - returncode, _ = runner.run( - num_cores, - precision, - train_batch_size, - eval_batch_size=eval_batch_size, - sequence_length=sequence_length, - tensor_parallel_size=tensor_parallel_size, - max_steps=max_steps + 10, # So that it makes more steps since we are restauring from the third run. - save_steps=save_steps, - do_eval=do_eval, - max_eval_samples=max_eval_samples, - output_dir=second_output_dir, - print_outputs=True, + return model + + # First run setting. + first_output_dir = tmpdir / "first_run" + args = create_training_args(first_output_dir) + model = create_model() + + # Dataset preprocessing + raw_datasets = load_dataset("glue", "sst2") + sentence1_key = "sentence" + sentence2_key = None + label_to_id = None + max_seq_length = 32 + padding = "max_length" + + def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) + if sentence2_key is None + else (examples[sentence1_key], examples[sentence2_key]) ) - assert returncode == 0, "Third run failed." + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + + # Map labels to IDs (not necessary for GLUE tasks) + if label_to_id is not None and "label" in examples: + result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] + return result + + with args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map(preprocess_function, batched=True) + train_dataset = raw_datasets["train"] + train_dataset = train_dataset.select(range(max_train_samples)) + eval_dataset = raw_datasets["validation"] + eval_dataset = eval_dataset.select(range(max_eval_samples)) + + trainer = NeuronTrainer( + model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer + ) + + train_result = trainer.train() + trainer.evaluate() + trainer.save_metrics("train", train_result.metrics) + + with open(first_output_dir / "train_results.json") as fp: + first_training_report = json.load(fp) + + # Case 1: Resuming from checkpoint by specifying a checkpoint directory. 
+        second_output_dir = tmpdir / "second_run"
+        resume_from_checkpoint = first_output_dir / "checkpoint-4"
+        args = create_training_args(second_output_dir, resume_from_checkpoint=resume_from_checkpoint)
+        model = create_model()
+        trainer = NeuronTrainer(
+            model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer
+        )
+
+        train_result = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+        trainer.evaluate()
+        trainer.save_metrics("train", train_result.metrics)
+
+        with open(first_output_dir / "train_results.json") as fp:
+            second_training_report = json.load(fp)
+
+        assert first_training_report["train_loss"] == second_training_report["train_loss"]
+
+        # Case 2: Resuming from checkpoint by specifying an output_dir with checkpoints.
+        # max_steps + 10 so that this run performs more training steps than the previous one.
+        second_output_dir = first_output_dir
+        args = create_training_args(second_output_dir, max_steps=max_steps + 10)
+        model = create_model()
+
+        trainer = NeuronTrainer(
+            model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer
+        )
+
+        trainer.train(resume_from_checkpoint=True)
+        trainer.evaluate()
diff --git a/tests/distributed/utils.py b/tests/distributed/utils.py
index b021ae4aa..8cd35f214 100644
--- a/tests/distributed/utils.py
+++ b/tests/distributed/utils.py
@@ -14,12 +14,14 @@
 # limitations under the License.
 """Utilities for tests distributed."""
 
+import contextlib
 import functools
 import inspect
-from contextlib import contextmanager
+from pathlib import Path
 from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Type, Union
 
 import torch
+from transformers import AutoConfig, AutoTokenizer
 from transformers.models.auto import get_values
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
@@ -39,6 +41,8 @@
     MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
 )
 
+from optimum.neuron import ModelParallelismPlugin, NeuronAccelerator
+from optimum.neuron.distributed import lazy_load_for_parallelism
 from optimum.neuron.utils.patching import DynamicPatch, Patcher
 from optimum.neuron.utils.require_utils import requires_neuronx_distributed, requires_torch_xla
 
@@ -47,6 +51,10 @@
     from transformers import PreTrainedModel
 
 
+SEED = 42
+
+
+@requires_neuronx_distributed
 def generate_dummy_labels(
     model: "PreTrainedModel",
     shape: List[int],
@@ -55,8 +63,13 @@
     device: Optional[Union[str, torch.device]] = None,
 ) -> Dict[str, torch.Tensor]:
     """Generates dummy labels."""
+    from neuronx_distributed.pipeline import NxDPPModel
+
+    if isinstance(model, NxDPPModel):
+        model_class_name = model.original_torch_module.__class__.__name__
+    else:
+        model_class_name = model.__class__.__name__
-    model_class_name = model.__class__.__name__
 
     labels = {}
     batch_size = shape[0]
 
@@ -99,10 +112,9 @@
                 f', or "multi_label_classification", but "{model.config.problem_type}" was provided.'
) labels["labels"] = torch.zeros(*labels_shape, dtype=labels_dtype, device=device) - elif model_class_name in [ - *get_values(MODEL_FOR_PRETRAINING_MAPPING_NAMES), *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES), + *get_values(MODEL_FOR_PRETRAINING_MAPPING_NAMES), *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES), *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES), @@ -113,12 +125,16 @@ def generate_dummy_labels( ]: if vocab_size is None: raise ValueError( - "The vocabulary size needs to be specified to generte dummy labels for language-modeling tasks." + "The vocabulary size needs to be specified to generate dummy labels for language-modeling tasks." ) if seed is not None: orig_seed = torch.seed() torch.manual_seed(seed) - random_labels = torch.randint(0, vocab_size, shape, dtype=torch.long) + if model_class_name in get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES): + max_value = model.config.num_labels + else: + max_value = vocab_size + random_labels = torch.randint(0, max_value, shape, dtype=torch.long) if device is not None: random_labels = random_labels.to(device) labels["labels"] = random_labels @@ -211,7 +227,7 @@ def wrapper(*args, **kwargs): return wrapper -@contextmanager +@contextlib.contextmanager def create_static_seed_patcher(model_class: Type["PreTrainedModel"], seed: int): """ Context manager that resets the seed to a given value for every initialization function. @@ -220,14 +236,14 @@ def create_static_seed_patcher(model_class: Type["PreTrainedModel"], seed: int): """ specialized_static_initializer_seed = functools.partial(static_initializer_seed, seed=seed) - class_module_name = inspect.getmodule(model_class).__name__ - fully_qualified_method_name = f"{class_module_name}.{model_class.__name__}._init_weights" + inspect.getmodule(model_class).__name__ dynamic_patch = DynamicPatch(specialized_static_initializer_seed) patcher = Patcher( [ - (fully_qualified_method_name, dynamic_patch), + # (fully_qualified_method_name, dynamic_patch), ("torch.nn.Embedding.reset_parameters", dynamic_patch), ("torch.nn.Linear.reset_parameters", dynamic_patch), + ("torch.Tensor.normal_", dynamic_patch), ("neuronx_distributed.parallel_layers.layers.ColumnParallelLinear.init_weight_cpu", dynamic_patch), ("neuronx_distributed.parallel_layers.layers.RowParallelLinear.init_weight_cpu", dynamic_patch), ] @@ -237,3 +253,116 @@ def create_static_seed_patcher(model_class: Type["PreTrainedModel"], seed: int): yield finally: pass + + +def get_model( + model_class: Type["PreTrainedModel"], + model_name_or_path: str, + tp_size: int = 1, + pp_size: int = 1, + lazy_load: bool = False, + from_config: bool = False, + use_static_seed_patcher: bool = False, + add_random_noise: bool = False, + config_overwrite: Optional[Dict[str, str]] = None, +) -> "PreTrainedModel": + if lazy_load: + ctx = lazy_load_for_parallelism(tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size) + else: + ctx = contextlib.nullcontext() + if use_static_seed_patcher: + seed_patcher = create_static_seed_patcher(model_class, SEED) + else: + seed_patcher = contextlib.nullcontext() + with ctx: + with seed_patcher: + config = AutoConfig.from_pretrained(model_name_or_path) + if config_overwrite is not None: + for key, value in config_overwrite.items(): + attr_type = type(getattr(config, key)) + setattr(config, key, attr_type(value)) + if from_config: + model = model_class(config) + else: + model = model_class.from_pretrained(model_name_or_path, config=config, 
ignore_mismatched_sizes=True) + + if getattr(model.config, "problem_type", None) is None: + model.config.problem_type = "single_label_classification" + + if add_random_noise: + for param in model.parameters(): + param.data.add_(torch.randn_like(param)) + + return model + + +def get_model_inputs( + model: "PreTrainedModel", + model_name_or_path: str, + include_labels: bool = True, + random_labels: bool = True, + batch_size: int = 1, + pad_to_multiple_of: Optional[int] = None, +): + input_str = "Hello there, I'm Michael and I live in Paris!" + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + + inputs = tokenizer(input_str, return_tensors="pt") + + if model.config.is_encoder_decoder: + sig = inspect.signature(model.forward) + for input_name in inputs: + decoder_input_name = f"decoder_{input_name}" + if decoder_input_name in sig.parameters: + inputs[decoder_input_name] = inputs[input_name].clone() + + if include_labels: + if random_labels: + labels = generate_dummy_labels(model, inputs["input_ids"].shape, vocab_size=model.config.vocab_size) + inputs.update(**labels) + else: + labels = tokenizer(input_str, return_tensors="pt")["input_ids"] + inputs["labels"] = labels + + if batch_size > 1: + for name, tensor in inputs.items(): + repeat = [batch_size] + [1] * (tensor.dim() - 1) + tensor = tensor.repeat(*repeat) + inputs[name] = tensor + + if pad_to_multiple_of is not None: + pad_token_id = getattr(model.config, "pad_token_id", 1) + for name, tensor in inputs.items(): + if tensor.dim() == 2 and tensor.shape[1] % pad_to_multiple_of != 0: + if "attention_mask" not in name: + pad_value = pad_token_id + else: + pad_value = 1 + tensor = torch.nn.functional.pad( + tensor, + pad=(0, pad_to_multiple_of - tensor.shape[1] % pad_to_multiple_of), + value=pad_value, + ) + inputs[name] = tensor + return inputs + + +def create_accelerator_for_mp( + tp_size: int, + pp_size: int, + zero_1: bool = False, + gradient_accumulation_steps: int = 1, + parallelize_embeddings: bool = True, + sequence_parallel_enabled: bool = True, + checkpoint_dir: Optional[Union[Path, str]] = None, +) -> NeuronAccelerator: + mp_plugin = ModelParallelismPlugin( + tensor_parallel_size=tp_size, + parallelize_embeddings=parallelize_embeddings, + sequence_parallel_enabled=sequence_parallel_enabled, + pipeline_parallel_size=pp_size, + checkpoint_dir=checkpoint_dir, + ) + return NeuronAccelerator( + mp_plugin=mp_plugin, zero_1=zero_1, gradient_accumulation_steps=gradient_accumulation_steps + ) diff --git a/tests/test_cache_utils.py b/tests/test_cache_utils.py index f92dba1d1..ffd2c2e7d 100644 --- a/tests/test_cache_utils.py +++ b/tests/test_cache_utils.py @@ -24,8 +24,9 @@ from typing import List from unittest import TestCase +import huggingface_hub import torch -from huggingface_hub import HfApi, HfFolder, create_repo, delete_repo, hf_hub_download +from huggingface_hub import HfApi, create_repo, delete_repo, get_token, hf_hub_download, login from transformers import BertConfig, BertModel, set_seed from transformers.testing_utils import TOKEN as TRANSFORMERS_TOKEN from transformers.testing_utils import USER as TRANSFORMERS_USER @@ -246,8 +247,8 @@ def test_list_in_registry_dict(self): @is_staging_test class StagingNeuronUtilsTestCase(StagingTestMixin, TestCase): def test_set_custom_cache_repo_name_in_hf_home(self): - orig_token = HfFolder.get_token() - HfFolder.save_token(TOKEN) + orig_token = get_token() + login(TOKEN) repo_name = f"blablabla-{self.seed}" repo_id = f"{USER}/{repo_name}" @@ -262,7 +263,7 @@ def remove_repo(): 
except ValueError as e: remove_repo() if orig_token: - HfFolder.save_token(orig_token) + login(orig_token) self.fail(str(e)) with open(f"{tmpdirname}/{CACHE_REPO_FILENAME}", "r") as fp: @@ -276,20 +277,25 @@ def remove_repo(): remove_repo() if orig_token: - HfFolder.save_token(orig_token) + login(orig_token) def test_has_write_access_to_repo(self): + orig_token = get_token() + wrong_token = "random_string" - HfFolder.save_token(wrong_token) + path = Path(huggingface_hub.constants.HF_TOKEN_PATH) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(wrong_token) self.assertFalse(has_write_access_to_repo(self.CUSTOM_CACHE_REPO)) self.assertFalse(has_write_access_to_repo(self.CUSTOM_PRIVATE_CACHE_REPO)) - HfFolder.save_token(self._staging_token) + login(orig_token) self.assertTrue(has_write_access_to_repo(self.CUSTOM_CACHE_REPO)) self.assertTrue(has_write_access_to_repo(self.CUSTOM_PRIVATE_CACHE_REPO)) + @is_trainium_test def test_list_in_registry(self): def _test_list_in_registry(use_private_cache_repo: bool): if use_private_cache_repo: @@ -341,6 +347,7 @@ def _test_list_in_registry(use_private_cache_repo: bool): _test_list_in_registry(True) +@is_trainium_test class NeuronHashTestCase(TestCase): def test_neuron_hash_is_not_mutable(self): bert_model = BertModel(BertConfig()) diff --git a/tests/test_examples.py b/tests/test_examples.py index 149486e65..38e1d23a1 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -24,7 +24,7 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar, Union from unittest import TestCase -import huggingface_hub +from huggingface_hub import get_token from transformers import ( CONFIG_MAPPING, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, @@ -40,7 +40,9 @@ ) from transformers.testing_utils import slow +from optimum.neuron.distributed.parallelizers_manager import ParallelizersManager from optimum.neuron.utils.cache_utils import load_custom_cache_repo_name_from_hf_home +from optimum.neuron.utils.import_utils import is_neuronx_distributed_available from optimum.neuron.utils.misc import string_to_bool from optimum.neuron.utils.runner import ExampleRunner from optimum.neuron.utils.testing_utils import is_trainium_test @@ -56,7 +58,9 @@ TypeOrDictOfType = Union[T, Dict[str, T]] -TOKEN = huggingface_hub.get_token() +TOKEN = get_token() +if os.environ.get("HF_TOKEN", None) is not None: + TOKEN = os.environ.get("HF_TOKEN") DEFAULT_CACHE_REPO = "optimum-internal-testing/optimum-neuron-cache-for-testing" SAVED_CUSTOM_CACHE_REPO = load_custom_cache_repo_name_from_hf_home() @@ -267,7 +271,7 @@ def __new__(cls, name, bases, attrs, example_name=None): for model_type, model_name_or_path, tp_support, config_overrides in models_to_test: # Regular training. attrs[f"test_{example_name}_{model_type}"] = cls._create_test( - model_type, model_name_or_path, 1, True, False, config_overrides + model_type, model_name_or_path, 1, 1, True, False, config_overrides ) # Training with ZeRO-1. @@ -277,13 +281,21 @@ def __new__(cls, name, bases, attrs, example_name=None): # ) tensor_parallel_size = 2 if tp_support is not TPSupport.NONE else 1 + + if not is_neuronx_distributed_available(): + pp_support = False + else: + pp_support = ParallelizersManager.parallelizer_for_model(model_type).supports_pipeline_parallelism() + pipeline_parallel_size = 4 if pp_support else 1 + disable_embedding_parallelization = tp_support is TPSupport.PARTIAL if tensor_parallel_size > 1: # Training with TP if supported. 
- attrs[f"test_{example_name}_{model_type}_with_tp"] = cls._create_test( + attrs[f"test_{example_name}_{model_type}_with_tp_only"] = cls._create_test( model_type, model_name_or_path, tensor_parallel_size, + 1, # No pipeline parallelism in this test. disable_embedding_parallelization, False, config_overrides, @@ -294,6 +306,40 @@ def __new__(cls, name, bases, attrs, example_name=None): # model_type, # model_name_or_path, # tensor_parallel_size, + # 1, # No pipeline parallelism in this test. + # disable_embedding_parallelization, + # True, + # config_overrides, + # ) + + if pipeline_parallel_size > 1: + # Training with PP if supported. + attrs[f"test_{example_name}_{model_type}_with_pp_only"] = cls._create_test( + model_type, + model_name_or_path, + 1, # No tensor parallelism in this test. + pipeline_parallel_size, + disable_embedding_parallelization, + False, + config_overrides, + ) + + if tensor_parallel_size > 1 and pipeline_parallel_size > 1: + attrs[f"test_{example_name}_{model_type}_with_tp_and_pp"] = cls._create_test( + model_type, + model_name_or_path, + tensor_parallel_size, + pipeline_parallel_size, + disable_embedding_parallelization, + False, + config_overrides, + ) + # TODO: enable when working on the multi-node training PR. + # attrs[f"test_{example_name}_{model_type}_with_tp_and_pp_and_zero1"] = cls._create_test( + # model_type, + # model_name_or_path, + # tensor_parallel_size, + # pipeline_parallel_size, # disable_embedding_parallelization, # True, # config_overrides, @@ -344,6 +390,7 @@ def _create_test( model_type: str, model_name_or_path: str, tensor_parallel_size: int, + pipeline_parallel_size: int, disable_embedding_parallelization: bool, zero_1: bool, config_overrides: Optional[Dict[str, Any]] = None, @@ -351,9 +398,6 @@ def _create_test( """ Creates a test function that runs an example for a model_name. - Args: - model_name (`str`): the model_name_or_path. - Returns: `Callable[[ExampleTesterBase], None]`: The test function that runs the example. 
""" @@ -395,6 +439,7 @@ def test(self): save_total_limit=1, learning_rate=self.LEARNING_RATE, tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, disable_embedding_parallelization=disable_embedding_parallelization, zero_1=zero_1, output_dir=tmpdirname, diff --git a/tests/test_runner.py b/tests/test_runner.py index 56c18dc38..56a2a3e19 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -17,7 +17,7 @@ import os from unittest import TestCase -import huggingface_hub +from huggingface_hub import get_token, login from parameterized import parameterized from optimum.neuron.utils.cache_utils import ( @@ -58,14 +58,22 @@ class TestExampleRunner(TestCase): @classmethod def setUpClass(cls): - cls._token = huggingface_hub.get_token() + cls._token = get_token() cls._cache_repo = load_custom_cache_repo_name_from_hf_home() cls._env = dict(os.environ) - set_custom_cache_repo_name_in_hf_home(cls.CACHE_REPO_NAME) + if os.environ.get("HF_TOKEN", None) is not None: + token = os.environ.get("HF_TOKEN") + + login(token) + set_custom_cache_repo_name_in_hf_home(cls.CACHE_REPO_NAME) + else: + raise RuntimeError("Please specify the token via the HF_TOKEN environment variable.") @classmethod def tearDownClass(cls): os.environ = cls._env + if cls._token is not None: + login(cls._token) if cls._cache_repo is not None: try: set_custom_cache_repo_name_in_hf_home(cls._cache_repo) diff --git a/tests/test_utils.py b/tests/test_utils.py index 4fc002bee..d10082ccf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -21,7 +21,7 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset from transformers import BertConfig, BertForSequenceClassification, PreTrainedModel, Wav2Vec2Config, Wav2Vec2Model -from optimum.neuron.trainers import MODEL_PATCHING_SPECS +from optimum.neuron.accelerate.accelerator import MODEL_PATCHING_SPECS from optimum.neuron.utils import ModelPatcher from optimum.neuron.utils.testing_utils import is_trainium_test from optimum.neuron.utils.training_utils import FirstAndLastDataset, is_model_officially_supported diff --git a/tests/utils.py b/tests/utils.py index 2b6caf8e8..f4b584e8c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -24,7 +24,7 @@ import torch from datasets import Dataset, DatasetDict -from huggingface_hub import CommitOperationDelete, HfApi, HfFolder, create_repo, delete_repo, logout +from huggingface_hub import CommitOperationDelete, HfApi, create_repo, delete_repo, get_token, login, logout from huggingface_hub.utils import RepositoryNotFoundError from transformers import PretrainedConfig, PreTrainedModel from transformers.testing_utils import ENDPOINT_STAGING @@ -135,7 +135,7 @@ def create_tiny_pretrained_model( class TrainiumTestMixin: @classmethod def setUpClass(cls): - cls._token = HfFolder.get_token() + cls._token = get_token() cls._cache_repo = load_custom_cache_repo_name_from_hf_home() cls._env = dict(os.environ) @@ -143,7 +143,7 @@ def setUpClass(cls): def tearDownClass(cls): os.environ = cls._env if cls._token is not None: - HfFolder.save_token(cls._token) + login(cls._token) if cls._cache_repo is not None: try: set_custom_cache_repo_name_in_hf_home(cls._cache_repo) @@ -161,10 +161,11 @@ class StagingTestMixin: MAX_NUM_LINEARS = 20 @classmethod - def set_hf_hub_token(cls, token: str) -> str: - orig_token = HfFolder.get_token() + def set_hf_hub_token(cls, token: Optional[str]) -> Optional[str]: + orig_token = get_token() + login(token=token) if token is not None: - HfFolder.save_token(token) + 
login(token=token)
         else:
             logout()
         cls._env = dict(os.environ, HF_ENDPOINT=ENDPOINT_STAGING)
@@ -214,8 +215,8 @@ def remove_all_files_in_repo(self, repo_id: str):
         except RepositoryNotFoundError:
             pass
 
-    def tearDown(self) -> None:
-        HfFolder.save_token(TOKEN)
+    def tearDown(self):
+        login(TOKEN)
         self.remove_all_files_in_repo(self.CUSTOM_CACHE_REPO)
         self.remove_all_files_in_repo(self.CUSTOM_PRIVATE_CACHE_REPO)
 
diff --git a/tools/create_examples_from_transformers.py b/tools/create_examples_from_transformers.py
index 61d25030d..c95b6a7c9 100755
--- a/tools/create_examples_from_transformers.py
+++ b/tools/create_examples_from_transformers.py
@@ -177,7 +177,10 @@ def wrap_with_lazy_load_for_parallelism(file_content: str) -> str:
         # Adding one tab to indent from the lazy_load_for_parallelism context manager.
         number_of_spaces += 4
         model_loading_content = " " * number_of_spaces + model_loading_content
-        new_content = f"with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size):\n{model_loading_content}\n"
+        new_content = (
+            "with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size, "
+            f"pipeline_parallel_size=training_args.pipeline_parallel_size):\n{model_loading_content}\n"
+        )
         file_content = file_content[:start] + new_content + file_content[position + 1 :]
         shift += len(new_content) - initial_length
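A minimal sketch of how the new helpers in `tests/distributed/utils.py` fit together, mirroring the flow of `_parallel_model_matches_original_model`: it assumes a Trainium host, relative imports from within the `tests/distributed` package (as the tests themselves use), and placeholder parallel sizes; the checkpoint is the tiny test model already referenced in `test_training.py`.

```python
# Sketch only: compose the helpers added in tests/distributed/utils.py.
from transformers import LlamaForCausalLM

from .utils import create_accelerator_for_mp, get_model, get_model_inputs

MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-random"
tp_size, pp_size, dp_size = 2, 2, 1  # placeholder parallel sizes

# Instantiate lazily so weights are only materialized once the parallel layout is
# known; the static seed patcher keeps every weight initialization deterministic.
model = get_model(
    LlamaForCausalLM,
    MODEL_NAME,
    tp_size=tp_size,
    pp_size=pp_size,
    lazy_load=True,
    use_static_seed_patcher=True,
)

# Dummy inputs (with labels), padded so that sequence parallelism can split the
# sequence dimension evenly across the tensor-parallel group.
inputs = get_model_inputs(model, MODEL_NAME, batch_size=dp_size, pad_to_multiple_of=tp_size)

# The accelerator wraps a ModelParallelismPlugin built from the same tp/pp sizes;
# prepare() performs the actual parallelization of the model.
accelerator = create_accelerator_for_mp(tp_size, pp_size)
model = accelerator.prepare(model)
```

In the tests above, the `accelerator.prepare(model)` call is additionally wrapped in the static seed patcher so that the parallel and original models start from identical weights before their outputs are compared.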