diff --git a/.github/workflows/test_trainium_common.yml b/.github/workflows/test_trainium_common.yml
index e665aefa1..d7cbc1e84 100644
--- a/.github/workflows/test_trainium_common.yml
+++ b/.github/workflows/test_trainium_common.yml
@@ -32,6 +32,8 @@ jobs:
         run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
       - name: Set pip repository pointing to the Neuron repository
         run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+      - name: Update pip
+        run: pip install -U pip
       - name: Install Python dependencies
         run: pip install .[tests,neuronx]
       - name: Run tests on Neuron cores
diff --git a/.github/workflows/test_trainium_distributed.yml b/.github/workflows/test_trainium_distributed.yml
index 1c2ebf3e8..bd8d68162 100644
--- a/.github/workflows/test_trainium_distributed.yml
+++ b/.github/workflows/test_trainium_distributed.yml
@@ -35,5 +35,5 @@ jobs:
         run: pip install .[tests,neuronx]
       - name: Run tests on Neuron cores
         run: |
-          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/ -v --durations=0 -x --ignore tests/distributed/test_training.py
diff --git a/docs/source/guides/distributed_training.mdx b/docs/source/guides/distributed_training.mdx
index d22141a4a..d15a332a0 100644
--- a/docs/source/guides/distributed_training.mdx
+++ b/docs/source/guides/distributed_training.mdx
@@ -182,11 +182,11 @@ Just as for ZeRO-1, it is possible to wrap the optimizer class to make it lazy.
 ```python
 from torch.optim import AdamW
 from optimum.neuron import NeuronAccelerator
-from optimum.neuron.accelerate.utils import TensorParallelismPlugin
+from optimum.neuron.accelerate.utils import ModelParallelismPlugin
 from optimum.neuron.distributed import lazy_load_for_parallelism
 
 tensor_parallel_size = 8
-tp_plugin = TensorParallelismPlugin(
+mp_plugin = ModelParallelismPlugin(
     tensor_parallel_size,
     parallelize_embeddings=True,
     sequence_parallel_enabled=True,
@@ -195,7 +195,7 @@ tp_plugin = TensorParallelismPlugin(
 
 accelerator = NeuronAccelerator(
     ...
-    tp_plugin=tp_plugin,
+    mp_plugin=mp_plugin,
 )
 
 with lazy_load_for_parallelism(tensor_parallel_size=tensor_parallel_size):
diff --git a/docs/source/package_reference/distributed.mdx b/docs/source/package_reference/distributed.mdx
index f23ceb6c0..7e295d5a2 100644
--- a/docs/source/package_reference/distributed.mdx
+++ b/docs/source/package_reference/distributed.mdx
@@ -24,7 +24,7 @@ The [`~optimum.neuron.distributed.Parallelizer`] class is the base abstract clas
 
 [[autodoc]] distributed.Parallelizer
     - _parallelize
     - parallelize
-    - optimizer_for_tp
+    - optimizer_for_mp
     - save_model_checkpoint
     - load_model_checkpoint
diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py
old mode 100644
new mode 100755
index 26340a43b..620167685
--- a/examples/image-classification/run_image_classification.py
+++ b/examples/image-classification/run_image_classification.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import sys
+import warnings
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -28,6 +29,7 @@
 from torchvision.transforms import (
     CenterCrop,
     Compose,
+    Lambda,
     Normalize,
     RandomHorizontalFlip,
     RandomResizedCrop,
@@ -56,7 +58,7 @@
 logger = logging.getLogger(__name__)
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.31.0")
+check_min_version("4.35.0")
 
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
 
@@ -143,12 +145,28 @@ class ModelArguments:
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
     image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    token: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
     use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
+        },
+    )
+    trust_remote_code: bool = field(
         default=False,
         metadata={
             "help": (
-                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-                "with private models)."
+                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
+                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                "execute code present on the Hub on your local machine."
             )
         },
     )
@@ -177,6 +195,15 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
+    if model_args.use_auth_token is not None:
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
+        if model_args.token is not None:
+            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+        model_args.token = model_args.use_auth_token
+
     # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
     # information sent is the one passed as arguments along with your Python/PyTorch versions.
send_example_telemetry("run_image_classification", model_args, data_args) @@ -200,8 +227,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -230,7 +257,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, task="image-classification", - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -277,16 +304,21 @@ def compute_metrics(p): finetuning_task="image-classification", cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForImageClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -294,7 +326,8 @@ def compute_metrics(p): model_args.image_processor_name or model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Define torchvision transforms to be applied to each image. @@ -302,7 +335,11 @@ def compute_metrics(p): size = image_processor.size["shortest_edge"] else: size = (image_processor.size["height"], image_processor.size["width"]) - normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) + normalize = ( + Normalize(mean=image_processor.image_mean, std=image_processor.image_std) + if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std") + else Lambda(lambda x: x) + ) _train_transforms = Compose( [ RandomResizedCrop(size), diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index aa0e346c1..d54efc143 100755 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -25,6 +25,7 @@ import math import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -56,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -77,7 +78,7 @@ class ModelArguments: default=None, metadata={ "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." + "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." ) }, ) @@ -112,12 +113,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -135,7 +152,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " "set True will benefit LLM loading time and RAM consumption." ) }, @@ -239,6 +256,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
send_example_telemetry("run_clm", model_args, data_args) @@ -263,8 +289,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -301,7 +327,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): @@ -310,7 +336,7 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( @@ -318,7 +344,7 @@ def main(): data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) else: @@ -340,7 +366,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. @@ -350,7 +376,7 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) raw_datasets["train"] = load_dataset( @@ -358,7 +384,7 @@ def main(): data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, **dataset_args, ) @@ -374,7 +400,8 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) @@ -392,7 +419,8 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) @@ -400,7 +428,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You are instantiating a new tokenizer from scratch. This is not supported by this script. 
" "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) @@ -410,21 +438,28 @@ def main(): if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, torch_dtype=torch_dtype, low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): - model = AutoModelForCausalLM.from_config(config) + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): + model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") @@ -476,17 +511,16 @@ def tokenize_function(examples): if data_args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > config.max_position_embeddings: logger.warning( - "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" - " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" - " override this default with `--block_size xxx`." + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." ) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if data_args.block_size > tokenizer.model_max_length: logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model " f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) @@ -512,7 +546,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): if not data_args.streaming: diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 083694c0e..b917291c6 100755 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -25,6 +25,7 @@ import math import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -54,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -108,12 +109,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -121,7 +138,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " "set True will benefit LLM loading time and RAM consumption." ) }, @@ -239,6 +256,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
send_example_telemetry("run_mlm", model_args, data_args) @@ -263,8 +289,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") @@ -302,7 +328,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): @@ -311,7 +337,7 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( @@ -319,7 +345,7 @@ def main(): data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, ) else: @@ -336,7 +362,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. @@ -346,14 +372,14 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at @@ -367,7 +393,8 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) @@ -385,7 +412,8 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) @@ -393,26 +421,33 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." 
+ "You are instantiating a new tokenizer from scratch. This is not supported by this script. " "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForMaskedLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: logger.info("Training new model from scratch") - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): - model = AutoModelForMaskedLM.from_config(config) + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): + model = AutoModelForMaskedLM.from_config(config, trust_remote_code=model_args.trust_remote_code) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. @@ -440,7 +475,7 @@ def main(): else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) @@ -525,7 +560,7 @@ def group_texts(examples): # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): if not data_args.streaming: diff --git a/examples/multiple-choice/run_swag.py b/examples/multiple-choice/run_swag.py index cd522127a..fa8396fd0 100755 --- a/examples/multiple-choice/run_swag.py +++ b/examples/multiple-choice/run_swag.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from itertools import chain from typing import Optional, Union @@ -48,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") logger = logging.getLogger(__name__) @@ -80,12 +81,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." 
+ ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -226,6 +243,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_swag", model_args, data_args) @@ -250,8 +276,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -293,7 +319,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Downloading and loading the swag dataset from the hub. @@ -301,7 +327,7 @@ def main(): "swag", "regular", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -315,23 +341,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForMultipleChoice.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # When using your own dataset or a different dataset from swag, you will probably need to change this. @@ -351,7 +383,7 @@ def main(): else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index b369571e9..c872e9a05 100755 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -50,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -80,12 +81,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. 
This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -228,6 +245,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_qa", model_args, data_args) @@ -252,8 +278,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -290,7 +316,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -309,7 +335,7 @@ def main(): data_files=data_files, field="data", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -323,23 +349,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # Tokenizer check: this script requires a fast tokenizer. 
@@ -367,7 +399,7 @@ def main(): if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index fe5213a8d..abb883c0a 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import List, Optional, Tuple @@ -47,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -81,12 +82,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -155,7 +172,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_answer_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_answer_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." ) @@ -274,6 +291,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
send_example_telemetry("run_seq2seq_qa", model_args, data_args) @@ -298,8 +324,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -336,7 +362,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -354,7 +380,7 @@ def main(): data_files=data_files, field="data", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -368,23 +394,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch @@ -441,13 +473,13 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) diff --git a/examples/question-answering/trainer_seq2seq_qa.py b/examples/question-answering/trainer_seq2seq_qa.py index a4acb5ee6..6e04bf3f6 100644 --- a/examples/question-answering/trainer_seq2seq_qa.py +++ b/examples/question-answering/trainer_seq2seq_qa.py @@ -47,12 +47,13 @@ def evaluate( **gen_kwargs, ) -> Dict[str, float]: gen_kwargs = gen_kwargs.copy() - gen_kwargs["max_length"] = ( - gen_kwargs["max_length"] if gen_kwargs.get("max_length") is not None else self.args.generation_max_length - ) - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) + + # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the + # training args + if gen_kwargs.get("max_length") is None and self.args.generation_max_length is not None: + gen_kwargs["max_length"] = self.args.generation_max_length + if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: + gen_kwargs["num_beams"] = self.args.generation_num_beams self._gen_kwargs = gen_kwargs eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 4b05b3b08..5a442c075 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -53,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -100,12 +101,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -189,7 +206,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." 
) @@ -248,14 +265,14 @@ class DataTrainingArguments: }, ) source_prefix: Optional[str] = field( - default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."} + default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} ) forced_bos_token: Optional[str] = field( default=None, metadata={ "help": ( - "The token to force as the first generated token after the decoder_start_token_id." + "The token to force as the first generated token after the decoder_start_token_id. " "Useful for multilingual models like mBART where the first generated token" "needs to be the target language token (Usually it is the target language token)" ) @@ -313,6 +330,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_summarization", model_args, data_args) @@ -337,8 +363,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -387,7 +413,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -404,7 +430,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -418,23 +444,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch @@ -532,7 +564,7 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) @@ -694,7 +726,13 @@ def compute_metrics(eval_preds): results = {} if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(metric_key_prefix="eval") + if isinstance(eval_dataset, dict): + metrics = {} + for eval_ds_name, eval_ds in eval_dataset.items(): + dataset_metrics = trainer.evaluate(eval_dataset=eval_ds, metric_key_prefix=f"eval_{eval_ds_name}") + metrics.update(dataset_metrics) + else: + metrics = trainer.evaluate(metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 31d2cc67a..75b321be0 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -20,6 +20,7 @@ import os import random import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -49,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -189,12 +190,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -217,6 +234,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_glue", model_args, data_args) @@ -241,8 +267,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -282,7 +308,7 @@ def main(): "glue", data_args.task_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) elif data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. @@ -290,7 +316,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Loading a dataset from your local files. @@ -319,7 +345,7 @@ def main(): "csv", data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: # Loading a dataset from local json files @@ -327,7 +353,7 @@ def main(): "json", data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -362,23 +388,29 @@ def main(): finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -432,7 +464,7 @@ def main(): if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) diff --git a/examples/text-classification/run_xnli.py b/examples/text-classification/run_xnli.py index 339a649fe..4b06d2653 100755 --- a/examples/text-classification/run_xnli.py +++ b/examples/text-classification/run_xnli.py @@ -21,6 +21,7 @@ import os import random import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -49,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -153,12 +154,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. 
This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -176,6 +193,15 @@ def main(): parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_xnli", model_args) @@ -200,8 +226,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -233,7 +259,7 @@ def main(): model_args.language, split="train", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: train_dataset = load_dataset( @@ -241,7 +267,7 @@ def main(): model_args.train_language, split="train", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) label_list = train_dataset.features["label"].names @@ -251,7 +277,7 @@ def main(): model_args.language, split="validation", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) label_list = eval_dataset.features["label"].names @@ -261,7 +287,7 @@ def main(): model_args.language, split="test", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) label_list = predict_dataset.features["label"].names @@ -279,7 +305,8 @@ def main(): finetuning_task="xnli", cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, @@ -287,16 +314,21 @@ def main(): cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = 
AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py index ba33cd4a5..b8d870a23 100755 --- a/examples/token-classification/run_ner.py +++ b/examples/token-classification/run_ner.py @@ -22,6 +22,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -50,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") @@ -80,12 +81,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -218,6 +235,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
send_example_telemetry("run_ner", model_args, data_args) @@ -242,8 +268,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -280,7 +306,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -349,7 +375,8 @@ def get_label_list(labels): finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path @@ -359,7 +386,8 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, add_prefix_space=True, ) else: @@ -368,17 +396,22 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index cc1d79239..31d40b2c3 100755 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -21,6 +21,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from typing import Optional @@ -53,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.31.0") +check_min_version("4.35.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") @@ -90,12 +91,28 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) use_auth_token: bool = field( + default=None, + metadata={ + "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead." + }, + ) + trust_remote_code: bool = field( default=False, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." ) }, ) @@ -157,7 +174,7 @@ class DataTrainingArguments: metadata={ "help": ( "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " "during ``evaluate`` and ``predict``." ) @@ -262,6 +279,15 @@ def main(): else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() + if model_args.use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.", + FutureWarning, + ) + if model_args.token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + model_args.token = model_args.use_auth_token + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_translation", model_args, data_args) @@ -286,8 +312,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") @@ -336,7 +362,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -353,10 +379,10 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. + # https://huggingface.co/docs/datasets/loading. 
# Load pretrained model and tokenizer # @@ -367,23 +393,29 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) - with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + with lazy_load_for_parallelism( + tensor_parallel_size=training_args.tensor_parallel_size, + pipeline_parallel_size=training_args.pipeline_parallel_size, + ): model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch @@ -444,7 +476,7 @@ def main(): if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " f"`{model.__class__.__name__}`. 
This will lead to loss being calculated twice and will take up more memory" ) diff --git a/optimum/neuron/__init__.py b/optimum/neuron/__init__.py index f9ceb961d..92082cc7a 100644 --- a/optimum/neuron/__init__.py +++ b/optimum/neuron/__init__.py @@ -47,12 +47,13 @@ "NeuronAccelerator", "NeuronAcceleratorState", "NeuronPartialState", + "ModelParallelismPlugin", ], "pipelines": ["pipeline"], } if TYPE_CHECKING: - from .accelerate import NeuronAccelerator, NeuronAcceleratorState, NeuronPartialState + from .accelerate import ModelParallelismPlugin, NeuronAccelerator, NeuronAcceleratorState, NeuronPartialState from .hf_argparser import NeuronHfArgumentParser from .modeling import ( NeuronModelForCausalLM, diff --git a/optimum/neuron/accelerate/__init__.py b/optimum/neuron/accelerate/__init__.py index e39649fd7..7a611f826 100644 --- a/optimum/neuron/accelerate/__init__.py +++ b/optimum/neuron/accelerate/__init__.py @@ -15,4 +15,4 @@ from .accelerator import NeuronAccelerator from .state import NeuronAcceleratorState, NeuronPartialState -from .utils.dataclasses import NeuronDistributedType +from .utils.dataclasses import ModelParallelismPlugin, NeuronDistributedType diff --git a/optimum/neuron/accelerate/accelerator.py b/optimum/neuron/accelerate/accelerator.py index cf7437175..af3f691ff 100644 --- a/optimum/neuron/accelerate/accelerator.py +++ b/optimum/neuron/accelerate/accelerator.py @@ -15,13 +15,14 @@ """Custom Accelerator class for Neuron.""" import collections +import contextlib import inspect import os import re import shutil from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union import torch from accelerate import Accelerator @@ -34,22 +35,26 @@ from ...utils import logging from ..distributed import Parallelizer, ParallelizersManager from ..utils import ( + DynamicPatch, ModelPatcher, Patcher, is_neuronx_distributed_available, is_torch_xla_available, patch_within_function, + patched_finfo, ) from ..utils.misc import args_and_kwargs_to_kwargs_only -from ..utils.require_utils import requires_neuronx_distributed +from ..utils.require_utils import requires_neuronx_distributed, requires_torch_xla from .optimizer import NeuronAcceleratedOptimizer from .scheduler import NeuronAcceleratedScheduler from .state import NeuronAcceleratorState from .utils import ( + ModelParallelismPlugin, NeuronDistributedType, NeuronFullyShardedDataParallelPlugin, - TensorParallelismPlugin, + get_tied_parameters_dict, patch_accelerate_is_tpu_available, + tie_parameters, ) from .utils.operations import _xla_gather @@ -75,10 +80,25 @@ logger = logging.get_logger(__name__) -# TODO: should we do a XLAFSDPNeuronAccelerator instead? 
+MODEL_PATCHING_SPECS = [ + ("config.layerdrop", 0), + ("no_sync", lambda: contextlib.nullcontext()), + ( + "forward", + DynamicPatch(patch_within_function(("torch.finfo", patched_finfo))), + ), +] + +NxDPPMODEL_PATCHING_SPECS = [ + ( + "forward", + DynamicPatch(patch_within_function(("torch.finfo", patched_finfo))), + ), +] + + class NeuronAccelerator(Accelerator): - # @patch_within_function(("accelerate.accelerator.AcceleratorState", NeuronAcceleratorState)) - def __init__(self, *args, tp_plugin: Optional[TensorParallelismPlugin] = None, zero_1: bool = False, **kwargs): + def __init__(self, *args, mp_plugin: Optional[ModelParallelismPlugin] = None, zero_1: bool = False, **kwargs): # Patches accelerate.utils.imports.is_tpu_available to match `is_torch_xla_available` patch_accelerate_is_tpu_available() @@ -113,18 +133,28 @@ def __init__(self, *args, tp_plugin: Optional[TensorParallelismPlugin] = None, z self.fsdp_plugin = fsdp_plugin use_neuronx_distributed_tp = os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_TP", "false") - if tp_plugin is None: + use_neuronx_distributed_pp = os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_PP", "false") + if mp_plugin is None: if use_neuronx_distributed_tp == "false": tp_size = 1 else: tp_size = int(use_neuronx_distributed_tp) - tp_plugin = TensorParallelismPlugin(tensor_parallel_size=tp_size, parallelize_embeddings=True) + if use_neuronx_distributed_pp == "false": + pp_size = 1 + else: + pp_size = int(use_neuronx_distributed_pp) + mp_plugin = ModelParallelismPlugin( + tensor_parallel_size=tp_size, parallelize_embeddings=True, pipeline_parallel_size=pp_size + ) self._model_cpu_parameters_to_xla = {} - if tp_plugin.should_parallelize: + if mp_plugin.tensor_parallel_size > 1: os.environ["ACCELERATE_USE_NEURONX_DISTRIBUTED_TP"] = "true" - patched_accelerator_state = partial(NeuronAcceleratorState, tp_plugin=tp_plugin) + if mp_plugin.pipeline_parallel_size > 1: + os.environ["ACCELERATE_USE_NEURONX_DISTRIBUTED_PP"] = "true" + + patched_accelerator_state = partial(NeuronAcceleratorState, mp_plugin=mp_plugin) with Patcher([("accelerate.accelerator.AcceleratorState", patched_accelerator_state)]): super().__init__(**full_kwargs) @@ -136,7 +166,7 @@ def __init__(self, *args, tp_plugin: Optional[TensorParallelismPlugin] = None, z if self.process_index == -1 and self.zero_1: raise ValueError("XLA ZeRO Stage 1 can only be enabled in a distributed training setting.") - if fsdp_plugin is not None and tp_plugin is not None: + if fsdp_plugin is not None and mp_plugin is not None: raise ValueError("It is not possible to both use neuronx_distributed Tensor Parallelism and XLA FSDP.") if num_steps != 1: @@ -164,7 +194,7 @@ def _prepare_data_loader_for_distributed( sampler = DistributedSampler(data_loader.dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - data_loader_for_tp = DataLoader( + distributed_dataloader = DataLoader( data_loader.dataset, batch_size=data_loader.batch_size, sampler=sampler, @@ -173,11 +203,11 @@ def _prepare_data_loader_for_distributed( pin_memory=data_loader.pin_memory, drop_last=data_loader.drop_last, ) - data_loader_for_tp._is_accelerate_prepared = True - return data_loader_for_tp + distributed_dataloader._is_accelerate_prepared = True + return distributed_dataloader def prepare_data_loader(self, data_loader: DataLoader, device_placement: Optional[bool] = None): - if self.state.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + if self.state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: from 
neuronx_distributed import parallel_layers num_replicas = parallel_layers.parallel_state.get_data_parallel_size() @@ -187,15 +217,17 @@ def prepare_data_loader(self, data_loader: DataLoader, device_placement: Optiona rank = xm.get_ordinal() if self.state.num_processes > 1: data_loader = self._prepare_data_loader_for_distributed(data_loader, num_replicas=num_replicas, rank=rank) - data_loader = MpDeviceLoader(data_loader, self.device) + # No need to wrap the dataloader if we are using pipeline parallelism. + if self.state.mp_plugin.pipeline_parallel_size == 1: + data_loader = MpDeviceLoader(data_loader, self.device) return data_loader # TODO: fix that. # return super().prepare_data_loader(data_loader, device_placement=device_placement) - def _prepare_optimizer_for_tp(self, optimizer: torch.optim.Optimizer, device_placement=None): + def _prepare_optimizer_for_mp(self, optimizer: torch.optim.Optimizer, device_placement=None): cpu_parameters_to_xla = collections.ChainMap(*self._model_cpu_parameters_to_xla.values()) if not self.zero_1: - optimizer = Parallelizer.optimizer_for_tp(optimizer, cpu_parameters_to_xla) + optimizer = Parallelizer.optimizer_for_mp(optimizer, cpu_parameters_to_xla) else: xla_parameters, _ = Parallelizer.optimizer_cpu_params_to_xla_params(optimizer, cpu_parameters_to_xla) if hasattr(optimizer, "_args_to_recreate"): @@ -234,6 +266,7 @@ def _prepare_optimizer_for_zero_1(self, optimizer: torch.optim.Optimizer, device args, kwargs = optimizer._args_to_recreate params = args[0] defaults = args_and_kwargs_to_kwargs_only(optimizer.__class__, args[1:], kwargs) + zero_1_optimizer = NeuronZero1Optimizer( params, optimizer.__class__, @@ -262,16 +295,36 @@ def _prepare_optimizer_for_zero_1(self, optimizer: torch.optim.Optimizer, device @patch_within_function(("accelerate.accelerator.AcceleratedOptimizer", NeuronAcceleratedOptimizer)) def prepare_optimizer(self, optimizer: torch.optim.Optimizer, device_placement: Optional[bool] = None): - if self.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: - optimizer = self._prepare_optimizer_for_tp(optimizer, device_placement=device_placement) + if self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + optimizer = self._prepare_optimizer_for_mp(optimizer, device_placement=device_placement) if self.zero_1: optimizer = self._prepare_optimizer_for_zero_1(optimizer, device_placement=device_placement) + # Edge case: if the optimizer was created lazily outside of the Model Parallelism and/or ZeRO-1 setting, we make + # sure to actually load the proper parameters. 
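Besides the explicit `mp_plugin` argument shown above, `NeuronAccelerator.__init__` keeps an environment-variable fallback: when no plugin is passed, `ACCELERATE_USE_NEURONX_DISTRIBUTED_TP` and `ACCELERATE_USE_NEURONX_DISTRIBUTED_PP` decide the tensor and pipeline parallel sizes, with the default string `"false"` mapping to a size of 1. A minimal sketch, assuming a Trainium host with `neuronx_distributed` installed and a proper `torchrun`/XLA launch:

```python
import os

# Read by NeuronAccelerator.__init__ when no ModelParallelismPlugin is provided.
os.environ["ACCELERATE_USE_NEURONX_DISTRIBUTED_TP"] = "8"  # tensor parallel size
os.environ["ACCELERATE_USE_NEURONX_DISTRIBUTED_PP"] = "2"  # pipeline parallel size

from optimum.neuron import NeuronAccelerator

# Builds an implicit ModelParallelismPlugin(tensor_parallel_size=8,
# parallelize_embeddings=True, pipeline_parallel_size=2) under the hood.
accelerator = NeuronAccelerator(zero_1=False)
```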
+ if hasattr(optimizer, "_args_to_recreate"): + args, kwargs = optimizer._args_to_recreate + optimizer = optimizer.__class__(*args, **kwargs) + return super().prepare_optimizer(optimizer, device_placement=device_placement) @patch_within_function(("accelerate.accelerator.AcceleratedScheduler", NeuronAcceleratedScheduler)) def prepare_scheduler(self, scheduler: "LRScheduler"): return super().prepare_scheduler(scheduler) + @staticmethod + def patch_model_for_neuron( + model: "torch.nn.Module", patching_specs: Optional[List[Tuple[str, Any]]] = None + ) -> "torch.nn.Module": + if patching_specs is None: + patching_specs = MODEL_PATCHING_SPECS + prepared_patching_specs = [] + for spec in patching_specs: + prepared_patching_specs.append((model,) + spec) + + model_patcher = ModelPatcher(prepared_patching_specs, ignore_missing_attributes=True) + model_patcher.patch() + return model + def prepare_model_for_xla_fsdp( self, model: torch.nn.Module, device_placement: Optional[bool] = None, evaluation_mode: bool = False ): @@ -342,49 +395,92 @@ def prepare_model_for_xla_fsdp( return model - def _prepare_model_for_tp( + @requires_neuronx_distributed + def _prepare_model_for_mp( self, model: torch.nn.Module, device_placement: Optional[bool] = None, evaluation_mode: bool = False ): + from neuronx_distributed.pipeline import NxDPPModel + if model in self._models or Parallelizer.was_parallelized(model): return model - cpu_ids = [id(v) for v in model.parameters()] + cpu_ids = {name: id(param) for name, param in model.named_parameters()} + tied_parameters_dict = get_tied_parameters_dict(model) + model_main_input_name = getattr(model, "main_input_name", None) # TODO: enable self.device (if needed). - model = self.state.tp_plugin.parallelize_model(model, device=None) + model = self.state.mp_plugin.parallelize_model(model, device=None) + + if model_main_input_name is not None: + setattr(model, "main_input_name", model_main_input_name) + + if isinstance(model, NxDPPModel): + model.local_module = self.patch_model_for_neuron( + model.local_module, patching_specs=NxDPPMODEL_PATCHING_SPECS + ) + model_to_cast = model.local_module + else: + model_to_cast = model + + model_to_cast = model.local_module if isinstance(model, NxDPPModel) else model if os.environ.get("XLA_USE_BF16", "0") == "1" or os.environ.get("XLA_DOWNCAST_BF16", "0") == "1": - model.to(torch.bfloat16) + model_to_cast.to(torch.bfloat16) else: - model.to(torch.float32) + model_to_cast.to(torch.float32) - def _tie_or_clone_weights_for_tp(self, output_embeddings, input_embeddings): + def _tie_or_clone_weights_for_mp(self, output_embeddings, input_embeddings): """Tie or clone module weights depending of whether we are using TorchScript or not""" output_embeddings.weight = input_embeddings.weight if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): output_embeddings.out_features = input_embeddings.num_embeddings - with ModelPatcher(patching_specs=[(model, "_tie_or_clone_weights", _tie_or_clone_weights_for_tp)]): - model.tie_weights() - move_model_to_device(model, self.device) - model.tie_weights() - self._model_cpu_parameters_to_xla[id(model)] = dict(zip(cpu_ids, model.parameters())) + if isinstance(model, NxDPPModel): + with ModelPatcher(patching_specs=[(model, "_tie_or_clone_weights", _tie_or_clone_weights_for_mp)]): + model.move_model_to_device() + tie_parameters(model, tied_parameters_dict) + xla_params = dict(model.local_named_parameters()) + self._model_cpu_parameters_to_xla[id(model)] = { + cpu_ids[name]: 
xla_params[name] for name, _ in model.local_named_parameters() + } + else: + with ModelPatcher(patching_specs=[(model, "_tie_or_clone_weights", _tie_or_clone_weights_for_mp)]): + move_model_to_device(model, self.device) + tie_parameters(model, tied_parameters_dict) + xla_params = dict(model.named_parameters()) + symmetric_diff = set(cpu_ids.keys()).symmetric_difference((xla_params.keys())) + if symmetric_diff: + raise ValueError( + f"The parameters on CPU do not match the parameters on the XLA device: {', '.join(symmetric_diff)}." + ) + + self._model_cpu_parameters_to_xla[id(model)] = { + cpu_ids[name]: xla_params[name] for name, _ in model.named_parameters() + } + device_placement = False return super().prepare_model(model, device_placement=device_placement, evaluation_mode=evaluation_mode) + @requires_torch_xla + @requires_neuronx_distributed def prepare_model( self, model: torch.nn.Module, device_placement: Optional[bool] = None, evaluation_mode: bool = False ): # If the model was already prepared, we skip. if model in self._models: return model + + model = self.patch_model_for_neuron(model) + if self.distributed_type is NeuronDistributedType.XLA_FSDP: return self.prepare_model_for_xla_fsdp( model, device_placement=device_placement, evaluation_mode=evaluation_mode ) - elif self.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: - return self._prepare_model_for_tp( + elif self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + return self._prepare_model_for_mp( model, device_placement=device_placement, evaluation_mode=evaluation_mode ) + move_model_to_device(model, xm.xla_device()) + device_placement = False return super().prepare_model(model, device_placement=device_placement, evaluation_mode=evaluation_mode) def backward_for_xla_fsdp(self, loss, **kwargs): @@ -410,11 +506,15 @@ def clip_grad_norm_for_xla_fsdp(self, parameters, max_norm, norm_type: int = 2): if parameters == list(model.parameters()): return model.clip_grad_norm_(max_norm, norm_type) + @requires_neuronx_distributed def _prepare_clip_grad_norm(self, parameters, max_norm, norm_type: int = 2): + from neuronx_distributed.pipeline import NxDPPModel + self.unscale_gradients() parameters = list(parameters) for model in self._models: - if parameters == list(model.parameters()): + model_parameters = model.local_parameters() if isinstance(model, NxDPPModel) else model.parameters() + if parameters == list(model_parameters) or self.zero_1: for opt in self._optimizers: # Under this setting, the gradient clipping will be deferred to the optimizer step. # It will happen after the gradients have been reduced and before the optimizer step. 
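The tied-weight bookkeeping used in `_prepare_model_for_mp` above, `get_tied_parameters_dict` before parallelization and `tie_parameters` after the move to the XLA device, comes down to recording duplicate parameter objects by name and re-pointing them afterwards. A simplified, CPU-only re-implementation of the detection step (assuming `torch>=2.0` for `named_parameters(remove_duplicate=False)`; the `NxDPPModel` branch of the real utilities is omitted):

```python
from typing import Dict

from torch import nn


def tied_parameters_dict(module: nn.Module) -> Dict[str, str]:
    """Map each duplicate parameter name to the first name it was registered under."""
    seen: Dict[nn.Parameter, str] = {}
    tied: Dict[str, str] = {}
    for name, param in module.named_parameters(remove_duplicate=False):
        if param in seen:
            tied[name] = seen[param]
        else:
            seen[param] = name
    return tied


class TinyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(10, 4)
        self.lm_head = nn.Linear(4, 10, bias=False)
        self.lm_head.weight = self.embed.weight  # weight tying


print(tied_parameters_dict(TinyLM()))  # {'lm_head.weight': 'embed.weight'}
```

`tie_parameters` then walks this mapping on the device copy and reassigns the surviving parameter onto the module that held the duplicate, so the tie survives both parallelization and the CPU-to-XLA move.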
@@ -423,7 +523,7 @@ def _prepare_clip_grad_norm(self, parameters, max_norm, norm_type: int = 2): def clip_grad_norm_(self, parameters, max_norm, norm_type=2): if self.distributed_type is NeuronDistributedType.XLA_FSDP: return self.clip_grad_norm_for_xla_fsdp(parameters, max_norm, norm_type=norm_type) - elif self.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM or self.zero_1: + elif self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM or self.zero_1: return self._prepare_clip_grad_norm(parameters, max_norm, norm_type=norm_type) return super().clip_grad_norm_(parameters, max_norm, norm_type=norm_type) @@ -434,7 +534,7 @@ def clip_grad_value_(self, parameters, clip_value): def _custom_save_state( self, - save_model_func: Callable[["Accelerator", "PreTrainedModel", Union[str, Path], int], Any], + save_model_func: Optional[Callable[["Accelerator", "PreTrainedModel", Union[str, Path], int], Any]], save_optimizer_func: Callable[ ["Accelerator", "torch.optim.Optimizer", "PreTrainedModel", Union[str, Path], int], Any ], @@ -475,18 +575,25 @@ def _inner(folder): xm.mark_step() # Save the models - weights = [] - for i, model in enumerate(self._models): - save_model_func(self, model, output_dir, i) + if save_model_func is not None: + for i, model in enumerate(self._models): + save_model_func(self, model, output_dir, i) # Save the optimizers - optimizers = [] - for i, opt in enumerate(self._optimizers): + if not self._optimizers and save_model_func is None: + optimizers = [None] * len(self._models) + else: + optimizers = self._optimizers + for i, opt in enumerate(optimizers): save_optimizer_func(self, opt, self._models[i], output_dir, i) # Save the lr schedulers taking care of DeepSpeed nuances schedulers = self._schedulers + # Setting those to be empty list so that `save_accelerator_state` does not redo the job. + weights = [] + optimizers = [] + # Call model loading hooks that might have been registered with # accelerator.register_model_state_hook for hook in self._save_model_state_pre_hook.values(): @@ -515,15 +622,15 @@ def save_optimizer_func(accelerator, optimizer, model, output_dir, i): save_model_func, save_optimizer_func, output_dir=output_dir, **save_model_func_kwargs ) - def save_state_for_tp(self, output_dir: Optional[str] = None, **save_model_func_kwargs): - def save_model_func(accelelerator, model, output_dir, i): - return + def save_state_for_mp(self, output_dir: Optional[str] = None, **save_model_func_kwargs): + # The model is saved at the same time as the optimizer. 
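A hypothetical call site for the renamed checkpoint path (the output directory is illustrative): once the accelerator runs under `NeuronDistributedType.MODEL_PARALLELISM`, `save_state` dispatches to `save_state_for_mp`, which leaves `save_model_func` as `None` and lets the parallelizer write the sharded model together with the optimizer.

```python
# Assumes `accelerator` is a NeuronAccelerator whose ModelParallelismPlugin has a
# tensor and/or pipeline parallel size > 1, so the sharded checkpoint is written by
# Parallelizer.save_model_checkpoint rather than by accelerate's default state saving.
accelerator.save_state("outputs/checkpoint-1000")
```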
+ save_model_func = None def save_optimizer_func(accelerator, optimizer, model, output_dir, i): - logger.info("Saving TP model and optimizer") + logger.info("Saving parallel model and optimizer") parallelizer = ParallelizersManager.parallelizer_for_model(model) parallelizer.save_model_checkpoint(model, output_dir, as_regular=False, optimizer=optimizer) - logger.info(f"TP model and optimizer saved to the directory {output_dir}") + logger.info(f"Parallel model and optimizer saved to the directory {output_dir}") return self._custom_save_state( save_model_func, save_optimizer_func, output_dir=output_dir, **save_model_func_kwargs @@ -533,8 +640,8 @@ def save_optimizer_func(accelerator, optimizer, model, output_dir, i): def save_state(self, output_dir: Optional[str] = None, **save_model_func_kwargs) -> str: if self.distributed_type is NeuronDistributedType.XLA_FSDP: return self.save_state_for_xla_fsdp(output_dir=output_dir, **save_model_func_kwargs) - elif self.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: - return self.save_state_for_tp(output_dir=output_dir, **save_model_func_kwargs) + elif self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + return self.save_state_for_mp(output_dir=output_dir, **save_model_func_kwargs) return super().save_state(output_dir=output_dir, **save_model_func_kwargs) def gather(self, tensor, out_of_graph: bool = False): diff --git a/optimum/neuron/accelerate/optimizer.py b/optimum/neuron/accelerate/optimizer.py index e55221a27..d62709179 100644 --- a/optimum/neuron/accelerate/optimizer.py +++ b/optimum/neuron/accelerate/optimizer.py @@ -14,18 +14,17 @@ # limitations under the License. """Custom AcceleratedOptimizer for Neuron.""" -from typing import TYPE_CHECKING, Optional +from typing import Optional +import torch from accelerate.optimizer import AcceleratedOptimizer from accelerate.utils import DistributedType -from ..utils import is_neuronx_distributed_available, is_torch_xla_available +from ..utils import is_torch_xla_available +from ..utils.require_utils import requires_neuronx_distributed from .utils.dataclasses import NeuronDistributedType -if TYPE_CHECKING: - import torch - if is_torch_xla_available(): import accelerate import torch_xla.core.xla_model as xm @@ -33,8 +32,29 @@ accelerate.optimizer.xm = xm -if is_neuronx_distributed_available(): - from neuronx_distributed import parallel_layers + +@requires_neuronx_distributed +def allreduce_sequence_parallel_gradients(optimizer): + """ + All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. + + Modified from megatron-lm: + https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 + """ + from neuronx_distributed.parallel_layers.mappings import reduce_from_tensor_model_parallel_region + + grads = [] + for param_group in optimizer.__getstate__()["param_groups"]: + for group, params in param_group.items(): + if group == "params": + for p in params: + if isinstance(p, torch.Tensor) and p.grad is not None: + sequence_parallel_param = getattr(p, "sequence_parallel_enabled", False) + if sequence_parallel_param: + grads.append(p.grad.data) + for grad in grads: + # sum v.s. 
average: sum + reduce_from_tensor_model_parallel_region(grad) class NeuronAcceleratedOptimizer(AcceleratedOptimizer): @@ -49,7 +69,7 @@ def __init__( self.parameters = [] self.parameter_ids = {} self.clip_grad_norm_to_perform = None - if self.accelerator_state.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + if self.accelerator_state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: self.parameters = [p for group in self.optimizer.param_groups for p in group["params"]] self.parameter_ids = {id(p) for p in self.parameters} @@ -59,11 +79,19 @@ def load_state_dict(self, state_dict): def prepare_clip_grad_norm(self, parameters, max_norm, norm_type=2): parameter_ids = {id(p) for p in parameters} - if parameter_ids == self.parameter_ids: + if parameter_ids == self.parameter_ids or isinstance(self.optimizer, ZeroRedundancyOptimizer): self.clip_grad_norm_to_perform = {"max_norm": max_norm, "norm_type": norm_type} + @requires_neuronx_distributed def step(self, closure=None): + from neuronx_distributed import parallel_layers + from neuronx_distributed.parallel_layers.grads import bucket_allreduce_gradients + if self.gradient_state.sync_gradients: + # For sequence-parallel, we have to explicitly all-reduce the layernorm gradients. + if self.accelerator_state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + allreduce_sequence_parallel_gradients(self.optimizer) + if isinstance(self.optimizer, ZeroRedundancyOptimizer): if self.clip_grad_norm_to_perform is not None: # `ZeroRedundancyOptimizer` does not allow to pass a norm type, it could be done but postponing for @@ -74,18 +102,21 @@ def step(self, closure=None): self.optimizer.grad_clipping = False optimizer_args = {"closure": closure} if closure is not None else {} self.optimizer.step(closure) + # Resetting everything. + self.optimizer.grad_clipping = False + self.clip_grad_norm_to_perform = None elif self.accelerator_state.distributed_type is DistributedType.TPU: optimizer_args = {"closure": closure} if closure is not None else {} # By default barrier=False, but making sure it's the case here since we use ParalleLoader. 
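The gradient collection in `allreduce_sequence_parallel_gradients` hinges on parameters carrying a `sequence_parallel_enabled` attribute; only their gradients are reduced across the tensor-model-parallel group. A toy illustration of the selection step alone, with the collective call left out (plain PyTorch, no Neuron dependencies; in the real flow the attribute is expected to be set by the sequence-parallel layers rather than by hand):

```python
import torch
from torch import nn

layer_norm = nn.LayerNorm(8)
linear = nn.Linear(8, 8)

# Mark the layer-norm parameters the way sequence-parallel layers are expected to.
for p in layer_norm.parameters():
    p.sequence_parallel_enabled = True

optimizer = torch.optim.SGD(list(layer_norm.parameters()) + list(linear.parameters()), lr=0.1)
linear(layer_norm(torch.randn(2, 8))).sum().backward()

flagged_grads = [
    p.grad
    for group in optimizer.param_groups
    for p in group["params"]
    if getattr(p, "sequence_parallel_enabled", False) and p.grad is not None
]
print(len(flagged_grads))  # 2: the LayerNorm weight and bias gradients would be all-reduced
```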
xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args, barrier=False) elif self.accelerator_state.distributed_type is NeuronDistributedType.XLA_FSDP: self.optimizer.step(closure) - elif self.accelerator_state.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: - xm.reduce_gradients( - self.optimizer, groups=parallel_layers.parallel_state.get_data_parallel_group(as_list=True) - ) + elif self.accelerator_state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: + if parallel_layers.parallel_state.get_data_parallel_size() > 1: + bucket_allreduce_gradients(xm._fetch_gradients(self.optimizer)) if self.clip_grad_norm_to_perform is not None: parallel_layers.clip_grad_norm(self.parameters, **self.clip_grad_norm_to_perform) + self.clip_grad_norm_to_perform = None self.optimizer.step() elif self.scaler is not None: scale_before = self.scaler.get_scale() diff --git a/optimum/neuron/accelerate/state.py b/optimum/neuron/accelerate/state.py index 1ca852685..1b1fe8c6e 100644 --- a/optimum/neuron/accelerate/state.py +++ b/optimum/neuron/accelerate/state.py @@ -36,6 +36,7 @@ from ...utils import logging from ..utils import is_neuronx_distributed_available, is_torch_xla_available from .utils import NeuronDistributedType, NeuronFullyShardedDataParallelPlugin +from .utils.dataclasses import ModelParallelismPlugin if is_torch_xla_available(): @@ -189,7 +190,7 @@ def __init__(self, cpu: bool = False, **kwargs): self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0) def wait_for_everyone(self): - if self.distributed_type in [NeuronDistributedType.XLA_FSDP, NeuronDistributedType.TENSOR_PARALLELISM]: + if self.distributed_type in [NeuronDistributedType.XLA_FSDP, NeuronDistributedType.MODEL_PARALLELISM]: xm.rendezvous("accelerate.utils.wait_for_everyone") else: super().wait_for_everyone() @@ -223,7 +224,7 @@ def __init__( deepspeed_plugin=None, fsdp_plugin=None, megatron_lm_plugin=None, - tp_plugin=None, + mp_plugin=None, _from_accelerator: bool = False, **kwargs, ): @@ -262,29 +263,36 @@ def __init__( os.environ["XLA_USE_BF16"] = str(1) os.environ["XLA_DOWNCAST_BF16"] = str(0) self.downcast_bfloat = False - if os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_TP", "false") == "true": + if ( + os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_TP", "false") == "true" + or os.environ.get("ACCELERATE_USE_NEURONX_DISTRIBUTED_PP", "false") == "true" + ): if not is_neuronx_distributed_available(): raise RuntimeError( - "Tensor parallelism requires the neuronx_distributed package. You can install it by " + "Model parallelism requires the neuronx_distributed package. You can install it by " "running: python -m pip install neuronx_distributed --extra-index-url " "https://pip.repos.neuron.amazonaws.com" ) - if tp_plugin is None: + if mp_plugin is None: raise ValueError( - "Could not initialize `neuronx_distributed` tensor parallelism because no " - "TensorParallelismPlugin was provided." - ) - if tp_plugin.should_parallelize: - parallel_state.initialize_model_parallel( - tensor_model_parallel_size=tp_plugin.tensor_parallel_size + "Could not initialize `neuronx_distributed` model parallelism because no " + "`ModelParallelismPlugin` was provided." 
) - self.distributed_type = NeuronDistributedType.TENSOR_PARALLELISM + if mp_plugin.should_parallelize: + if not parallel_state.model_parallel_is_initialized(): + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=mp_plugin.tensor_parallel_size, + pipeline_model_parallel_size=mp_plugin.pipeline_parallel_size, + ) + self.distributed_type = NeuronDistributedType.MODEL_PARALLELISM else: logger.warning( - "Tensor parallelism is requested but nothing is done because the tensor parallel size is " - "set to 1." + "Model parallelism is requested but nothing is done because the tensor parallel size and " + "the pipeline parallel size are set to 1." ) - self.tp_plugin = tp_plugin + self.mp_plugin = mp_plugin + else: + self.mp_plugin = ModelParallelismPlugin() if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true": self.distributed_type = NeuronDistributedType.XLA_FSDP if self._mixed_precision != "no": diff --git a/optimum/neuron/accelerate/utils/__init__.py b/optimum/neuron/accelerate/utils/__init__.py index 129f75c1c..211d33cf0 100644 --- a/optimum/neuron/accelerate/utils/__init__.py +++ b/optimum/neuron/accelerate/utils/__init__.py @@ -13,5 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .dataclasses import NeuronDistributedType, NeuronFullyShardedDataParallelPlugin, TensorParallelismPlugin -from .misc import patch_accelerate_is_tpu_available +from .dataclasses import ModelParallelismPlugin, NeuronDistributedType, NeuronFullyShardedDataParallelPlugin +from .misc import get_tied_parameters_dict, patch_accelerate_is_tpu_available, tie_parameters diff --git a/optimum/neuron/accelerate/utils/dataclasses.py b/optimum/neuron/accelerate/utils/dataclasses.py index d5ade238a..f4d0dc0dd 100644 --- a/optimum/neuron/accelerate/utils/dataclasses.py +++ b/optimum/neuron/accelerate/utils/dataclasses.py @@ -46,7 +46,7 @@ class NeuronDistributedType(str, enum.Enum): """ XLA_FSDP = "XLA_FSDP" - TENSOR_PARALLELISM = "TENSOR_PARALLELISM" + MODEL_PARALLELISM = "MODEL_PARALLELISM" @dataclass @@ -140,21 +140,28 @@ def load_optimizer(self, accelerator, optimizer, model, input_dir, optimizer_ind @dataclass -class TensorParallelismPlugin: +class ModelParallelismPlugin: tensor_parallel_size: int = 1 parallelize_embeddings: bool = True sequence_parallel_enabled: bool = False + pipeline_parallel_size: int = 1 + pipeline_parallel_num_microbatches: int = 1 + pipeline_parallel_use_zero1_optimizer: bool = False checkpoint_dir: Optional[Union[str, Path]] = None def __post_init__(self): if self.tensor_parallel_size < 1: raise ValueError(f"The tensor parallel size must be >= 1, but {self.tensor_parallel_size} was given here.") + if self.pipeline_parallel_size < 1: + raise ValueError( + f"The pipeline parallel size must be >= 1, but {self.pipeline_parallel_size} was given here." 
+ ) if isinstance(self.checkpoint_dir, str): self.checkpoint_dir = Path(self.checkpoint_dir) @property def should_parallelize(self): - return self.tensor_parallel_size > 1 + return self.tensor_parallel_size > 1 or self.pipeline_parallel_size > 1 def parallelize_model( self, @@ -167,6 +174,8 @@ def parallelize_model( device=device, parallelize_embeddings=self.parallelize_embeddings, sequence_parallel_enabled=self.sequence_parallel_enabled, + pipeline_parallel_num_microbatches=self.pipeline_parallel_num_microbatches, + pipeline_parallel_use_zero1_optimizer=self.pipeline_parallel_use_zero1_optimizer, checkpoint_dir=self.checkpoint_dir, ) return parallelized_model diff --git a/optimum/neuron/accelerate/utils/misc.py b/optimum/neuron/accelerate/utils/misc.py index cbea3183c..773649474 100644 --- a/optimum/neuron/accelerate/utils/misc.py +++ b/optimum/neuron/accelerate/utils/misc.py @@ -14,7 +14,18 @@ # limitations under the License. """Utilities of various sorts related to accelerate with Neuron.""" -from ...utils import is_torch_xla_available, patch_everywhere +from typing import TYPE_CHECKING, Dict, Union + +import torch + +from ...distributed.utils import named_parameters +from ...utils import is_torch_neuronx_available, is_torch_xla_available, patch_everywhere +from ...utils.require_utils import requires_neuronx_distributed + + +if TYPE_CHECKING: + if is_torch_neuronx_available(): + from neuronx_distributed.pipeline import NxDPPModel def is_tpu_available(check_device=True): @@ -26,3 +37,48 @@ def is_tpu_available(check_device=True): def patch_accelerate_is_tpu_available(): patch_everywhere("is_tpu_available", is_tpu_available, module_name_prefix="accelerate") + + +@requires_neuronx_distributed +def get_tied_parameters_dict(model: Union["torch.nn.Module", "NxDPPModel"]) -> Dict[str, str]: + from neuronx_distributed.pipeline import NxDPPModel + + unique_parameters = {} + tied_parameters = {} + if isinstance(model, NxDPPModel): + module = model.local_module + else: + module = model + for name, param in named_parameters(module, remove_duplicate=False): + if param in unique_parameters: + tied_parameter_name = unique_parameters[param] + tied_parameters[name] = tied_parameter_name + else: + unique_parameters[param] = name + return tied_parameters + + +@requires_neuronx_distributed +def tie_parameters(model: Union["torch.nn.Module", "NxDPPModel"], tied_parameters_dict: Dict[str, str]): + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model, NxDPPModel): + module = model.local_module + else: + module = model + + for param_to_tie_name, param_name in tied_parameters_dict.items(): + param_to_tie_name = param_to_tie_name.rsplit(".", maxsplit=1) + + param_to_tie_parent_module = ( + module if len(param_to_tie_name) == 1 else module.get_submodule(param_to_tie_name[0]) + ) + param_to_tie = getattr(param_to_tie_parent_module, param_to_tie_name[1]) + + param_name = param_name.rsplit(".", maxsplit=1) + parent_module = module if len(param_name) == 1 else module.get_submodule(param_name[0]) + param = getattr(parent_module, param_name[1]) + + if param_to_tie is not param: + del param_to_tie + setattr(param_to_tie_parent_module, param_to_tie_name[1], param) diff --git a/optimum/neuron/distributed/base.py b/optimum/neuron/distributed/base.py index 94d355558..8f9d65343 100644 --- a/optimum/neuron/distributed/base.py +++ b/optimum/neuron/distributed/base.py @@ -21,15 +21,16 @@ from dataclasses import asdict from pathlib import Path from tempfile import TemporaryDirectory -from typing import 
TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, Tuple, Type, Union import torch +from transformers import PreTrainedModel from transformers.utils import WEIGHTS_NAME from ...utils import logging from ..utils import is_neuronx_distributed_available, is_torch_xla_available -from ..utils.deprecate_utils import deprecate -from ..utils.require_utils import requires_neuronx_distributed +from ..utils.patching import Patcher +from ..utils.require_utils import requires_neuronx_distributed, requires_torch_xla from .parallel_layers import ( IOSequenceParallelizer, LayerNormSequenceParallelizer, @@ -40,16 +41,20 @@ TENSOR_PARALLEL_SHARDS_DIR_NAME, ParameterMetadata, WeightInformation, - initialize_linear, initialize_parallel_linear, + initialize_torch_nn_module, + linear_to_parallel_linear, load_tensor_for_weight, + named_parameters, + parameter_can_be_initialized, try_to_hf_initialize, + was_already_initialized_during_parallelization, ) if TYPE_CHECKING: - from transformers import PreTrainedModel - + if is_neuronx_distributed_available(): + from neuronx_distributed.pipeline import NxDPPModel logger = logging.get_logger() @@ -67,31 +72,64 @@ def __exit__(self, *exc): self.tmpdir.cleanup() -@deprecate( - "2.0.0", - package_name="torch", - reason="torch.nn.Module._named_members takes a `remove_duplicate` parameter starting from 2.0.0", -) -def _named_members(module, get_members_fn, prefix="", recurse=True, remove_duplicate: bool = True): - r"""Helper method for yielding various names + members of modules.""" - memo = set() - modules = module.named_modules(prefix=prefix, remove_duplicate=remove_duplicate) if recurse else [(prefix, module)] - for module_prefix, mod in modules: - members = get_members_fn(mod) - for k, v in members: - if v is None or v in memo: - continue - if remove_duplicate: - memo.add(v) - name = module_prefix + ("." if module_prefix else "") + k - yield name, v +class SequenceParallelismSpecs: + SEQUENCE_PARALLEL_LAYERNORM_PATTERNS: Optional[List[str]] = None + LAYERNORM_TYPE: LayerNormType = LayerNormType.REGULAR + SEQUENCE_COLLECTIVE_OPS_INFOS: Optional[List[SequenceCollectiveOpInfo]] = None + @abstractclassmethod + def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_parallel_enabled: bool): + """ + This method needs to be overriden. It must patch anything model-specfic to make the model compatible with + sequence parallelism. + """ + if sequence_parallel_enabled: + raise NotImplementedError( + f"No patching for the attention mechanism for sequence parallelism was implemented for {model.__class__}" + ) -def named_parameters(module: "torch.nn.Module", prefix: str = "", recurse: bool = True, remove_duplicate: bool = True): - gen = _named_members( - module, lambda mod: mod._parameters.items(), prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate - ) - yield from gen + +class PipelineParallelismSpecs: + TRASNFORMER_LAYER_CLS: Type["torch.nn.Module"] + DEFAULT_INPUT_NAMES: Tuple[str, ...] + LEAF_MODULE_CLASSES_NAMES: Optional[List[Union[str, Type["torch.nn.Module"]]]] = None + OUTPUT_LOSS_SPECS: Tuple[bool, ...] = (True, False) + + @classmethod + @requires_torch_xla + def create_pipeline_cuts(cls, model: PreTrainedModel, pipeline_parallel_size: int) -> List[str]: + """ + Creates the pipeline cuts, e.g. the name of the layers at each the cuts happen for pipeline parallelism. 
+ """ + import torch_xla.core.xla_model as xm + + num_layers = sum(1 if isinstance(mod, cls.TRASNFORMER_LAYER_CLS) else 0 for mod in model.modules()) + if num_layers % pipeline_parallel_size != 0: + raise ValueError( + f"The number of transformer layers ({num_layers}) is not divisible by the pipeline parallel size " + f"({pipeline_parallel_size})." + ) + num_layers_per_partition = num_layers // pipeline_parallel_size + layers_names = [name for (name, mod) in model.named_modules() if isinstance(mod, cls.TRASNFORMER_LAYER_CLS)] + pipeline_cuts = [ + layers_names[cut_idx] + for cut_idx in range(num_layers_per_partition - 1, num_layers - 1, num_layers_per_partition) + ] + + if xm.get_local_ordinal() == 0: + logger.info(f"Pipeline parallelism cuts: {pipeline_cuts}.") + + return pipeline_cuts + + @classmethod + def leaf_module_cls(cls) -> List[str]: + if cls.LEAF_MODULE_CLASSES_NAMES is None: + return [] + return [class_ if isinstance(class_, str) else class_.__name__ for class_ in cls.LEAF_MODULE_CLASSES_NAMES] + + @classmethod + def get_patching_specs(cls) -> List[Tuple[str, Any]]: + return [] class Parallelizer(ABC): @@ -99,9 +137,8 @@ class Parallelizer(ABC): Base abstract class that handles model parallelism. """ - SEQUENCE_PARALLEL_LAYERNORM_PATTERNS: Optional[List[str]] = None - LAYERNORM_TYPE: LayerNormType = LayerNormType.REGULAR - SEQUENCE_COLLECTIVE_OPS_INFOS: Optional[List[SequenceCollectiveOpInfo]] = None + SEQUENCE_PARALLELSIM_SPECS_CLS: Optional[Type[SequenceParallelismSpecs]] = None + PIPELINE_PARALLELISM_SPECS_CLS: Optional[Type[PipelineParallelismSpecs]] = None def __init__(self): self._validate_required_libaries_are_available() @@ -128,6 +165,76 @@ def saved_model_in_temporary_directory(cls, model: "PreTrainedModel"): finally: tmpdir.cleanup() + @classmethod + def supports_sequence_parallelism(cls) -> bool: + return cls.SEQUENCE_PARALLELSIM_SPECS_CLS is not None + + @classmethod + def supports_pipeline_parallelism(cls) -> bool: + return cls.PIPELINE_PARALLELISM_SPECS_CLS is not None + + @classmethod + @requires_neuronx_distributed + def _get_parameter_names_for_current_pipeline( + cls, model: "torch.nn.Module", remove_duplicate: bool = True + ) -> Set[str]: + """ + Retrieves the names of the parameters that will be in the current pipeline stage by using the pipeline + parallelism rank. + """ + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_size, + ) + + pp_size = get_pipeline_model_parallel_size() + pp_rank = get_pipeline_model_parallel_rank() + all_parameter_names = {n for n, _ in named_parameters(model, remove_duplicate=remove_duplicate)} + if pp_size == 1: + return all_parameter_names + + if not cls.supports_pipeline_parallelism(): + raise NotImplementedError(f"{cls} does not support pipeline parallelism.") + + cuts = cls.PIPELINE_PARALLELISM_SPECS_CLS.create_pipeline_cuts(model, pp_size) + + start_module_name = cuts[pp_rank - 1] if pp_rank >= 1 else None + end_module_name = None if pp_rank == pp_size - 1 else cuts[pp_rank] + parameter2name = {p: n for n, p in named_parameters(model, remove_duplicate=remove_duplicate)} + parameter_names = set() + should_add = False + for name, mod in model.named_modules(): + if not isinstance(mod, cls.PIPELINE_PARALLELISM_SPECS_CLS.TRASNFORMER_LAYER_CLS): + continue + # If start_module_name is None, it means we are on the first rank, we should add right from the beginning. 
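`create_pipeline_cuts` above picks, for each pipeline stage except the last, the name of the last transformer layer assigned to that stage. The arithmetic is easy to check on a toy model; the sketch below reproduces it without the `torch_xla` logging (the helper and module names are illustrative):

```python
from typing import List, Type

from torch import nn


def toy_pipeline_cuts(model: nn.Module, layer_cls: Type[nn.Module], pp_size: int) -> List[str]:
    """Mirror of the cut computation in PipelineParallelismSpecs.create_pipeline_cuts."""
    names = [name for name, mod in model.named_modules() if isinstance(mod, layer_cls)]
    num_layers = len(names)
    if num_layers % pp_size != 0:
        raise ValueError(f"{num_layers} layers are not divisible by a pipeline parallel size of {pp_size}.")
    per_partition = num_layers // pp_size
    return [names[idx] for idx in range(per_partition - 1, num_layers - 1, per_partition)]


class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)


model = nn.Sequential(*(Block() for _ in range(12)))
print(toy_pipeline_cuts(model, Block, pp_size=4))  # ['2', '5', '8']: last Block of stages 0, 1 and 2
```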
+ if start_module_name is None: + should_add = True + if should_add: + for _, param in named_parameters(mod, remove_duplicate=remove_duplicate): + # It is important to use this dictionary (built with `model.named_parameters()`) instead of using + # `mod.named_parameters()` to get the fully qualified names. + param_name = parameter2name[param] + parameter_names.add(param_name) + + # We consider the parameters inside ]start_module_name, end_module_name]. + if start_module_name == name: + should_add = True + if name == end_module_name: + break + + parameters_inside_transformer_layers = { + p + for mod in model.modules() + if isinstance(mod, cls.PIPELINE_PARALLELISM_SPECS_CLS.TRASNFORMER_LAYER_CLS) + for _, p in named_parameters(mod, remove_duplicate=remove_duplicate) + } + parameter_outside_of_transformer_layers_names = { + name + for name, param in named_parameters(model, remove_duplicate=remove_duplicate) + if param not in parameters_inside_transformer_layers + } + return parameter_names | parameter_outside_of_transformer_layers_names + @abstractclassmethod def _parallelize( cls, @@ -154,17 +261,6 @@ def _parallelize( `PreTrainedModel`: The parallelized model. """ - @classmethod - def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_parallel_enabled: bool): - """ - This method needs to be overriden. It must patch anything model-specfic to make the model compatible with - sequence parallelism. - """ - if sequence_parallel_enabled: - raise NotImplementedError( - f"No patching for the attention mechanism for sequence parallelism was implemented for {model.__class__}" - ) - @classmethod @requires_neuronx_distributed def parallelize( @@ -173,6 +269,9 @@ def parallelize( device: Optional["torch.device"] = None, parallelize_embeddings: bool = True, sequence_parallel_enabled: bool = False, + pipeline_parallel_input_names: Optional[Union[Tuple[str, ...], List[str]]] = None, + pipeline_parallel_num_microbatches: int = 1, + pipeline_parallel_use_zero1_optimizer: bool = False, checkpoint_dir: Optional[Union[str, Path]] = None, ) -> "PreTrainedModel": """ @@ -192,6 +291,11 @@ def parallelize( This can be disabled in the case when the TP size does not divide the vocabulary size. sequence_parallel_enabled (`bool`, defaults to `False`): Whether or not sequence parallelism is enabled. + pipeline_parallel_num_microbatches (`int`, defaults to 1): + The number of microbatches used for pipeline execution. + pipeline_parallel_use_zero1_optimizer (`bool`, defaults to `False`): + When zero-1 optimizer is used, set this to True, so the PP model will understand that zero-1 optimizer + will handle data parallel gradient averaging. checkpoint_dir (`Optional[Union[str, Path]]`): Path to a sharded checkpoint. If specified, the checkpoint weights will be loaded to the parallelized model. 
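For reference, a minimal usage sketch of the `parallelize` entry point with the new pipeline-parallel arguments (illustrative only, not part of this patch; it assumes the `neuronx-distributed` parallel state has already been initialized and uses a hypothetical Llama checkpoint):

```python
# Hedged sketch: assumes a Trainium environment where torch_xla and
# neuronx-distributed are installed and the TP/PP process groups were initialized.
from transformers import AutoModelForCausalLM

from optimum.neuron.distributed import ParallelizersManager

# Hypothetical checkpoint name, used here only for illustration.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Retrieve the Parallelizer associated with this model type.
parallelizer = ParallelizersManager.parallelizer_for_model(model)
model = parallelizer.parallelize(
    model,
    parallelize_embeddings=True,
    sequence_parallel_enabled=True,
    pipeline_parallel_num_microbatches=4,         # new argument documented above
    pipeline_parallel_use_zero1_optimizer=False,  # new argument documented above
)
```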
@@ -201,45 +305,61 @@ def parallelize( """ from neuronx_distributed import parallel_layers - if sequence_parallel_enabled and cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS is None: + if sequence_parallel_enabled and not cls.supports_sequence_parallelism(): raise NotImplementedError(f"Sequence parallelism is not supported for {model.__class__}.") - from neuronx_distributed.parallel_layers.parallel_state import get_tensor_model_parallel_rank + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_size, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_size, + ) + from neuronx_distributed.pipeline import NxDPPModel + + tp_size = get_tensor_model_parallel_size() + + sequence_parallel_enabled = sequence_parallel_enabled and tp_size > 1 # Parallelizing the model. # This needs to be done prior to preparing the model for sequence parallelism because modules can be overriden. - model = cls._parallelize( - model, - device=device, - parallelize_embeddings=parallelize_embeddings, - sequence_parallel_enabled=sequence_parallel_enabled, - ) + if tp_size > 1: + model = cls._parallelize( + model, + device=device, + parallelize_embeddings=parallelize_embeddings, + sequence_parallel_enabled=sequence_parallel_enabled, + ) # Preparing the model for sequence parallelism: - # 1. Transforming the LayerNorms. - layer_norm_qualified_name_patterns = ( - cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS if cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS is not None else [] - ) - layer_norm_sequence_parallelizer = LayerNormSequenceParallelizer( - sequence_parallel_enabled, layer_norm_qualified_name_patterns - ) - layer_norm_sequence_parallelizer.sequence_parallelize(model, cls.LAYERNORM_TYPE) - - # 2. Taking care of scattering / gathering on the sequence axis in the model via the IOSequenceParallelizer. - io_sequence_parallelizer = IOSequenceParallelizer( - sequence_parallel_enabled, - sequence_collective_op_infos=cls.SEQUENCE_COLLECTIVE_OPS_INFOS, - ) - io_sequence_parallelizer.sequence_parallelize(model) + sp_specs_cls = cls.SEQUENCE_PARALLELSIM_SPECS_CLS - # 3. Applying model specific patching for sequence parallelism. if sequence_parallel_enabled: - cls.patch_for_sequence_parallelism(model, sequence_parallel_enabled) + # 1. Transforming the LayerNorms. + layer_norm_qualified_name_patterns = ( + sp_specs_cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS + if sp_specs_cls.SEQUENCE_PARALLEL_LAYERNORM_PATTERNS is not None + else [] + ) + layer_norm_sequence_parallelizer = LayerNormSequenceParallelizer( + sequence_parallel_enabled, layer_norm_qualified_name_patterns + ) + layer_norm_sequence_parallelizer.sequence_parallelize(model, sp_specs_cls.LAYERNORM_TYPE) + + # 2. Taking care of scattering / gathering on the sequence axis in the model via the IOSequenceParallelizer. + io_sequence_parallelizer = IOSequenceParallelizer( + sequence_parallel_enabled, + sequence_collective_op_infos=sp_specs_cls.SEQUENCE_COLLECTIVE_OPS_INFOS, + ) + io_sequence_parallelizer.sequence_parallelize(model) + + # 3. Applying model specific patching for sequence parallelism. + sp_specs_cls.patch_for_sequence_parallelism(model, sequence_parallel_enabled) - weight_map = getattr(model, "_weight_map", None) # The model was not loaded lazily, it is already ready. 
- if weight_map is None: - return model + weight_map = getattr(model, "_weight_map", {}) + + names_of_the_parameters_to_consider = cls._get_parameter_names_for_current_pipeline( + model, remove_duplicate=True + ) with torch.no_grad(): tied_weights = {} @@ -249,7 +369,10 @@ def parallelize( split = name.rsplit(".", maxsplit=1) module = model.get_submodule(split[0]) attribute_name = split[1] - current_weight = getattr(module, attribute_name) + + # Skipping the parameters that will not end-up in this pipeline rank. + if name not in names_of_the_parameters_to_consider: + continue try: weight_info = WeightInformation(weight_map[name], name, weight_map=weight_map, device=device) @@ -265,14 +388,14 @@ def parallelize( # It can be the case when weights are tied. For example between the embeddings and the LM head. new_parameter = tied_weights[parameter] elif weight_info is not None: - if getattr(current_weight, "tensor_model_parallel", False): + if getattr(parameter, "tensor_model_parallel", False): if parameter.device == torch.device("meta"): # This must either be a torch.nn.Embedding or a torch.nn.Linear that was not handled during # parallelization since those are the only classes that we initialize on the `meta` device. - num_dims = current_weight.dim() - partition_dim = getattr(current_weight, "partition_dim") + num_dims = parameter.dim() + partition_dim = getattr(parameter, "partition_dim") tp_rank = get_tensor_model_parallel_rank() - size_per_rank = current_weight.size(partition_dim) + size_per_rank = parameter.size(partition_dim) slices = [ None if idx != partition_dim @@ -291,10 +414,17 @@ def parallelize( new_parameter = torch.nn.Parameter( load_tensor_for_weight(weight_info, tensor_slices=slices).to(parameter.dtype) ) + elif parameter.device != torch.device("meta") and ( + was_already_initialized_during_parallelization(parameter) + or not parameter_can_be_initialized(model, module, attribute_name) + ): + tied_weights[parameter] = parameter + new_parameters.add(parameter) + continue else: # This means that there is no information about where to find the weights for this parameter. device = torch.device("cpu") if device is None else device - new_parameter = torch.nn.Parameter(torch.empty_like(current_weight, device=device)) + new_parameter = torch.nn.Parameter(torch.empty_like(parameter, device=device)) modules_to_initialize[module].append(attribute_name) setattr( @@ -317,23 +447,63 @@ def parallelize( # `reset_parameters()` method but we need to be careful because one of the parameters might not # need initialization. left_uninitialized = try_to_hf_initialize(model, mod, parameter_names) - if not left_uninitialized: - continue - initialize_linear(mod, left_uninitialized) - + if left_uninitialized: + initialize_torch_nn_module(mod, left_uninitialized) elif isinstance(mod, parallel_layers.layers.BaseParallelLinear): # First, we try to initialize the layer similarly as it would be done with the model. - # To do that it is necessary to change the model class to that the `model._init_weights` method - # considers this module as a `torch.nn.Linear` instance. - orig_class = mod.__class__ - mod.__class__ = torch.nn.Linear - left_uninitialized = try_to_hf_initialize(model, mod, parameter_names) - mod.__class__ = orig_class - if not left_uninitialized: - continue - initialize_parallel_linear(mod, left_uninitialized) + # To do that we initialize a `torch.nn.Linear` with the full shape, and then scatter the weights. 
+ input_is_parallel = gather_output = False + if isinstance(mod, parallel_layers.layers.RowParallelLinear): + axis = "row" + input_is_parallel = mod.input_is_parallel + else: + axis = "column" + gather_output = mod.gather_output + fake_linear_mod = torch.nn.Linear(mod.input_size, mod.output_size) + left_uninitialized = try_to_hf_initialize(model, fake_linear_mod, parameter_names) + if left_uninitialized: + initialize_parallel_linear(mod, left_uninitialized) + else: + fake_parallel_linear_mod = linear_to_parallel_linear( + fake_linear_mod, + axis, + input_is_parallel=input_is_parallel, + gather_output=gather_output, + sequence_parallel_enabled=mod.sequence_parallel_enabled, + ) + mod.weight.data = fake_parallel_linear_mod.weight.data.clone() + if mod.bias is not None: + mod.bias.data = fake_parallel_linear_mod.bias.data.clone() + del fake_linear_mod + del fake_parallel_linear_mod else: - raise ValueError(f"Do not know how to initialize a module of type {mod.__class__}") + left_uninitialized = try_to_hf_initialize(model, mod, parameter_names) + if left_uninitialized and hasattr(mod, "reset_parameters"): + initialize_torch_nn_module(mod, parameter_names) + + pp_size = get_pipeline_model_parallel_size() + if pp_size > 1: + if not cls.supports_pipeline_parallelism(): + raise NotImplementedError(f"{cls} does not support pipeline parallelism.") + + model.config.return_dict = False + model.config.use_cache = False + model.config.output_attentions = False + model.config.output_hidden_states = False + + with Patcher(cls.PIPELINE_PARALLELISM_SPECS_CLS.get_patching_specs()): + if pipeline_parallel_input_names is None: + pipeline_parallel_input_names = cls.PIPELINE_PARALLELISM_SPECS_CLS.DEFAULT_INPUT_NAMES + model = NxDPPModel( + model, + transformer_layer_cls=cls.PIPELINE_PARALLELISM_SPECS_CLS.TRASNFORMER_LAYER_CLS, + num_microbatches=pipeline_parallel_num_microbatches, + output_loss_value_spec=cls.PIPELINE_PARALLELISM_SPECS_CLS.OUTPUT_LOSS_SPECS, + input_names=pipeline_parallel_input_names, + pipeline_cuts=cls.PIPELINE_PARALLELISM_SPECS_CLS.create_pipeline_cuts(model, pp_size), + leaf_module_cls=cls.PIPELINE_PARALLELISM_SPECS_CLS.leaf_module_cls(), + use_zero1_optimizer=pipeline_parallel_use_zero1_optimizer, + ) if checkpoint_dir is not None: cls.load_model_checkpoint(model, checkpoint_dir) @@ -348,13 +518,21 @@ def deparallelize(cls, model: "PreTrainedModel") -> "PreTrainedModel": @requires_neuronx_distributed def was_parallelized(cls, model: "PreTrainedModel") -> bool: import neuronx_distributed + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_size, + get_tensor_model_parallel_size, + ) + from neuronx_distributed.pipeline import NxDPPModel + needs_parallelization_for_pp = get_pipeline_model_parallel_size() > 1 and not isinstance(model, NxDPPModel) parallel_layer_classes = ( neuronx_distributed.parallel_layers.ParallelEmbedding, neuronx_distributed.parallel_layers.ColumnParallelLinear, neuronx_distributed.parallel_layers.RowParallelLinear, ) - return any(isinstance(mod, parallel_layer_classes) for mod in model.modules()) + layers_are_parallel = any(isinstance(mod, parallel_layer_classes) for mod in model.modules()) + needs_parallelization_for_tp = get_tensor_model_parallel_size() > 1 and not layers_are_parallel + return (not needs_parallelization_for_pp) and (not needs_parallelization_for_tp) @classmethod def _check_model_was_parallelized(cls, model: "PreTrainedModel"): @@ -362,35 +540,64 @@ def _check_model_was_parallelized(cls, model:
"PreTrainedModel"): raise ValueError("The model needs to be parallelized first.") @classmethod + @requires_torch_xla def optimizer_cpu_params_to_xla_params( cls, optimizer: "torch.optim.Optimizer", orig_param_to_parallel_param_on_xla: Mapping[int, "torch.nn.Parameter"], ) -> Tuple[List[Dict[str, Any]], bool]: + import torch_xla.core.xla_model as xm + parameters_on_xla = [] need_to_create_new_optimizer = False if hasattr(optimizer, "_args_to_recreate"): args, _ = optimizer._args_to_recreate - parameters = args[0] - for param in parameters: - if isinstance(param, dict): - new_param = {k: v for k, v in param.items() if k != "params"} - params = [] - for p in param["params"]: - params.append(orig_param_to_parallel_param_on_xla[id(p)]) - new_param["params"] = params - else: - new_param = [] - for p in param: - new_param.append(orig_param_to_parallel_param_on_xla[id(p)]) + + # parameter_groups can either be an iterable of dictionaries (groups), or of parameters, in which case + # there is only one group. + parameter_groups = args[0] + parameter_groups = list(parameter_groups) + # parameter_groups cannot be empty + if isinstance(parameter_groups[0], dict): + for group in parameter_groups: + new_group = {k: v for k, v in group.items() if k != "params"} + params_on_xla = [] + for p in group["params"]: + if p.device == xm.xla_device(): + params_on_xla.append(p) + elif id(p) not in orig_param_to_parallel_param_on_xla: + # This can be the case with pipeline parallelism. + continue + else: + params_on_xla.append(orig_param_to_parallel_param_on_xla[id(p)]) + new_group["params"] = params_on_xla + parameters_on_xla.append(new_group) + else: + new_param = {} + params_on_xla = [] + for param in parameter_groups: + if param.device == xm.xla_device(): + params_on_xla.append(param) + elif id(param) not in orig_param_to_parallel_param_on_xla: + # This can be the case with pipeline parallelism. 
+ continue + else: + params_on_xla.append(orig_param_to_parallel_param_on_xla[id(param)]) + new_param["params"] = params_on_xla parameters_on_xla.append(new_param) else: for param_group in optimizer.param_groups: new_params = [] params = param_group["params"] for idx in range(len(params)): - param_on_xla = orig_param_to_parallel_param_on_xla[id(params[idx])] - if params[idx] != param_on_xla: + if params[idx].device == xm.xla_device(): + param_on_xla = params[idx] + elif id(params[idx]) not in orig_param_to_parallel_param_on_xla: + need_to_create_new_optimizer = True + continue + else: + param_on_xla = orig_param_to_parallel_param_on_xla[id(params[idx])] + if params[idx] is not param_on_xla: need_to_create_new_optimizer = True new_params.append(param_on_xla) new_group = {k: v for k, v in param_group.items() if k != "params"} @@ -399,7 +606,7 @@ def optimizer_cpu_params_to_xla_params( return parameters_on_xla, need_to_create_new_optimizer @classmethod - def optimizer_for_tp( + def optimizer_for_mp( cls, optimizer: "torch.optim.Optimizer", orig_param_to_parallel_param_on_xla: Mapping[int, "torch.nn.Parameter"], @@ -429,14 +636,14 @@ def optimizer_for_tp( ) if hasattr(optimizer, "_args_to_recreate"): args, kwargs = optimizer._args_to_recreate - optimizer_for_tp = optimizer.__class__(parallel_parameters, *args[1:], **kwargs) + optimizer_for_mp = optimizer.__class__(parallel_parameters, *args[1:], **kwargs) del optimizer elif need_to_create_new_optimizer: - optimizer_for_tp = optimizer.__class__(parallel_parameters) + optimizer_for_mp = optimizer.__class__(parallel_parameters) del optimizer else: - optimizer_for_tp = optimizer - return optimizer_for_tp + optimizer_for_mp = optimizer + return optimizer_for_mp @classmethod def _get_parameters_tp_metadata(cls, named_parameters: Dict[str, "torch.nn.Parameter"]): @@ -509,26 +716,25 @@ def save_model_checkpoint_as_regular( @requires_neuronx_distributed def save_model_checkpoint_as_sharded( cls, - model: "PreTrainedModel", + model: Union["PreTrainedModel", "NxDPPModel"], output_dir: Union[str, Path], optimizer: Optional["torch.optim.Optimizer"] = None, ): import torch_xla.core.xla_model as xm from neuronx_distributed import parallel_layers - from neuronx_distributed.parallel_layers.parallel_state import ( - get_data_parallel_rank, - get_tensor_model_parallel_rank, - ) + from neuronx_distributed.pipeline import NxDPPModel cls._check_model_was_parallelized(model) - data_parallel_rank = get_data_parallel_rank() - tensor_parallel_rank = get_tensor_model_parallel_rank() - if not isinstance(output_dir, Path): output_dir = Path(output_dir) - state_dict = {"model": model.state_dict()} + if isinstance(model, NxDPPModel): + model_state_dict = model.local_state_dict() + else: + model_state_dict = model.state_dict() + + state_dict = {"model": model_state_dict} state_dict["sharded_metadata"] = { k: asdict(v) for k, v in cls._get_parameters_tp_metadata(dict(model.named_parameters())).items() } @@ -539,12 +745,12 @@ def save_model_checkpoint_as_sharded( output_path = output_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME - if data_parallel_rank == 0 and tensor_parallel_rank == 0: + if xm.get_local_ordinal() == 0: if output_path.is_dir(): shutil.rmtree(output_path, ignore_errors=True) output_path.mkdir() xm.rendezvous("waiting before saving") - parallel_layers.save(state_dict, output_path.as_posix()) + parallel_layers.save(state_dict, output_path.as_posix(), save_xser=True) @classmethod def save_model_checkpoint( @@ -572,7 +778,10 @@ def load_model_sharded_checkpoint(cls, 
model: "PreTrainedModel", load_dir: Union if not isinstance(load_dir, Path): load_dir = Path(load_dir) neuronx_distributed.parallel_layers.load( - load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model_or_optimizer=model, sharded=True + load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, + model_or_optimizer=model, + load_xser=True, + sharded=True, ) @classmethod @@ -588,6 +797,7 @@ def load_model_checkpoint(cls, model: "PreTrainedModel", load_dir: Union[str, Pa @classmethod @requires_neuronx_distributed def load_optimizer_sharded_checkpoint(cls, optimizer: "torch.optim.Optimizer", load_dir: Union[str, Path]): + import neuronx_distributed from neuronx_distributed.optimizer import NeuronZero1Optimizer is_zero_1_optimizer = optimizer.__class__.__name__ == "NeuronAcceleratedOptimizer" and isinstance( @@ -599,10 +809,13 @@ def load_optimizer_sharded_checkpoint(cls, optimizer: "torch.optim.Optimizer", l "It is not possible to load a sharded optimizer checkpoint when using ZeRO-1 yet." ) - from neuronx_distributed.parallel_layers import load - if not isinstance(load_dir, Path): load_dir = Path(load_dir) - load( - load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model_or_optimizer=optimizer, model_key="optimizer_state_dict" + + neuronx_distributed.parallel_layers.load( + load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, + model_or_optimizer=optimizer, + model_key="optimizer_state_dict", + load_xser=True, + sharded=True, ) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index 481890eed..0bb795e31 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -14,15 +14,18 @@ # limitations under the License. """Classes related to `neuronx-distributed` to perform parallelism.""" -from typing import TYPE_CHECKING, Optional, Tuple +import warnings +from typing import TYPE_CHECKING, Any, List, Optional, Tuple import torch +from transformers.cache_utils import Cache from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoBlock, GPTNeoSelfAttention from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXAttention from transformers.models.llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, LlamaRMSNorm, + _prepare_4d_causal_attention_mask, apply_rotary_pos_emb, repeat_kv, ) @@ -32,7 +35,7 @@ MistralRMSNorm, ) -from .base import Parallelizer +from .base import Parallelizer, PipelineParallelismSpecs, SequenceParallelismSpecs from .parallel_layers import ( LayerNormType, ParallelCrossEntropy, @@ -71,7 +74,7 @@ class GPTNeoParallelCrossEntropy(ParallelCrossEntropy): LAST_LINEAR_PROJECTION_NAME = {"GPTNeoForCausalLM": "lm_head"} -class GPTNeoParallelizer(Parallelizer): +class GPTNeoSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "transformer.h.[0-9]+.ln_[1-2]", "transformer.ln_f", @@ -108,6 +111,10 @@ def _merge_heads(self, tensor, num_heads, attn_head_size): module._split_heads = _split_heads.__get__(module) module._merge_heads = _merge_heads.__get__(module) + +class GPTNeoParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = GPTNeoSequenceParallelismSpecs + @classmethod def _parallelize( cls, @@ -158,14 +165,14 @@ class GPTNeoXParallelCrossEntropy(ParallelCrossEntropy): LAST_LINEAR_PROJECTION_NAME = {"GPTNeoXForCausalLM": "embed_out"} -class GPTNeoXParallelizer(Parallelizer): +class GPTNeoXSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "gpt_neox.layers.[0-9]+.input_layernorm", 
"gpt_neox.layers.[0-9]+.post_attention_layernorm", "gpt_neox.final_layer_norm", ] SEQUENCE_COLLECTIVE_OPS_INFOS = [ - SequenceCollectiveOpInfo("scatter", torch.nn.Embedding, "output", "first"), + SequenceCollectiveOpInfo("scatter", "gpt_neox.embed_in", "output", "first"), SequenceCollectiveOpInfo("gather", torch.nn.LayerNorm, "output", "last"), ] @@ -269,6 +276,10 @@ def sequence_parallel_forward( if isinstance(module, GPTNeoXAttention): module.forward = sequence_parallel_forward.__get__(module) + +class GPTNeoXParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = GPTNeoXSequenceParallelismSpecs + @classmethod def _parallelize( cls, @@ -366,7 +377,7 @@ class LlamaParallelCrossEntropy(ParallelCrossEntropy): } -class LlamaParallelizer(Parallelizer): +class LlamaSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "model.layers.[0-9]+.input_layernorm", "model.layers.[0-9]+.post_attention_layernorm", @@ -391,13 +402,20 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral def attention_forward( self, - hidden_states: "torch.Tensor", - attention_mask: Optional["torch.Tensor"] = None, - position_ids: Optional["torch.LongTensor"] = None, - past_key_value: Optional[Tuple["torch.Tensor"]] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, - ) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and removed since `transformers` v4.37. Please make sure to " + "use `attention_mask` instead.`" + ) + if self.config.pretraining_tp > 1: key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp query_slices = self.q_proj.weight.split( @@ -439,16 +457,21 @@ def attention_forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] + if self.layer_idx is None: + raise ValueError( + "The cache structure has changed since version `transformers v4.36. If you are using " + f"{self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to " + "initialize the attention class with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) @@ -505,6 +528,29 @@ def attention_forward( if isinstance(module, LlamaAttention): module.forward = attention_forward.__get__(module) + +class LlamaPipelineParallelismSpecs(PipelineParallelismSpecs): + TRASNFORMER_LAYER_CLS = LlamaDecoderLayer + DEFAULT_INPUT_NAMES = ("input_ids", "attention_mask", "labels") + LEAF_MODULE_CLASSES_NAMES = [LlamaRMSNorm] + + @classmethod + def get_patching_specs(cls) -> List[Tuple[str, Any]]: + leaf_prepare_4d_causal_attention_mask = torch.fx._symbolic_trace._create_wrapped_func( + _prepare_4d_causal_attention_mask + ) + return [ + ( + "transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask", + leaf_prepare_4d_causal_attention_mask, + ), + ] + + +class LlamaParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = LlamaSequenceParallelismSpecs + PIPELINE_PARALLELISM_SPECS_CLS = LlamaPipelineParallelismSpecs + @classmethod def _parallelize( cls, @@ -598,7 +644,7 @@ class MistralParallelCrossEntropy(ParallelCrossEntropy): LAST_LINEAR_PROJECTION_NAME = {"MistralForCausalLM": "lm_head"} -class MistralParallelizer(Parallelizer): +class MistralSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "model.layers.[0-9]+.input_layernorm", "model.layers.[0-9]+.post_attention_layernorm", @@ -625,11 +671,16 @@ def attention_forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and removed since `transformers` v4.37. Please make sure to " + "use `attention_mask` instead.`" + ) query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) @@ -653,16 +704,21 @@ def attention_forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] + if self.layer_idx is None: + raise ValueError( + "The cache structure has changed since `transformers` v4.36. If you are using " + f"{self.__class__.__name__} for auto-regressive decoding with k/v caching, please make sure to " + "initialize the attention class with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) @@ -713,6 +769,10 @@ def attention_forward( if isinstance(module, MistralAttention): module.forward = attention_forward.__get__(module) + +class MistralParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = MistralSequenceParallelismSpecs + @classmethod def _parallelize( cls, diff --git a/optimum/neuron/distributed/encoder_decoder_models.py b/optimum/neuron/distributed/encoder_decoder_models.py index 4fb537330..fa29ee8b6 100644 --- a/optimum/neuron/distributed/encoder_decoder_models.py +++ b/optimum/neuron/distributed/encoder_decoder_models.py @@ -20,7 +20,7 @@ from transformers.models.t5.modeling_t5 import T5Attention, T5ForSequenceClassification, T5LayerNorm from ...utils import NormalizedConfigManager -from .base import Parallelizer +from .base import Parallelizer, SequenceParallelismSpecs from .parallel_layers import ( LayerNormType, ParallelCrossEntropy, @@ -154,7 +154,7 @@ class T5ParallelCrossEntropy(ParallelCrossEntropy): LAST_LINEAR_PROJECTION_NAME = {"T5ForConditionalGeneration": "lm_head"} -class T5Parallelizer(Parallelizer): +class T5SequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "encoder.block.[0-9]+.layer.[0-9]+.layer_norm", "encoder.final_layer_norm", @@ -316,6 +316,8 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): if isinstance(module, T5Attention): module.forward = sequence_parallel_forward.__get__(module) + +class T5Parallelizer(Parallelizer): @classmethod def _parallelize( cls, diff --git a/optimum/neuron/distributed/encoder_models.py b/optimum/neuron/distributed/encoder_models.py index 2322d7434..c8e2c617c 100644 --- a/optimum/neuron/distributed/encoder_models.py +++ b/optimum/neuron/distributed/encoder_models.py @@ -19,7 +19,7 @@ import torch from ..utils.require_utils import requires_neuronx_distributed -from .base import Parallelizer +from .base import Parallelizer, SequenceParallelismSpecs from .parallel_layers import ( ParallelCrossEntropy, ParallelEmbedding, @@ -90,7 +90,7 @@ class BertParallelCrossEntropy(ParallelCrossEntropy): } -class BertParallelizer(Parallelizer): +class BertSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "bert.embeddings.LayerNorm", "bert.encoder.layer.[0-9]+.attention.output.LayerNorm", @@ -123,6 +123,10 @@ def transpose_for_scores(self, x: "torch.Tensor") -> "torch.Tensor": module.forward, sequence_parallel_enabled ).__get__(module) + +class BertParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = BertSequenceParallelismSpecs + @classmethod def _parallelize( cls, @@ -181,7 +185,7 @@ class RobertaParallelCrossEntropy(ParallelCrossEntropy): } -class RobertaParallelizer(Parallelizer): +class 
RobertaSequenceParallelismSpecs(SequenceParallelismSpecs): SEQUENCE_PARALLEL_LAYERNORM_PATTERNS = [ "roberta.embeddings.LayerNorm", "roberta.encoder.layer.[0-9]+.attention.output.LayerNorm", @@ -214,6 +218,10 @@ def transpose_for_scores(self, x: "torch.Tensor") -> "torch.Tensor": module.forward, sequence_parallel_enabled ).__get__(module) + +class RobertaParallelizer(Parallelizer): + SEQUENCE_PARALLELSIM_SPECS_CLS = RobertaSequenceParallelismSpecs + @classmethod def _parallelize( cls, diff --git a/optimum/neuron/distributed/parallel_layers.py b/optimum/neuron/distributed/parallel_layers.py index 1db914886..9f626f61d 100644 --- a/optimum/neuron/distributed/parallel_layers.py +++ b/optimum/neuron/distributed/parallel_layers.py @@ -693,6 +693,7 @@ def transform( @requires_neuronx_distributed +@torch.fx.wrap def safe_parallel_cross_entropy(*args, **kwargs): if kwargs.pop("weight", None) is not None: raise ValueError("The weight keyword argument is not supported when using parallel cross entropy") @@ -714,6 +715,7 @@ def safe_parallel_cross_entropy(*args, **kwargs): input_ = args[0] if _PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT: input_ = input_.clone() + loss = parallel_cross_entropy(input_, *args[1:], **kwargs) if reduction == "mean": diff --git a/optimum/neuron/distributed/parallelizers_manager.py b/optimum/neuron/distributed/parallelizers_manager.py index 09fb929df..9c7d92e36 100644 --- a/optimum/neuron/distributed/parallelizers_manager.py +++ b/optimum/neuron/distributed/parallelizers_manager.py @@ -19,6 +19,7 @@ from transformers import PreTrainedModel +from ..utils.require_utils import requires_neuronx_distributed from .base import Parallelizer @@ -69,7 +70,12 @@ def get_supported_model_types(cls) -> List[str]: return list(cls._MODEL_TYPE_TO_PARALLEL_MODEL_CLASS.keys()) @classmethod + @requires_neuronx_distributed def _get_model_type(cls, model_type_or_model: Union[str, PreTrainedModel]) -> str: + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model_type_or_model, NxDPPModel): + model_type_or_model = model_type_or_model.original_torch_module if isinstance(model_type_or_model, PreTrainedModel): model_type = model_type_or_model.config.model_type else: diff --git a/optimum/neuron/distributed/utils.py b/optimum/neuron/distributed/utils.py index 7093818a6..66118b108 100644 --- a/optimum/neuron/distributed/utils.py +++ b/optimum/neuron/distributed/utils.py @@ -15,6 +15,7 @@ """Utilities for performing parallelism with `neuronx_distributed`""" import contextlib +import copy import functools import itertools import json @@ -28,21 +29,49 @@ from transformers.utils import is_peft_available from ..utils import DynamicPatch, Patcher +from ..utils.deprecate_utils import deprecate from ..utils.import_utils import is_neuronx_distributed_available from ..utils.misc import download_checkpoints_in_cache from ..utils.require_utils import requires_neuronx_distributed, requires_safetensors, requires_torch_xla +if is_neuronx_distributed_available(): + from neuronx_distributed.parallel_layers import layers + if TYPE_CHECKING: from transformers import PreTrainedModel - if is_neuronx_distributed_available(): - from neuronx_distributed.parallel_layers import layers - TENSOR_PARALLEL_SHARDS_DIR_NAME = "tensor_parallel_shards" +@deprecate( + "2.0.0", + package_name="torch", + reason="torch.nn.Module._named_members takes a `remove_duplicate` parameter starting from 2.0.0", +) +def _named_members(module, get_members_fn, prefix="", recurse=True, remove_duplicate: bool = True): + r"""Helper 
method for yielding various names + members of modules.""" + memo = set() + modules = module.named_modules(prefix=prefix, remove_duplicate=remove_duplicate) if recurse else [(prefix, module)] + for module_prefix, mod in modules: + members = get_members_fn(mod) + for k, v in members: + if v is None or v in memo: + continue + if remove_duplicate: + memo.add(v) + name = module_prefix + ("." if module_prefix else "") + k + yield name, v + + +def named_parameters(module: "torch.nn.Module", prefix: str = "", recurse: bool = True, remove_duplicate: bool = True): + gen = _named_members( + module, lambda mod: mod._parameters.items(), prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate + ) + yield from gen + + @dataclass class WeightInformation: """ @@ -140,6 +169,14 @@ def _validate_weight_info_device_matches_specified_device(device: "torch.device" ) +def mark_parameter_init_status_during_parallelization(parameter: "torch.nn.Parameter", initialized: bool): + setattr(parameter, "_was_initialized_during_parallelization", initialized) + + +def was_already_initialized_during_parallelization(parameter: "torch.nn.Parameter") -> bool: + return getattr(parameter, "_was_initialized_during_parallelization", False) + + @requires_neuronx_distributed def embedding_to_parallel_embedding( embedding_layer: "torch.nn.Embedding", @@ -217,10 +254,14 @@ def embedding_to_parallel_embedding( ), ) parallel_embedding_layer.weight.copy_(weight_data) - else: + mark_parameter_init_status_during_parallelization(parallel_embedding_layer.weight, True) + elif embedding_layer.weight.device != torch.device("meta"): parallel_embedding_layer.weight.copy_( embedding_layer.weight[tp_rank * row_size : (tp_rank + 1) * row_size, :] ) + mark_parameter_init_status_during_parallelization(parallel_embedding_layer.weight, True) + else: + mark_parameter_init_status_during_parallelization(parallel_embedding_layer.weight, False) if lm_head_layer is not None: parallel_lm_head_layer = linear_to_parallel_linear( @@ -334,19 +375,25 @@ def linear_to_parallel_linear( ), ) parallel_linear_layer.weight.copy_(weight_data) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, True) elif linear_layer.weight.device != torch.device("meta"): parallel_linear_layer.weight.copy_( linear_layer.weight[:, tp_rank * col_size : (tp_rank + 1) * col_size] ) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, True) else: - raise ValueError("Could not find data for the linear layer to parellelize.") + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, False) if linear_layer.bias is not None: if linear_layer_bias_weight_info is not None: bias_weight_data = load_tensor_for_weight(linear_layer_bias_weight_info) parallel_linear_layer.bias.copy_(bias_weight_data) - else: + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, True) + elif linear_layer.bias.device != torch.device("meta"): parallel_linear_layer.bias.copy_(linear_layer.bias) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, True) + else: + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, False) else: if embedding_weight_to_tie is not None: @@ -360,12 +407,14 @@ def linear_to_parallel_linear( ), ) parallel_linear_layer.weight.copy_(weight_data) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, True) elif linear_layer.weight.device != torch.device("meta"): parallel_linear_layer.weight.copy_( 
linear_layer.weight[tp_rank * row_size : (tp_rank + 1) * row_size, :] ) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, True) else: - raise ValueError("Could not find data for the linear layer to parellelize.") + mark_parameter_init_status_during_parallelization(parallel_linear_layer.weight, False) if linear_layer.bias is not None: if linear_layer_bias_weight_info is not None: @@ -383,13 +432,17 @@ def linear_to_parallel_linear( tensor_slices=tensor_slices, ) parallel_linear_layer.bias.copy_(bias_weight_data) - else: + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, True) + elif linear_layer.bias.device != torch.device("meta"): if gather_output: parallel_linear_layer.bias.copy_(linear_layer.bias) else: parallel_linear_layer.bias.copy_( linear_layer.bias[tp_rank * row_size : (tp_rank + 1) * row_size] ) + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, True) + else: + mark_parameter_init_status_during_parallelization(parallel_linear_layer.bias, False) return parallel_linear_layer @@ -451,13 +504,15 @@ def gqa_key_value_slicing_when_tp_size_greater_than_num_key_value_heads( ), ) sliced_linear_layer.weight.copy_(weight_data) + mark_parameter_init_status_during_parallelization(sliced_linear_layer.weight, True) elif linear_layer.weight.device != torch.device("meta"): sliced_linear_layer.weight.copy_( linear_layer.weight[key_value_head_index * head_dim : (key_value_head_index + 1) * head_dim, :] ) + mark_parameter_init_status_during_parallelization(sliced_linear_layer.weight, True) else: - raise ValueError("Could not find data for the linear layer to slice.") + mark_parameter_init_status_during_parallelization(sliced_linear_layer.weight, False) if linear_layer.bias is not None: if linear_layer_bias_weight_info is not None: @@ -466,10 +521,14 @@ def gqa_key_value_slicing_when_tp_size_greater_than_num_key_value_heads( tensor_slices=((key_value_head_index * head_dim, (key_value_head_index + 1) * head_dim),), ) sliced_linear_layer.bias.copy_(bias_weight_data) - else: + mark_parameter_init_status_during_parallelization(sliced_linear_layer.bias, True) + elif sliced_linear_layer.bias.device != torch.device("meta"): sliced_linear_layer.bias.copy_( linear_layer.bias[key_value_head_index * head_dim : (key_value_head_index + 1) * head_dim] ) + mark_parameter_init_status_during_parallelization(sliced_linear_layer.bias, True) + else: + mark_parameter_init_status_during_parallelization(sliced_linear_layer.bias, False) return sliced_linear_layer @@ -490,31 +549,47 @@ def try_to_hf_initialize(model: "PreTrainedModel", mod: torch.nn.Module, paramet """ cached_params_data = {name: param.data.clone() for name, param in mod.named_parameters()} model._init_weights(mod) + + dummy_mod = copy.deepcopy(mod) + for name in parameter_names: + getattr(dummy_mod, name).random_() + model._init_weights(dummy_mod) + left_uninitialized = [] with torch.no_grad(): for name in parameter_names: - if torch.all(cached_params_data[name] == getattr(mod, name).data): - left_uninitialized.append(name) + # The parameter was left unchanged. + if torch.all(getattr(mod, name).data == cached_params_data[name]): + # There are two possible reasons: + # 1. The model cannot initialize the module that owns the parameter. + # 2. The parameter already had the proper value. + + # We check if a dummy copy of the module, filled with random values is modified to know if the model + # can initialize the module. 
+ dummy_param_was_changed = torch.all(getattr(dummy_mod, name).data == getattr(mod, name).data) + if not dummy_param_was_changed: + left_uninitialized.append(name) + for name, cached_data in cached_params_data.items(): if name not in parameter_names: param = getattr(mod, name) param.data = cached_data + return left_uninitialized -def initialize_linear(mod: torch.nn.Linear, parameter_names: List[str]): +def initialize_torch_nn_module(mod: torch.nn.Module, parameter_names: List[str]): """ Initializes the parameters in `parameter_names` of a `torch.nn.Linear` module. """ - cached_parameters = [mod.weight.data] - if mod.bias is not None: - cached_parameters.append(mod.bias.data) + if not hasattr(mod, "reset_parameters"): + raise ValueError(f"{mod} does not have a `reset_parameters` method.") + cached_parameters = {name: param.data.clone() for name, param in mod.named_parameters()} mod.reset_parameters() with torch.no_grad(): - if "weight" not in parameter_names: - mod.weight.data = cached_parameters[0] - if mod.bias is not None and "bias" not in parameter_names: - mod.bias.data = cached_parameters[1] + for name, param in mod.named_parameters(): + if param is not None and name not in parameter_names: + param.data = cached_parameters[name] def initialize_parallel_linear(mod: "layers.BaseParallelLinear", parameter_names: List[str]): @@ -531,9 +606,18 @@ def initialize_parallel_linear(mod: "layers.BaseParallelLinear", parameter_names mod._init_bias() +def parameter_can_be_initialized(model: torch.nn.Module, parent_module: torch.nn.Module, parameter_name: str) -> bool: + clone = copy.deepcopy(parent_module) + left_uninitialized = try_to_hf_initialize(model, clone, [parameter_name]) + is_parallel_linear = isinstance(parent_module, layers.BaseParallelLinear) + return ( + hasattr(parent_module, "reset_parameters") or is_parallel_linear or (parameter_name not in left_uninitialized) + ) + + @classmethod @requires_torch_xla -def from_pretrained_for_tp( +def from_pretrained_for_mp( cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, @@ -672,8 +756,8 @@ def from_pretrained_for_tp( if not sharing_same_suffix_as_name: continue names_of_weights_not_in_model.add(name) - longest_sharing_parameter_name = max(sharing_same_suffix_as_name, key=lambda s: len(s)) - prefixes.add(longest_sharing_parameter_name.replace(name, "")) + shortest_sharing_parameter_name = min(sharing_same_suffix_as_name, key=lambda s: len(s)) + prefixes.add(shortest_sharing_parameter_name.replace(name, "")) else: weight_map_for_model[name] = filename if names_of_weights_not_in_model: @@ -703,7 +787,7 @@ def from_pretrained_for_tp( @contextlib.contextmanager -def lazy_load_for_parallelism(tensor_parallel_size: int = 1): +def lazy_load_for_parallelism(tensor_parallel_size: int = 1, pipeline_parallel_size: int = 1): """ Context manager that makes the loading of a model lazy for model parallelism: @@ -711,11 +795,15 @@ def lazy_load_for_parallelism(tensor_parallel_size: int = 1): instantiate. - Every `torch.nn.Embedding` is also put on the `torch.device("meta")` device. - No state dict is actually loaded, instead a weight map is created and attached to the model. For more - information, read the [`optimum.neuron.distributed.utils.from_pretrained_for_tp`] docstring. + information, read the [`optimum.neuron.distributed.utils.from_pretrained_for_mp`] docstring. + + If both `tensor_parallel_size` and `pipeline_parallel_size` are set to 1, no lazy loading is performed. 
Args: tensor_parallel_size (`int`, defaults to 1): - The parallel size considered for tensor parallel size. If set to 1, no lazy loading is performed. + The tensor parallel size considered. + pipeline_parallel_size (`int`, defaults to 1): + The pipeline parallel size considered. """ def meta_init(init_fn): @@ -731,9 +819,9 @@ def wrapper(*args, **kwargs): patching_specs = [ ("torch.nn.Embedding.__init__", meta_init_patch), ("torch.nn.Linear.__init__", meta_init_patch), - ("transformers.modeling_utils.PreTrainedModel.from_pretrained", from_pretrained_for_tp), + ("transformers.modeling_utils.PreTrainedModel.from_pretrained", from_pretrained_for_mp), ] - if tensor_parallel_size > 1: + if tensor_parallel_size > 1 or pipeline_parallel_size > 1: patcher = Patcher(patching_specs=patching_specs) else: patcher = contextlib.nullcontext() @@ -753,6 +841,21 @@ def make_optimizer_constructor_lazy(optimizer_cls: Type["torch.optim.Optimizer"] def optimizer_constructor(*args, **kwargs): optimizer_with_no_parameters = optimizer_cls([torch.nn.Parameter(torch.empty(1))], *args[1:], **kwargs) + # It is necessary to make sure that what's holding the parameters is not an iterator, otherwise it can lead to + # unexpected behaviour since each entry will be evaluated at iteration time. There are 2 possibilities: + # 1. args[0] holds the parameters + # 2. args[0] holds a list of parameter groups + parameters_or_parameter_groups = args[0] + if not isinstance(parameters_or_parameter_groups, list): + parameters_or_parameter_groups = list(parameters_or_parameter_groups) + if isinstance(parameters_or_parameter_groups[0], dict): + # It means that parameter groups were provided. We iterate over each group and make sure that the + # `"params"` entry is not an iterator. + for group in parameters_or_parameter_groups: + if not isinstance(group["params"], list): + group["params"] = list(group["params"]) + + args = (parameters_or_parameter_groups,) + args[1:] optimizer_with_no_parameters._args_to_recreate = (args, kwargs) return optimizer_with_no_parameters diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py index e464d0f02..c066ae797 100755 --- a/optimum/neuron/trainers.py +++ b/optimum/neuron/trainers.py @@ -14,11 +14,14 @@ # limitations under the License. 
"""Defines Trainer subclasses to perform training on AWS Neuron instances.""" -import contextlib import copy import glob +import math import os import random +import shutil +import sys +import time import warnings from pathlib import Path from tempfile import TemporaryDirectory @@ -26,41 +29,59 @@ import numpy as np import torch +from accelerate import __version__ as accelerate_version from packaging import version from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, TrainingArguments +from transformers.debug_utils import DebugOption, DebugUnderflowOverflow +from transformers.integrations import hp_params from transformers.modeling_utils import unwrap_model +from transformers.pytorch_utils import is_torch_less_than_1_11 from transformers.trainer import ( OPTIMIZER_NAME, SCHEDULER_NAME, TRAINER_STATE_NAME, TRAINING_ARGS_NAME, ) +from transformers.trainer_callback import TrainerState from transformers.trainer_pt_utils import ( + IterableDatasetShard, + find_batch_size, + get_dataloader_sampler, + nested_concat, + nested_numpify, reissue_pt_warnings, ) -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, EvalLoopOutput, has_length -from transformers.utils import WEIGHTS_NAME, is_sagemaker_mp_enabled +from transformers.trainer_utils import ( + PREFIX_CHECKPOINT_DIR, + EvalLoopOutput, + EvalPrediction, + HPSearchBackend, + TrainOutput, + denumpify_detensorize, + has_length, + speed_metrics, +) +from transformers.training_args import ParallelMode +from transformers.utils import WEIGHTS_NAME, is_apex_available, is_sagemaker_mp_enabled from ..utils import check_if_transformers_greater, logging from .accelerate import NeuronAccelerator, NeuronDistributedType -from .distributed import ParallelizersManager +from .distributed import Parallelizer, ParallelizersManager from .distributed.utils import make_optimizer_constructor_lazy from .trainer_callback import NeuronCacheCallback from .utils import ( - DynamicPatch, - ModelPatcher, Patcher, is_torch_xla_available, patch_within_function, ) from .utils.cache_utils import get_neuron_cache_path, set_neuron_cache_path +from .utils.require_utils import requires_neuronx_distributed from .utils.training_utils import ( TRANSFORMERS_MIN_VERSION_USE_ACCELERATE, get_model_param_count, is_precompilation, is_topology_supported, patch_generation_mixin_to_neuron_generation_mixin, - patched_finfo, prepare_environment_for_neuron, set_neuron_cc_optlevel_for_model, skip_first_batches, @@ -68,8 +89,15 @@ ) +if is_apex_available(): + from apex import amp + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + if is_torch_xla_available(): import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met if is_sagemaker_mp_enabled(): from smdistributed.modelparallel import __version__ as SMP_VERSION @@ -79,7 +107,6 @@ else: IS_SAGEMAKER_MP_POST_1_10 = False - logger = logging.get_logger("transformers.trainer") KEEP_HF_HUB_PROGRESS_BARS = os.environ.get("KEEP_HF_HUB_PROGRESS_BARS") @@ -94,16 +121,6 @@ _TCP_STORE_PORT = 5000 -MODEL_PATCHING_SPECS = [ - ("config.layerdrop", 0), - ("no_sync", lambda: contextlib.nullcontext()), - ( - "forward", - DynamicPatch(patch_within_function(("torch.finfo", patched_finfo))), - ), -] - - if os.environ.get("TORCHELASTIC_RUN_ID"): import torch_xla.distributed.xla_backend as xbn @@ -178,7 +195,7 @@ def __init__(self, *args, **kwargs): logger.setLevel(logging.INFO) push = self.args.local_rank <= 0 and not is_precompilation() and not self.args.skip_cache_push - fetch = 
self.args.local_rank <= 0 or self.args.tp_plugin.should_parallelize + fetch = self.args.local_rank <= 0 or self.args.mp_plugin.should_parallelize callback = NeuronCacheCallback( tmp_neuron_cache=_TMP_NEURON_CACHE_PATH, @@ -196,11 +213,8 @@ def __init__(self, *args, **kwargs): set_neuron_cc_optlevel_for_model(self.model, optlevel=self.args.neuron_cc_optlevel) @property - def tp_enabled(self): - return ( - check_if_transformers_greater("4.30.0") - and self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM - ) + def mp_enabled(self): + return self.accelerator.distributed_type is NeuronDistributedType.MODEL_PARALLELISM def prepare_args_for_precompilation(self, args: "TrainingArguments"): if args.num_train_epochs != 1: @@ -221,7 +235,7 @@ def create_accelerator_and_postprocess(self): self.accelerator = NeuronAccelerator( deepspeed_plugin=self.args.deepspeed_plugin, gradient_accumulation_steps=self.args.gradient_accumulation_steps, - tp_plugin=self.args.tp_plugin, + mp_plugin=self.args.mp_plugin, zero_1=self.args.zero_1, ) @@ -246,12 +260,9 @@ def create_accelerator_and_postprocess(self): ds_plugin.hf_ds_config.trainer_config_process(self.args) def _wrap_model(self, model, training=True, dataloader=None): - patching_specs = [] - for spec in MODEL_PATCHING_SPECS: - patching_specs.append((model,) + spec) - - with ModelPatcher(patching_specs, ignore_missing_attributes=True): - return super()._wrap_model(model, training=training, dataloader=dataloader) + return super()._wrap_model( + self.accelerator.patch_model_for_neuron(model), training=training, dataloader=dataloader + ) # TODO: make this cleaner. def trigger_on_step_middle_for_neuron_cache_callback(self, model: "PreTrainedModel"): @@ -269,7 +280,7 @@ def trigger_on_step_middle_for_neuron_cache_callback(self, model: "PreTrainedMod callback.on_step_middle(self.args, self.state, self.control, **kwargs) def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: - if self.tp_enabled: + if self.mp_enabled: if self.train_dataset is None or not has_length(self.train_dataset): return None @@ -285,7 +296,7 @@ def _get_eval_sampler(self, eval_dataset: torch.utils.data.Dataset) -> Optional[ @staticmethod def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: optimizer_cls, optimizer_kwargs = transformers_get_optimizer_cls_and_kwargs(args) - lazy_load = args.tp_plugin.should_parallelize or args.zero_1 + lazy_load = args.mp_plugin.should_parallelize or args.zero_1 if check_if_transformers_greater("4.30.0") and lazy_load: optimizer_cls = make_optimizer_constructor_lazy(optimizer_cls) return optimizer_cls, optimizer_kwargs @@ -294,11 +305,47 @@ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: def create_optimizer(self): return super().create_optimizer() + def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]: + # When pipeline parallelism is enabled, we should not put any tensor on device. + # It is handled by the NxDPPModel class. 
+ if self.args.mp_plugin.pipeline_parallel_size > 1: + return data + return super()._prepare_input(data) + def compute_loss(self, model, inputs, return_outputs: bool = False): self.state.last_inputs = inputs self.trigger_on_step_middle_for_neuron_cache_callback(model) + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model, NxDPPModel): + inputs = self._prepare_inputs(inputs) + loss = model.run_train(**inputs) + return loss + return super().compute_loss(model, inputs, return_outputs=return_outputs) + def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model, NxDPPModel): + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_size, + ) + + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs) + + if get_pipeline_model_parallel_rank() != get_pipeline_model_parallel_size() - 1: + use_bf16 = os.environ.get("XLA_USE_BF16", False) or os.environ.get("XLA_DOWNCAST_BF16", False) + dtype = torch.bfloat16 if use_bf16 else torch.float32 + loss = torch.tensor(0, dtype=dtype).to(xm.xla_device()) + else: + loss = loss.detach() + return loss / self.args.gradient_accumulation_steps + return super().training_step(model, inputs) + + @requires_neuronx_distributed def prediction_step( self, model: torch.nn.Module, @@ -306,21 +353,21 @@ def prediction_step( prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + from neuronx_distributed.pipeline import NxDPPModel + self.state.last_inputs = inputs self.trigger_on_step_middle_for_neuron_cache_callback(model) - return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - @patch_within_function(("transformers.trainer.get_model_param_count", get_model_param_count)) - def _inner_training_loop( - self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None - ): - return super()._inner_training_loop( - batch_size=batch_size, - args=args, - resume_from_checkpoint=resume_from_checkpoint, - trial=trial, - ignore_keys_for_eval=ignore_keys_for_eval, - ) + if isinstance(model, NxDPPModel): + if not prediction_loss_only: + raise ValueError("Only the prediction loss can be returned when doing pipeline parallelism.") + loss = model.run_eval(**inputs) + if loss is None: + use_bf16 = os.environ.get("XLA_USE_BF16", False) or os.environ.get("XLA_DOWNCAST_BF16", False) + dtype = torch.bfloat16 if use_bf16 else torch.float32 + loss = torch.tensor(0, dtype=dtype).to(xm.xla_device()) + return (loss, None, None) + return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): if self.control.should_log: @@ -328,20 +375,36 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for xm.mark_step() - if self.args.tp_plugin.tensor_parallel_size > 1: + if self.args.mp_plugin.should_parallelize: from neuronx_distributed.parallel_layers.parallel_state import ( get_data_parallel_group, get_data_parallel_size, + get_pipeline_model_parallel_group, + get_pipeline_model_parallel_size, ) dp_size = get_data_parallel_size() + pp_size = get_pipeline_model_parallel_size() tr_loss_div = tr_loss / dp_size - tr_loss_scalar = xm.all_reduce( - 
xm.REDUCE_SUM, - tr_loss_div, - groups=get_data_parallel_group(as_list=True), - ) - tr_loss_scalar = tr_loss_scalar.detach().item() + + if pp_size > 1: + tr_loss_div = xm.all_reduce( + xm.REDUCE_SUM, tr_loss_div, groups=get_data_parallel_group(as_list=True) + ) + tr_loss_div = xm.all_reduce( + xm.REDUCE_SUM, + tr_loss_div, + groups=get_pipeline_model_parallel_group(as_list=True), + ) + xm.mark_step() + tr_loss_scalar = tr_loss_div.item() + else: + tr_loss_scalar = xm.all_reduce( + xm.REDUCE_SUM, + tr_loss_div, + groups=get_data_parallel_group(as_list=True), + ) + tr_loss_scalar = tr_loss_scalar.detach().item() else: # all_gather + mean() to get average loss over all processes tr_loss_scalar = self._nested_gather(tr_loss).mean().item() @@ -395,20 +458,20 @@ def _save_xla(self, output_dir: Optional[str] = None): # Save a trained model and configuration using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` xm.rendezvous("saving_checkpoint") - if self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + if self.accelerator.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: logger.info("Model parallelism is enabled, only saving the model sharded state dict.") + # TODO: how to handle pp? if isinstance(self.model, PreTrainedModel): from neuronx_distributed.parallel_layers.parallel_state import get_tensor_model_parallel_size config = copy.deepcopy(self.model.config) - if self.args.tp_plugin.parallelize_embeddings: + if self.args.mp_plugin.parallelize_embeddings: config.vocab_size = config.vocab_size * get_tensor_model_parallel_size() config.save_pretrained(output_dir) - parallelizer = ParallelizersManager.parallelizer_for_model(self.model) # This mark_step is needed to avoid hang issues. xm.mark_step() - parallelizer.save_model_checkpoint(self.model, output_dir, as_sharded=True, optimizer=self.optimizer) + Parallelizer.save_model_checkpoint(self.model, output_dir, as_sharded=True, optimizer=self.optimizer) else: safe_save_function_patcher = Patcher( [("transformers.modeling_utils.safe_save_file", torch_xla_safe_save_file)] @@ -468,8 +531,9 @@ def _save_checkpoint(self, model, trial, metrics=None): self.save_model(output_dir, _internal_call=True) # The optimizer state is saved in the shard alongside with the model parameters when doing TP. - if self.accelerator.distributed_type is not NeuronDistributedType.TENSOR_PARALLELISM: + if self.accelerator.distributed_type is not NeuronDistributedType.MODEL_PARALLELISM: xm.rendezvous("saving_optimizer_states") + # TODO: how to handle pp? xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) with warnings.catch_warnings(record=True) as caught_warnings: @@ -523,9 +587,10 @@ def _save_checkpoint(self, model, trial, metrics=None): def _load_from_checkpoint(self, resume_from_checkpoint, model=None): # It has been handled during model parallelization. - if self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + # TODO: how to handle pp? 
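The loss-reduction pattern added to `_maybe_log_save_evaluate` above can be summarized with a small, hedged sketch (assuming `torch_xla` and `neuronx_distributed` are installed and model parallelism has been initialized; the helper name is illustrative, not part of this PR). Only the last pipeline stage returns a real loss from `training_step`, while the other ranks contribute zeros, so summing over the pipeline-parallel group after averaging over the data-parallel ranks recovers the full value:

```python
import torch
import torch_xla.core.xla_model as xm
from neuronx_distributed.parallel_layers.parallel_state import (
    get_data_parallel_group,
    get_data_parallel_size,
    get_pipeline_model_parallel_group,
    get_pipeline_model_parallel_size,
)


def reduce_loss_for_logging(tr_loss: torch.Tensor) -> float:
    """Illustrative reduction of the accumulated training loss before logging it."""
    # Average over data-parallel ranks: divide locally, then sum across the DP group.
    loss = tr_loss / get_data_parallel_size()
    loss = xm.all_reduce(xm.REDUCE_SUM, loss, groups=get_data_parallel_group(as_list=True))
    # With pipeline parallelism, non-final stages hold a zero loss, so a sum over the
    # PP group simply gathers the real value from the last stage.
    if get_pipeline_model_parallel_size() > 1:
        loss = xm.all_reduce(xm.REDUCE_SUM, loss, groups=get_pipeline_model_parallel_group(as_list=True))
    xm.mark_step()  # materialize the lazy XLA graph before reading the scalar
    return loss.detach().item()
```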
+ if self.accelerator.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: return - super()._load_from_checkpoint(self, resume_from_checkpoint, model=model) + super()._load_from_checkpoint(resume_from_checkpoint, model=model) def _load_optimizer_and_scheduler_for_xla_fsdp(self, checkpoint): checkpoint_file_exists = ( @@ -549,7 +614,7 @@ def _load_optimizer_and_scheduler(self, checkpoint): return if self.accelerator.distributed_type is NeuronDistributedType.XLA_FSDP: return self._load_optimizer_and_scheduler_for_xla_fsdp(checkpoint) - elif self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM: + elif self.accelerator.distributed_type is NeuronDistributedType.MODEL_PARALLELISM: lr_scheduler_state = torch.load(os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu") xm.send_cpu_data_to_device(lr_scheduler_state, self.args.device) self.lr_scheduler.load_state_dict(lr_scheduler_state) @@ -559,18 +624,461 @@ def _load_optimizer_and_scheduler(self, checkpoint): else: return super()._load_optimizer_and_scheduler(checkpoint) - @patch_within_function(("transformers.trainer.skip_first_batches", skip_first_batches)) + @requires_neuronx_distributed def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): - return super()._inner_training_loop( - batch_size=batch_size, - args=args, - resume_from_checkpoint=resume_from_checkpoint, - trial=trial, - ignore_keys_for_eval=ignore_keys_for_eval, + from neuronx_distributed.pipeline import NxDPPModel + + self.accelerator.free_memory() + self._train_batch_size = batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") + # Data loader and number of training steps + train_dataloader = self.get_train_dataloader() + + # Setting up training control variables: + # number of training epochs: num_train_epochs + # number of training steps per epoch: num_update_steps_per_epoch + # total number of training steps to execute: max_steps + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size + + len_dataloader = None + num_train_tokens = None + if has_length(train_dataloader): + len_dataloader = len(train_dataloader) + num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps + num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) + num_examples = self.num_examples(train_dataloader) + if args.max_steps > 0: + max_steps = args.max_steps + num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( + args.max_steps % num_update_steps_per_epoch > 0 + ) + # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's + # the best we can do. + num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = ( + self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + ) + else: + max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) + num_train_epochs = math.ceil(args.num_train_epochs) + num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs + elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size + max_steps = args.max_steps + # Setting a very large number of epochs so we go as many times as necessary over the iterator. 
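+ # (With `sys.maxsize` epochs, the training loop below is effectively bounded only by `max_steps`.)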
+ num_train_epochs = sys.maxsize + num_update_steps_per_epoch = max_steps + num_examples = total_train_batch_size * args.max_steps + num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + else: + raise ValueError( + "args.max_steps must be set to a positive value if dataloader does not have a length, was" + f" {args.max_steps}" + ) + + if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: + if self.args.n_gpu > 1: + # nn.DataParallel(model) replicates the model, creating new variables and module + # references registered here no longer work on other gpus, breaking the module + raise ValueError( + "Currently --debug underflow_overflow is not supported under DP. Please use DDP" + " (torch.distributed.launch)." + ) + else: + debug_overflow = DebugUnderflowOverflow(self.model) # noqa + + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled + + # We need to reset the scheduler, as its parameters may be different on subsequent calls + if self._created_lr_scheduler: + self.lr_scheduler = None + self._created_lr_scheduler = False + + if not delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + self.state = TrainerState() + self.state.is_hyper_param_search = trial is not None + + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + + # Activate gradient checkpointing if needed + if args.gradient_checkpointing: + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) + + model = self._wrap_model(self.model_wrapped) + + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. 
+ model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + if isinstance(model, NxDPPModel): + self.model = model + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + # Check if saved optimizer or scheduler states exist + self._load_optimizer_and_scheduler(resume_from_checkpoint) + + # important: at this point: + # self.model is the Transformers Model + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. + + # Train! + logger.info("***** Running training *****") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") + + self.state.epoch = 0 + start_time = time.time() + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + steps_trained_progress_bar = None + + # Check if continuing training from a checkpoint + if resume_from_checkpoint is not None and os.path.isfile( + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) + ): + self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + epochs_trained = self.state.global_step // num_update_steps_per_epoch + if not args.ignore_data_skip: + steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) + steps_trained_in_current_epoch *= args.gradient_accumulation_steps + else: + steps_trained_in_current_epoch = 0 + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") + logger.info(f" Continuing training from epoch {epochs_trained}") + logger.info(f" Continuing training from global step {self.state.global_step}") + if not args.ignore_data_skip: + logger.info( + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." + ) + + # Update the references + self.callback_handler.model = self.model + self.callback_handler.optimizer = self.optimizer + self.callback_handler.lr_scheduler = self.lr_scheduler + self.callback_handler.train_dataloader = train_dataloader + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. 
+ self.state.trial_name = self.hp_name(self._trial) + if trial is not None: + assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial + self.state.trial_params = hp_params(assignments) + else: + self.state.trial_params = None + # This should be the same if the state has been saved but in case the training arguments changed, it's safer + # to set this after the load. + self.state.max_steps = max_steps + self.state.num_train_epochs = num_train_epochs + self.state.is_local_process_zero = self.is_local_process_zero() + self.state.is_world_process_zero = self.is_world_process_zero() + + # tr_loss is a tensor to avoid synchronization of TPUs through .item() + tr_loss = torch.tensor(0.0).to(args.device) + # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses + self._total_loss_scalar = 0.0 + self._globalstep_last_logged = self.state.global_step + + # It should be equivalent but prefer to use the `zero_grad` method from the optimizer when doing pipeline + # parallelism. + if isinstance(model, NxDPPModel): + self.optimizer.zero_grad() + else: + model.zero_grad() + + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. + if not args.ignore_data_skip: + for epoch in range(epochs_trained): + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [torch.utils.data.RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) + if is_torch_less_than_1_11 or not is_random_sampler: + # We just need to begin an iteration to create the randomization of the sampler. + for _ in train_dataloader: + break + else: + # Otherwise we need to call the whooooole sampler cause there is some random operation added + # AT THE VERY END! + sampler = sampler if sampler is not None else [] + _ = list(sampler) + + total_batched_samples = 0 + for epoch in range(epochs_trained, num_train_epochs): + epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) + + # Reset the past mems state at the beginning of each epoch if necessary. 
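+ # (`args.past_index` marks which model output, e.g. XLNet/Transformer-XL `mems`, is fed back as past state on the next step.)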
+ if args.past_index >= 0: + self._past = None + + steps_in_epoch = ( + len(epoch_iterator) + if len_dataloader is not None + else args.max_steps * args.gradient_accumulation_steps + ) + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + + if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + + step = -1 + for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + + # Skip past any already trained steps if resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(1) + if steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + continue + elif steps_trained_progress_bar is not None: + steps_trained_progress_bar.close() + steps_trained_progress_bar = None + + if step % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) + + with self.accelerator.accumulate(model): + tr_loss_step = self.training_step(model, inputs) + + if ( + args.logging_nan_inf_filter + and not is_torch_xla_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): + # if loss is nan or inf simply add the average of previous logged losses + tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) + else: + tr_loss += tr_loss_step + + self.current_flos += float(self.floating_point_ops(inputs)) + + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) + + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or + # last step in epoch but step is always smaller than gradient_accumulation_steps + is_last_step_and_steps_less_than_grad_acc + ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. 
+ if is_last_step_and_steps_less_than_grad_acc or ( + version.parse(accelerate_version) <= version.parse("0.20.3") + ): + self.accelerator.gradient_state._set_sync_gradients(True) + + # Gradient clipping + if args.max_grad_norm is not None and args.max_grad_norm > 0: + # deepspeed does its own clipping + + if is_sagemaker_mp_enabled() and args.fp16: + self.optimizer.clip_master_grads(args.max_grad_norm) + elif self.use_apex: + # Revert to normal clipping otherwise, handling Apex or full precision + torch.nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + self.accelerator.clip_grad_norm_( + model.parameters(), + args.max_grad_norm, + ) + + # Optimizer step + self.optimizer.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() + + # It should be equivalent but prefer to use the `zero_grad` method from the optimizer when doing + # pipeline parallelism. + if isinstance(model, NxDPPModel): + self.optimizer.zero_grad() + else: + model.zero_grad() + + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) + else: + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) + + if self.control.should_epoch_stop or self.control.should_training_stop: + break + if step < 0: + logger.warning( + "There seems to be not a single sample in your epoch_iterator, stopping training at step" + f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" + f" num_steps ({max_steps}) higher than the number of available samples." + ) + self.control.should_training_stop = True + + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) + + if DebugOption.TPU_METRICS_DEBUG in self.args.debug: + if is_torch_xla_available(): + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + else: + logger.warning( + "You enabled PyTorch/XLA debug metrics but you don't have a TPU " + "configured. Check your training configuration if this is unexpected." + ) + if self.control.should_training_stop: + break + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of training + delattr(self, "_past") + + logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") + if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: + # Wait for everyone to get here so we are sure the model has been saved by process 0. 
+ if is_torch_xla_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: + torch.distributed.barrier() + elif is_sagemaker_mp_enabled(): + smp.barrier() + + self._load_best_model() + + # add remaining tr_loss + self._total_loss_scalar += tr_loss.item() + train_loss = self._total_loss_scalar / self.state.global_step + + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, ) + self.store_flos() + metrics["total_flos"] = self.state.total_flos + metrics["train_loss"] = train_loss + + self.is_in_train = False + + self._memory_tracker.stop_and_update_metrics(metrics) + + self.log(metrics) + + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + + return TrainOutput(self.state.global_step, train_loss, metrics) + + @requires_neuronx_distributed def evaluation_loop( self, dataloader: torch.utils.data.DataLoader, @@ -579,19 +1087,233 @@ def evaluation_loop( ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + from neuronx_distributed.parallel_layers.parallel_state import get_data_parallel_size + from neuronx_distributed.pipeline import NxDPPModel + # This will prepare the model if it was not prepared before. # This is needed for example for TP when we performing only evaluation (no training): # 1. The model needs to be loaded if it was lazy loaded. # 2. The model needs to be parallelized. 
- self.accelerator.prepare_model(self.model) - - return super().evaluation_loop( - dataloader, - description, - prediction_loss_only=prediction_loss_only, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) + model = self.accelerator.prepare_model(self.model) + + args = self.args + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only + + is_nxdppmodel = isinstance(model, NxDPPModel) + if not is_nxdppmodel: + model = self._wrap_model(model, training=False, dataloader=dataloader) + + if len(self.accelerator._models) == 0 and model is self.model: + model = ( + self.accelerator.prepare(model) + if self.is_deepspeed_enabled + else self.accelerator.prepare_model(model, evaluation_mode=True) + ) + + if self.is_fsdp_enabled: + self.model = model + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called + # while ``train`` is running, cast it to the right dtype first and then put on device + if not self.is_in_train and not is_nxdppmodel: + if args.fp16_full_eval: + model = model.to(dtype=torch.float16, device=args.device) + elif args.bf16_full_eval: + model = model.to(dtype=torch.bfloat16, device=args.device) + + batch_size = self.args.eval_batch_size + + logger.info(f"***** Running {description} *****") + dp_size = get_data_parallel_size() + logger.info(f" Num data parallel workers = {dp_size}") + if has_length(dataloader): + num_examples = self.num_examples(dataloader) + total_num_examples = num_examples * dp_size + logger.info(f" Per data parallel worker num examples = {num_examples}") + logger.info(f" Total num examples = {total_num_examples}") + else: + logger.info(" Num examples: Unknown") + logger.info(f" Batch size = {batch_size}") + + if not is_nxdppmodel: + model.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = getattr(dataloader, "dataset", None) + + if args.past_index >= 0: + self._past = None + + # Initialize containers + # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) + losses_host = None + preds_host = None + labels_host = None + inputs_host = None + + # losses/preds/labels on CPU (final containers) + all_losses = None + all_preds = None + all_labels = None + all_inputs = None + # Will be useful when we have an iterable dataset so don't know its length. + + observed_num_examples = 0 + # Main evaluation loop + for step, inputs in enumerate(dataloader): + # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + # For batch samplers, batch_size is not known by the dataloader in advance. + if batch_size is None: + batch_size = observed_batch_size + + if is_nxdppmodel and observed_batch_size % model.num_microbatches != 0: + if xm.get_local_ordinal() == 0: + logger.warning( + "Skipping the evaluation step because the pipeline number of microbatches " + f"({model.num_microbatches}) does not divide the batch size ({observed_batch_size})." 
+ ) + continue + + # Prediction step + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + main_input_name = getattr(model, "main_input_name", "input_ids") + inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None + + xm.mark_step() + + # Update containers on host + if loss is not None: + losses = self.accelerator.gather_for_metrics((loss.repeat(batch_size))) + losses_host = losses if losses_host is None else nested_concat(losses_host, losses, padding_index=-100) + if labels is not None: + labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) + if inputs_decode is not None: + inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) + inputs_decode = self.accelerator.gather_for_metrics((inputs_decode)) + inputs_host = ( + inputs_decode + if inputs_host is None + else nested_concat(inputs_host, inputs_decode, padding_index=-100) + ) + if logits is not None: + logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) + if self.preprocess_logits_for_metrics is not None: + logits = self.preprocess_logits_for_metrics(logits, labels) + logits = self.accelerator.gather_for_metrics((logits)) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + + if labels is not None: + labels = self.accelerator.gather_for_metrics((labels)) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if ( + args.eval_accumulation_steps is not None + and (step + 1) % args.eval_accumulation_steps == 0 + and (self.accelerator.sync_gradients or version.parse(accelerate_version) > version.parse("0.20.3")) + ): + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode + if all_inputs is None + else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = ( + labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + ) + + # Set back to None to begin a new accumulation + losses_host, preds_host, inputs_host, labels_host = None, None, None, None + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if inputs_host is not None: + inputs_decode = nested_numpify(inputs_host) + all_inputs = ( + inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) + ) + if 
labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + # Number of samples + if has_length(eval_dataset): + num_samples = len(eval_dataset) + # The instance check is weird and does not actually check for the type, but whether the dataset has the right + # methods. Therefore we need to make sure it also has the attribute. + elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0: + num_samples = eval_dataset.num_examples + else: + if has_length(dataloader): + num_samples = self.num_examples(dataloader) + else: # both len(dataloader.dataset) and len(dataloader) fail + num_samples = observed_num_examples + if num_samples == 0 and observed_num_examples > 0: + num_samples = observed_num_examples + + # Metrics! + if self.compute_metrics is not None and all_preds is not None and all_labels is not None: + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) + ) + else: + metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if all_losses is not None: + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() + if hasattr(self, "jit_compilation_time"): + metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) class NeuronTrainer(AugmentTrainerForNeuronMixin, Trainer): diff --git a/optimum/neuron/training_args.py b/optimum/neuron/training_args.py index 415148057..33c6a60ff 100644 --- a/optimum/neuron/training_args.py +++ b/optimum/neuron/training_args.py @@ -36,7 +36,7 @@ from ..utils import check_if_transformers_greater, logging from .accelerate import NeuronAcceleratorState, NeuronPartialState -from .accelerate.utils import TensorParallelismPlugin, patch_accelerate_is_tpu_available +from .accelerate.utils import ModelParallelismPlugin, patch_accelerate_is_tpu_available from .utils import is_accelerate_available, is_torch_xla_available from .utils.training_utils import TRANSFORMERS_MIN_VERSION_FOR_XLA_FSDP @@ -80,6 +80,14 @@ class NeuronTrainingArgumentsMixin: "help": "Specify the level of optimization the Neuron compiler should perform.", }, ) + pipeline_parallel_size: int = field( + default=1, + metadata={"help": "The number of pipeline parallel replicas."}, + ) + pipeline_parallel_num_microbatches: int = field( + default=-1, + metadata={"help": "The number of microbatches used for pipeline execution."}, + ) def __post_init__(self): # Patches accelerate.utils.imports.is_tpu_available to match `is_torch_xla_available` @@ -120,10 +128,27 @@ def __post_init__(self): checkpoint = get_last_checkpoint(self.output_dir) resume_from_checkpoint = checkpoint - self.tp_plugin = TensorParallelismPlugin( + if self.pipeline_parallel_size > 1: + if self.pipeline_parallel_num_microbatches == -1: + self.pipeline_parallel_num_microbatches = self.per_device_train_batch_size + if self.per_device_train_batch_size % 
self.pipeline_parallel_num_microbatches != 0: + raise ValueError( + f"The number of pipeline microbatches ({self.pipeline_parallel_num_microbatches}) divide the total " + f"per-device train batch size ({self.per_device_train_batch_size})." + ) + if self.per_device_eval_batch_size % self.pipeline_parallel_num_microbatches != 0: + raise ValueError( + f"The number of pipeline microbatches ({self.pipeline_parallel_num_microbatches}) divide the total " + f"per-device eval batch size ({self.per_device_eval_batch_size})." + ) + + self.mp_plugin = ModelParallelismPlugin( self.tensor_parallel_size, - not self.disable_embedding_parallelization, + parallelize_embeddings=not self.disable_embedding_parallelization, sequence_parallel_enabled=not self.disable_sequence_parallel, + pipeline_parallel_size=self.pipeline_parallel_size, + pipeline_parallel_num_microbatches=self.pipeline_parallel_num_microbatches, + pipeline_parallel_use_zero1_optimizer=self.zero_1, checkpoint_dir=resume_from_checkpoint, ) super().__post_init__() @@ -228,13 +253,13 @@ def _setup_devices(self) -> "torch.device": @property def place_model_on_device(self): - return not self.tp_plugin.should_parallelize and super().place_model_on_device + return not self.mp_plugin.should_parallelize and super().place_model_on_device @property def world_size(self): divisor = 1 - if self.tp_plugin.should_parallelize: - divisor = self.tp_plugin.tensor_parallel_size + if self.mp_plugin.should_parallelize: + divisor = self.mp_plugin.tensor_parallel_size * self.mp_plugin.pipeline_parallel_size return super().world_size // divisor diff --git a/optimum/neuron/utils/cache_utils.py b/optimum/neuron/utils/cache_utils.py index 698dde5e0..d68aa4642 100644 --- a/optimum/neuron/utils/cache_utils.py +++ b/optimum/neuron/utils/cache_utils.py @@ -21,7 +21,6 @@ import os import re import shutil -import subprocess import tempfile from dataclasses import InitVar, asdict, dataclass, field from pathlib import Path @@ -33,9 +32,9 @@ from huggingface_hub import ( CommitOperationAdd, HfApi, - HfFolder, RepoUrl, create_repo, + get_token, hf_hub_download, whoami, ) @@ -45,8 +44,8 @@ from ...utils import logging from ...utils.logging import warn_once -from .constant import NEURON_BINARIES_PATH from .misc import is_main_worker, string_to_bool +from .require_utils import requires_neuronx_distributed from .version_utils import get_neuronxcc_version @@ -137,7 +136,7 @@ def is_private_repo(repo_id: str) -> bool: if _DISABLE_IS_PRIVATE_REPO_CHECK: return False try: - HfApi().model_info(repo_id=repo_id, token=HfFolder.get_token()) + HfApi().model_info(repo_id=repo_id, token=get_token()) private_to_user = False except RepositoryNotFoundError: private_to_user = True @@ -260,15 +259,12 @@ def set_neuron_cache_path(neuron_cache_path: Union[str, Path], ignore_no_cache: def get_num_neuron_cores() -> int: - path = os.environ["PATH"] - if NEURON_BINARIES_PATH not in path: - path = f"{NEURON_BINARIES_PATH}:{path}" - os.environ["PATH"] = path - proc = subprocess.Popen(["neuron-ls", "-j"], stdout=subprocess.PIPE) - stdout, _ = proc.communicate() - stdout = stdout.decode("utf-8") - json_stdout = json.loads(stdout) - return sum(neuron_device_info["nc_count"] for neuron_device_info in json_stdout) + neuron_devices_path = Path("/sys/class/neuron_device/") + if not neuron_devices_path.is_dir(): + num_cores = 0 + else: + num_cores = len(list(neuron_devices_path.iterdir())) * 2 + return num_cores def get_num_neuron_cores_used() -> int: @@ -656,6 +652,9 @@ class NeuronHash: tensor_parallel_size: 
Union[int, _UnspecifiedHashAttribute] = field( default_factory=_UnspecifiedHashAttribute.with_args(min_optimum_neuron_version="0.0.8", default=1) ) + pipeline_parallel_size: Union[int, _UnspecifiedHashAttribute] = field( + default_factory=_UnspecifiedHashAttribute.with_args(min_optimum_neuron_version="0.0.17", default=1) + ) _model_name_or_path: Optional[str] = None _is_private: Optional[bool] = None _model_type: Optional[str] = None @@ -739,11 +738,19 @@ def compute_sha512_hash(self, *buffers: bytes) -> str: hash_.update(buffer) return hash_.hexdigest() + @requires_neuronx_distributed def compute_hash(self, model: Optional["PreTrainedModel"] = None) -> Tuple[str, str]: if self._hash.is_empty: if model is None: raise ValueError("A model must be specified the first time the hash is computed.") - model_hash = self.compute_sha512_hash(self.state_dict_to_bytes(model.state_dict())) + + from neuronx_distributed.pipeline import NxDPPModel + + if isinstance(model, NxDPPModel): + state_dict = model.local_state_dict() + else: + state_dict = model.state_dict() + model_hash = self.compute_sha512_hash(self.state_dict_to_bytes(state_dict)) hash_dict = asdict(self) hash_dict["model"] = model_hash @@ -756,6 +763,9 @@ def compute_hash(self, model: Optional["PreTrainedModel"] = None) -> Tuple[str, self._insert_potential_unspecified_hash_attribute( "tensor_parallel_size", self.tensor_parallel_size, hash_dict ) + self._insert_potential_unspecified_hash_attribute( + "pipeline_parallel_size", self.tensor_parallel_size, hash_dict + ) self._insert_potential_unspecified_hash_attribute("fsdp", self.fsdp, hash_dict) hash_dict["data_type"] = str(hash_dict["data_type"]).split(".")[1] @@ -817,7 +827,7 @@ def get_cached_model_on_the_hub(neuron_hash: NeuronHash) -> Optional[CachedModel else: revision = "main" try: - repo_filenames = HfApi().list_repo_files(repo_id, revision=revision, token=HfFolder.get_token()) + repo_filenames = HfApi().list_repo_files(repo_id, revision=revision, token=get_token()) except Exception: continue model_files_on_the_hub = [] @@ -974,7 +984,7 @@ def push_to_cache_on_hub( path_in_repo = Path().joinpath(*path_in_repo.parts[1:]) path_in_repo = neuron_hash.cache_path / path_in_repo - repo_filenames = HfApi().list_repo_files(cache_repo_id, token=HfFolder.get_token()) + repo_filenames = HfApi().list_repo_files(cache_repo_id, token=get_token()) path_in_repo_str = path_in_repo.as_posix() if local_cache_dir_or_file.is_dir(): exists = any(filename.startswith(path_in_repo_str) for filename in repo_filenames) diff --git a/optimum/neuron/utils/patching.py b/optimum/neuron/utils/patching.py index b806997dd..3311352a0 100644 --- a/optimum/neuron/utils/patching.py +++ b/optimum/neuron/utils/patching.py @@ -37,20 +37,36 @@ def __init__( self.patching_specs = self.process_patching_specs( patching_specs, ignore_missing_attributes=ignore_missing_attributes ) + self.already_patched = False @abstractmethod def process_patching_specs( self, patching_specs: Optional[List[Tuple[Any, Any]]] = None, ignore_missing_attributes: bool = False - ) -> List[Tuple[Any, str, Any, Any]]: + ) -> List[Tuple[Any, str, Any, Any, bool]]: pass - def __enter__(self): - for module, attribute_name, _, patch in self.patching_specs: + def patch(self): + if self.already_patched: + return + for module, attribute_name, _, patch, _ in self.patching_specs: setattr(module, attribute_name, patch) + self.already_patched = True + + def restore(self): + if not self.already_patched: + return + for module, attribute_name, orig, _, 
should_delete_attribute_at_restore in self.patching_specs: + if should_delete_attribute_at_restore: + delattr(module, attribute_name) + else: + setattr(module, attribute_name, orig) + self.already_patched = False + + def __enter__(self): + return self.patch() def __exit__(self, exc_type, exc_value, traceback): - for module, attribute_name, _, patch in self.patching_specs: - setattr(module, attribute_name, patch) + return self.restore() class DynamicPatch: @@ -103,7 +119,7 @@ def process_patching_specs( ) if isinstance(patch, DynamicPatch): patch = patch(attribute) - proccessed_patching_specs.append((module, attribute_name, attribute, patch)) + proccessed_patching_specs.append((module, attribute_name, attribute, patch, not module_has_attr)) return proccessed_patching_specs @@ -144,7 +160,7 @@ def process_patching_specs( if inspect.ismethod(attribute): patch = patch.__get__(model) - proccessed_patching_specs.append((module, attribute_name, attribute, patch)) + proccessed_patching_specs.append((module, attribute_name, attribute, patch, not module_has_attr)) return proccessed_patching_specs diff --git a/optimum/neuron/utils/runner.py b/optimum/neuron/utils/runner.py index e6790f98b..d738c6f67 100644 --- a/optimum/neuron/utils/runner.py +++ b/optimum/neuron/utils/runner.py @@ -28,7 +28,7 @@ import requests from huggingface_hub import ( HfApi, - HfFolder, + get_token, snapshot_download, ) from transformers import AutoConfig @@ -172,7 +172,7 @@ class ExampleRunner: ], }, "image-classification": { - "dataset_name": "beans", + "dataset_name": "mnist", "extra_command_line_arguments": [ "--remove_unused_columns false", "--ignore_mismatched_sizes", @@ -304,7 +304,7 @@ def install_requirements(self, requirements_filename: Union[str, Path]): self._installed_requirements = True def check_user_logged_in_and_cache_repo_is_set(self): - token = HfFolder.get_token() + token = get_token() if not token: raise RuntimeError( "You need to log in the Hugging Face Hub otherwise you will not be able to push anything. " @@ -333,7 +333,7 @@ def download_model_repo_and_override_config( if not config_overrides: return model_name_or_path - filenames = HfApi().list_repo_files(repo_id=model_name_or_path, token=HfFolder.get_token()) + filenames = HfApi().list_repo_files(repo_id=model_name_or_path, token=get_token()) safetensors_model_file_pattern = re.compile(r"\w+(-[0-9]*-of-[0-9]*)?\.safetensors") allow_patterns = ["*.json", "*.txt"] if any(re.match(safetensors_model_file_pattern, filename) for filename in filenames): @@ -380,6 +380,7 @@ def run( save_total_limit: int = -1, learning_rate: float = 1e-4, tensor_parallel_size: int = 1, + pipeline_parallel_size: int = 1, disable_embedding_parallelization: bool = False, zero_1: bool = False, output_dir: Optional[Union[Path, str]] = None, @@ -417,9 +418,14 @@ def run( self.install_requirements(script_path.parent / "requirements.txt") def compute_max_train_samples( - max_steps: int, num_cores: int, tensor_parallel_size: int, per_device_train_batch_size: int + max_steps: int, + num_cores: int, + tensor_parallel_size: int, + pipeline_parallel_size: int, + per_device_train_batch_size: int, ) -> int: - total_batch_size = (num_cores // tensor_parallel_size) * per_device_train_batch_size + number_of_cores_per_replicas = tensor_parallel_size * pipeline_parallel_size + total_batch_size = (num_cores // number_of_cores_per_replicas) * per_device_train_batch_size total_num_samples = max_steps * total_batch_size # Adding 10% more examples just to make sure. 
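+ # Illustrative numbers: num_cores=32, tensor_parallel_size=8 and pipeline_parallel_size=4 give
+ # 32 // (8 * 4) = 1 data-parallel replica; with per_device_train_batch_size=2 and max_steps=100,
+ # total_num_samples is 100 * 2 = 200, padded below to 220.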
return int(total_num_samples * 1.1) @@ -442,7 +448,9 @@ def compute_max_train_samples( if max_steps is not None: cmd.append(f"--max_steps {max_steps}") max_steps_idx = len(cmd) - 1 - max_train_samples = compute_max_train_samples(max_steps, num_cores, tensor_parallel_size, train_batch_size) + max_train_samples = compute_max_train_samples( + max_steps, num_cores, tensor_parallel_size, pipeline_parallel_size, train_batch_size + ) cmd.append(f"--max_train_samples {max_train_samples}") cmd.append("--do_train") @@ -469,6 +477,8 @@ def compute_max_train_samples( # Parallelism if tensor_parallel_size > 1: cmd.append(f"--tensor_parallel_size {tensor_parallel_size}") + if pipeline_parallel_size > 1: + cmd.append(f"--pipeline_parallel_size {pipeline_parallel_size}") if disable_embedding_parallelization: cmd.append("--disable_embedding_parallelization") if zero_1: diff --git a/optimum/neuron/utils/training_utils.py b/optimum/neuron/utils/training_utils.py index b08f6e6d9..113096237 100644 --- a/optimum/neuron/utils/training_utils.py +++ b/optimum/neuron/utils/training_utils.py @@ -286,7 +286,7 @@ def set_neuron_cc_optlevel_for_model(model: "PreTrainedModel", optlevel: str = " neuron_cc_flags = os.environ.get("NEURON_CC_FLAGS", "") match_ = re.search(r"-O[123]", neuron_cc_flags) if match_: - neuron_cc_flags = neuron_cc_flags[: match_.start(0)] + f"{optlevel}" + neuron_cc_flags[match_.end(1) + 1 :] + neuron_cc_flags = neuron_cc_flags[: match_.start(0)] + f"{optlevel}" + neuron_cc_flags[match_.end(0) + 1 :] else: neuron_cc_flags += f"{optlevel} " os.environ["NEURON_CC_FLAGS"] = neuron_cc_flags diff --git a/tests/cli/test_neuron_cache_cli.py b/tests/cli/test_neuron_cache_cli.py index 67f6dca1b..8b9a7640b 100644 --- a/tests/cli/test_neuron_cache_cli.py +++ b/tests/cli/test_neuron_cache_cli.py @@ -14,14 +14,17 @@ # limitations under the License. import os +import random +import string import subprocess +from pathlib import Path from tempfile import TemporaryDirectory from unittest import TestCase import torch from huggingface_hub import HfApi, create_repo, delete_repo from huggingface_hub.utils import RepositoryNotFoundError -from transformers import BertConfig, BertModel +from transformers import BertConfig, BertModel, BertTokenizer from transformers.testing_utils import is_staging_test from optimum.neuron.utils.cache_utils import ( @@ -39,6 +42,12 @@ from ..utils import StagingTestMixin +# Taken from https://pynative.com/python-generate-random-string/ +def get_random_string(length: int) -> str: + letters = string.ascii_lowercase + return "".join(random.choice(letters) for i in range(length)) + + @is_trainium_test @is_staging_test class TestNeuronCacheCLI(StagingTestMixin, TestCase): @@ -54,7 +63,6 @@ def setUp(self): def tearDown(self): super().tearDown() os.environ["HF_HOME"] = self._hf_home - try: delete_repo(self.default_repo_id, repo_type="model") except RepositoryNotFoundError: @@ -126,65 +134,86 @@ def test_optimum_neuron_cache_set(self): ) def test_optimum_neuron_cache_add(self): - os.environ["CUSTOM_CACHE_REPO"] = self.CUSTOM_CACHE_REPO - # TODO: activate those later. - # Without any sequence length, it should fail. 
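Stepping back to the `set_neuron_cc_optlevel_for_model` fix in `optimum/neuron/utils/training_utils.py` above, a small sketch shows why `match_.end(0)` is the right offset. It assumes `optlevel` holds the full flag (e.g. `-O2`), which is consistent with the `else` branch that appends it directly:

```python
import re

neuron_cc_flags = "--retry_failed_compilation -O1"
optlevel = "-O2"  # assumed format; the slice only relies on it being the replacement text

match_ = re.search(r"-O[123]", neuron_cc_flags)
assert match_ is not None
# The pattern has no capturing group, so `match_.end(1)` raises IndexError("no such group").
# `match_.end(0)` is the end of the whole "-O1" match, which is what the slice needs.
neuron_cc_flags = neuron_cc_flags[: match_.start(0)] + optlevel + neuron_cc_flags[match_.end(0) + 1 :]
print(neuron_cc_flags)  # --retry_failed_compilation -O2
```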
- # command = ( - # "optimum-cli neuron cache add -m bert-base-uncased --task text-classification --train_batch_size 16 " - # "--precision bf16 --num_cores 2" - # ).split() - # p = subprocess.Popen(command, stderr=PIPE) - # _, stderr = p.communicate() - # stderr = stderr.decode("utf-8") - # self.assertIn("either sequence_length or encoder_sequence and decoder_sequence_length", stderr) - - # Without both encoder and decoder sequence lengths, it should fail. - # command = ( - # "optimum-cli neuron cache add -m t5-small --task translation --train_batch_size 16 --precision bf16 " - # "--num_cores 2 --encoder_sequence_length 512" - # ).split() - # p = subprocess.Popen(command, stderr=PIPE) - # _, stderr = p.communicate() - # stderr = stderr.decode("utf-8") - # self.assertIn("Both the encoder_sequence and decoder_sequence_length", stderr) - - bert_model_name = "__DUMMY_OPTIMUM_USER__/tiny-random-BertModel-neuron" - - # With wrong precision value, it should fail. - command = ( - f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " - "--precision wrong --num_cores 2 --sequence_length 128" - ).split() - p = subprocess.Popen(command) - returncode = p.wait() - self.assertNotEqual(returncode, 0) - - # With wrong num_cores value, it should fail. - command = ( - f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " - "--precision bf16 --num_cores 999 --sequence_length 128" - ).split() - p = subprocess.Popen(command) - returncode = p.wait() - self.assertNotEqual(returncode, 0) - - # Non seq2seq model. - command = ( - f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " - "--precision bf16 --num_cores 2 --sequence_length 128" - ).split() - p = subprocess.Popen(command) - returncode = p.wait() - self.assertEqual(returncode, 0) - - # seq2seq model. - command = ( - f"optimum-cli neuron cache add -m {bert_model_name} --task translation --train_batch_size 1 --precision bf16 " - "--num_cores 2 --encoder_sequence_length 12 --decoder_sequence_length 12" - ).split() - p = subprocess.Popen(command) - returncode = p.wait() - self.assertEqual(returncode, 0) + with TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + os.environ["CUSTOM_CACHE_REPO"] = self.CUSTOM_CACHE_REPO + # TODO: activate those later. + # Without any sequence length, it should fail. + # command = ( + # "optimum-cli neuron cache add -m bert-base-uncased --task text-classification --train_batch_size 16 " + # "--precision bf16 --num_cores 2" + # ).split() + # p = subprocess.Popen(command, stderr=PIPE) + # _, stderr = p.communicate() + # stderr = stderr.decode("utf-8") + # self.assertIn("either sequence_length or encoder_sequence and decoder_sequence_length", stderr) + + # Without both encoder and decoder sequence lengths, it should fail. + # command = ( + # "optimum-cli neuron cache add -m t5-small --task translation --train_batch_size 16 --precision bf16 " + # "--num_cores 2 --encoder_sequence_length 512" + # ).split() + # p = subprocess.Popen(command, stderr=PIPE) + # _, stderr = p.communicate() + # stderr = stderr.decode("utf-8") + # self.assertIn("Both the encoder_sequence and decoder_sequence_length", stderr) + + # Create dummy BERT model. 
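+ # A deliberately tiny configuration (2 layers, 2 heads, 100-token vocab) keeps the dummy model cheap to create for these CLI runs.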
+ bert_model_name = tmpdir / "bert_model" + config = BertConfig() + + config.num_hidden_layers = 2 + config.num_attention_heads = 2 + config.vocab_size = 100 + + with open(tmpdir / "vocab.txt", "w") as fp: + fp.write("\n".join(get_random_string(random.randint(10, 20)))) + + tokenizer = BertTokenizer(tmpdir / "vocab.txt") + tokenizer.save_pretrained(bert_model_name) + + model = BertModel(config) + model.save_pretrained(bert_model_name) + + env = dict(os.environ) + env["OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK"] = "1" + + # With wrong precision value, it should fail. + command = ( + f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " + "--precision wrong --num_cores 2 --sequence_length 128" + ).split() + p = subprocess.Popen(command, env=env) + returncode = p.wait() + self.assertNotEqual(returncode, 0) + + # With wrong num_cores value, it should fail. + command = ( + f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " + "--precision bf16 --num_cores 999 --sequence_length 128" + ).split() + p = subprocess.Popen(command, env=env) + returncode = p.wait() + self.assertNotEqual(returncode, 0) + + # Non seq2seq model. + command = ( + f"optimum-cli neuron cache add -m {bert_model_name} --task text-classification --train_batch_size 1 " + "--precision bf16 --num_cores 2 --sequence_length 128" + ).split() + p = subprocess.Popen(command, env=env) + returncode = p.wait() + self.assertEqual(returncode, 0) + + # seq2seq model. + command = ( + f"optimum-cli neuron cache add -m {bert_model_name} --task translation --train_batch_size 1 --precision bf16 " + "--num_cores 2 --encoder_sequence_length 12 --decoder_sequence_length 12" + ).split() + p = subprocess.Popen(command, env=env) + returncode = p.wait() + self.assertEqual(returncode, 0) def test_optimum_neuron_cache_list(self): with TemporaryDirectory() as tmpdirname: diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py new file mode 100644 index 000000000..6efd9aa3a --- /dev/null +++ b/tests/distributed/conftest.py @@ -0,0 +1,43 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +# This hook is run before the default pytest_runtest_call +@pytest.hookimpl(tryfirst=True) +def pytest_runtest_call(item): + # We want to use our own launching function for distributed tests + if getattr(item.cls, "is_dist_test", False): + dist_test_class = item.cls() + dist_test_class(item._request) + item.runtest = lambda: True # Dummy function so test is not run twice + + +# We allow DistributedTest to reuse distributed environments. When the last +# test for a class is run, we want to make sure those distributed environments +# are destroyed. 
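+#
+# Illustrative usage (the class and test below are hypothetical, not part of this PR): a test opts in by
+# subclassing `DistributedTest` from tests/distributed/distributed.py, which sets `is_dist_test = True`
+# and lets the class declare its parallel layout, e.g.:
+#
+#     class TestMyParallelFeature(DistributedTest):
+#         world_size = 2
+#         tp_size = 2
+#         pp_size = 1
+#
+#         def test_something(self):
+#             ...
+#
+# `pytest_runtest_call` above then launches `world_size` Neuron processes for the test instead of
+# running it inline in the pytest process.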
+def pytest_runtest_teardown(item, nextitem): + if getattr(item.cls, "reuse_dist_env", False) and not nextitem: + dist_test_class = item.cls() + for num_procs, pool in dist_test_class._pool_cache.items(): + dist_test_class._close_pool(pool, num_procs, force=True) + + +@pytest.hookimpl(tryfirst=True) +def pytest_fixture_setup(fixturedef, request): + if getattr(fixturedef.func, "is_dist_fixture", False): + dist_fixture_class = fixturedef.func() + dist_fixture_class(request) diff --git a/tests/distributed/distributed.py b/tests/distributed/distributed.py new file mode 100644 index 000000000..690140cd1 --- /dev/null +++ b/tests/distributed/distributed.py @@ -0,0 +1,353 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Defines classes to enable running tests in a distributed setting.""" + +# The following code is copied and adapted from the DeepSpeed repo: +# https://github.com/microsoft/DeepSpeed/blob/master/tests/unit/common.py + +import inspect +import multiprocessing +import os +import socket +import time +import uuid +from abc import ABC, abstractmethod +from typing import List, Union + +import psutil +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from _pytest.fixtures import FixtureLookupError +from _pytest.outcomes import Skipped + +from optimum.neuron.utils.cache_utils import get_num_neuron_cores +from optimum.neuron.utils.import_utils import is_neuronx_distributed_available, is_torch_xla_available + + +if is_torch_xla_available(): + import torch_xla.distributed.xla_backend as xbn + +if is_neuronx_distributed_available(): + import neuronx_distributed + +TEST_TIMEOUT = 600 + + +def is_neuron_environment_available() -> bool: + return get_num_neuron_cores() > 0 + + +def get_xdist_worker_id(): + xdist_worker = os.environ.get("PYTEST_XDIST_WORKER", None) + if xdist_worker is not None: + xdist_worker_id = xdist_worker.replace("gw", "") + return int(xdist_worker_id) + return None + + +def get_master_port(base_port=29500, port_range_size=1000): + xdist_worker_id = get_xdist_worker_id() + if xdist_worker_id is not None: + # Make xdist workers use different port ranges to avoid race conditions + base_port += port_range_size * xdist_worker_id + + # Select first open port in range + port = base_port + max_port = base_port + port_range_size + sock = socket.socket() + while port < max_port: + try: + sock.bind(("", port)) + sock.close() + return str(port) + except OSError: + port += 1 + raise IOError("no free ports") + + +class DistributedExec(ABC): + """ + Base class for distributed execution of functions/methods. Contains common + methods needed for DistributedTest and DistributedFixture (not included in this file). 
+ """ + + world_size: Union[int, List[int]] = 2 + tp_size: int = 1 + pp_size: int = 1 + backend: str = "xla" + init_distributed: bool = True + set_dist_env: bool = True + requires_neuron_environment: bool = True + reuse_dist_env: bool = False + _pool_cache = {} + exec_timeout: int = TEST_TIMEOUT + + @abstractmethod + def run(self): + ... + + def __call__(self, request=None): + self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) + world_size = self.world_size + if self.requires_neuron_environment and not is_neuron_environment_available(): + pytest.skip("Only supported in a Neuron environment.") + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + self._launch_procs(procs, self.tp_size, self.pp_size) + + def _get_fixture_kwargs(self, request, func): + if not request: + return {} + # Grab fixture / parametrize kwargs from pytest request object + fixture_kwargs = {} + params = inspect.getfullargspec(func).args + params.remove("self") + for p in params: + try: + fixture_kwargs[p] = request.getfixturevalue(p) + except FixtureLookupError: + pass # test methods can have kwargs that are not fixtures + return fixture_kwargs + + def _launch_procs(self, num_procs, tp_size, pp_size): + if not is_torch_xla_available() or not is_neuronx_distributed_available(): + raise RuntimeError( + "The `torch_xla` and `neuronx_distributed` packages are required to run a distributed test." + ) + + # Verify we have enough accelerator devices to run this test + num_cores = get_num_neuron_cores() + if 0 < num_cores < num_procs: + pytest.skip( + f"Skipping test because not enough Neuron cores are available: {num_procs} required, {num_cores} " + "available." + ) + + # Set start method to `forkserver` (or `fork`) + mp.set_start_method("forkserver", force=True) + os.environ["TORCHELASTIC_RUN_ID"] = str(uuid.uuid4()) + + # Create process pool or use cached one + master_port = None + if self.reuse_dist_env: + if num_procs not in self._pool_cache: + self._pool_cache[num_procs] = mp.Pool(processes=num_procs) + master_port = get_master_port() + pool = self._pool_cache[num_procs] + else: + pool = mp.Pool(processes=num_procs) + master_port = get_master_port() + + # Run the test + args = [(local_rank, num_procs, master_port, tp_size, pp_size) for local_rank in range(num_procs)] + skip_msgs_async = pool.starmap_async(self._dist_run, args) + + skip_msgs = "" # Otherwise the linter complains. + try: + skip_msgs = skip_msgs_async.get(self.exec_timeout) + except mp.TimeoutError: + # Shortcut to exit pytest in the case of a hanged test. 
This + # usually means an environment error and the rest of tests will + # hang (causing super long unit test runtimes) + pytest.exit("Test hanged, exiting", returncode=0) + except Exception as e: + self._close_pool(pool, num_procs) + self._terminate_xrt_server() + raise e + finally: + # Tear down distributed environment and close process pools + self._close_pool(pool, num_procs) + self._terminate_xrt_server() + + # If we skipped a test, propagate that to this process + if any(skip_msgs): + assert len(set(skip_msgs)) == 1, "Multiple different skip messages received" + pytest.skip(skip_msgs[0]) + + def _dist_run(self, local_rank, num_procs, master_port, tp_size, pp_size): + skip_msg = "" + if not dist.is_initialized(): + """Initializes communication and executes the user function.""" + if self.set_dist_env: + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(master_port) + # Unit tests do not support multi-node so local_rank == global rank + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["RANK"] = str(local_rank) + os.environ["LOCAL_SIZE"] = str(num_procs) + os.environ["WORLD_SIZE"] = str(num_procs) + os.environ["LOCAL_WORLD_SIZE"] = str(num_procs) + # Unit tests do not support multi-node so there is only one group in our case + os.environ["GROUP_RANK"] = "0" + + if self.init_distributed: + dist.init_process_group(backend=self.backend, rank=local_rank, world_size=num_procs) + if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): + raise AssertionError("Failed to initialize torch.distributed process group using XLA backend.") + + # Intializing NxD. + neuronx_distributed.parallel_layers.parallel_state.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=pp_size, + ) + try: + self.run(**self._fixture_kwargs) + except BaseException as e: + if isinstance(e, Skipped): + skip_msg = e.msg + else: + raise e + + return skip_msg + + def _dist_destroy(self): + if (dist is not None) and dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() + + def _close_pool(self, pool, num_procs, force=False): + if force or not self.reuse_dist_env: + try: + _ = pool.starmap(self._dist_destroy, [() for _ in range(num_procs)]) + pool.close() + pool.join() + except ValueError: + pass + + def _terminate_xrt_server(self): + xrt_server_str = "torch_neuronx.distributed._xrt_run_server" + startmethod = mp.get_start_method(allow_none=True) + # Rules: + # - `startmethod is None`: the XRT server tracks pytest's PID. + # - `startmethod="spawn"`: the parent process of the pool's processes is pytest, so the XRT server tracks + # pytest's PID. + # - `startmethod="fork"`: same as `startmethod="spawn"`. + # - `startmethod="forkserver"`: the parent process of the pool's processes is the forkserver, so the XRT server tracks + # the forkserver's PID. + if startmethod == "forkserver": + target_pid = multiprocessing.forkserver._forkserver._forkserver_pid + else: + target_pid = os.getpid() + + for p in psutil.process_iter(): + try: + if "python3" in p.name() and len(p.cmdline()) == 7: + cmdline = p.cmdline() + if cmdline[2] == xrt_server_str and cmdline[-1] == str(target_pid): + p.terminate() + except psutil.ZombieProcess: + continue + + +class DistributedTest(DistributedExec): + """ + Implementation for running pytest with distributed execution. + """ + + is_dist_test = True + + def early_skip(self, fixtures_kwargs): + """ + Override to enable early test skipping (before processes creation). 
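+
+        Subclasses receive the resolved fixture kwargs and may call `pytest.skip` from here;
+        see `TestModelParallelization.early_skip` in tests/distributed/test_model_parallelization.py
+        for a concrete use.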
+        """
+        pass
+
+    # Temporary directory that is shared among test methods in a class
+    @pytest.fixture(autouse=True, scope="class")
+    def class_tmpdir(self, tmpdir_factory):
+        fn = tmpdir_factory.mktemp(self.__class__.__name__)
+        return fn
+
+    def run(self, **fixture_kwargs):
+        self._current_test(**fixture_kwargs)
+
+    def __call__(self, request):
+        self._current_test = self._get_current_test_func(request)
+        self._fixture_kwargs = self._get_fixture_kwargs(request, self._current_test)
+
+        if self.requires_neuron_environment and not is_neuron_environment_available():
+            pytest.skip("Only supported in a Neuron environment.")
+
+        self.early_skip(self._fixture_kwargs)
+
+        world_size = tp_size = pp_size = parallel_sizes = None
+
+        # Catch world_size, tp_size or pp_size override via pytest mark.
+        def try_to_override_via_pytest_mark(mark, name):
+            if mark.name == name:
+                return mark.args[0]
+            return None
+
+        for mark in getattr(request.function, "pytestmark", []):
+            world_size = try_to_override_via_pytest_mark(mark, "world_size")
+            tp_size = try_to_override_via_pytest_mark(mark, "tp_size")
+            pp_size = try_to_override_via_pytest_mark(mark, "pp_size")
+            parallel_sizes = try_to_override_via_pytest_mark(mark, "parallel_size")
+
+        # Catch world_size, tp_size or pp_size override via fixture.
+        def try_to_override_via_fixture(name, current_value):
+            if name in self._fixture_kwargs:
+                if current_value is not None:
+                    raise ValueError(f"It is not possible to override {name} both via pytest.mark and fixtures.")
+                return self._fixture_kwargs[name]
+            return None
+
+        world_size = try_to_override_via_fixture("world_size", world_size)
+        tp_size = try_to_override_via_fixture("tp_size", tp_size)
+        pp_size = try_to_override_via_fixture("pp_size", pp_size)
+        parallel_sizes = try_to_override_via_fixture("parallel_sizes", parallel_sizes)
+
+        if parallel_sizes is not None:
+            if not all(size is None for size in [world_size, tp_size, pp_size]):
+                raise ValueError("Specify either `parallel_sizes` or the individual sizes (world_size, tp_size, pp_size), not both.")
+            world_size, tp_size, pp_size = parallel_sizes
+
+        if world_size is None:
+            world_size = self.world_size
+        if tp_size is None:
+            tp_size = self.tp_size
+        if pp_size is None:
+            pp_size = self.pp_size
+
+        sizes = [world_size, tp_size, pp_size]
+        if all(isinstance(size, int) for size in sizes):
+            world_size = [world_size]
+            tp_size = [tp_size]
+            pp_size = [pp_size]
+        else:
+            lengths = [len(size) for size in sizes if not isinstance(size, int)]
+            if len(set(lengths)) != 1:
+                raise ValueError(
+                    "When providing multiple values for either world_size, tp_size or pp_size, you must provide the "
+                    f"same number of values. Here: {', '.join(map(str, lengths))}."
+ ) + if not all(isinstance(size, (tuple, list)) for size in sizes): + length = lengths[0] + world_size = [world_size] * length if isinstance(world_size, int) else world_size + tp_size = [tp_size] * length if isinstance(tp_size, int) else tp_size + pp_size = [pp_size] * length if isinstance(pp_size, int) else pp_size + + for sizes in zip(world_size, tp_size, pp_size): + self._launch_procs(*sizes) + time.sleep(0.5) + + def _get_current_test_func(self, request): + # DistributedTest subclasses may have multiple test methods + func_name = request.function.__name__ + return getattr(self, func_name) diff --git a/tests/distributed/model_parallel_test_template.txt b/tests/distributed/model_parallel_test_template.txt deleted file mode 100644 index ad6f8e530..000000000 --- a/tests/distributed/model_parallel_test_template.txt +++ /dev/null @@ -1,157 +0,0 @@ -# This is a template file for testing model parallelization. - -import os -from contextlib import nullcontext -from inspect import signature - -import torch -import neuronx_distributed -from neuronx_distributed import parallel_layers -from neuronx_distributed.utils.model_utils import move_model_to_device -import torch_xla.core.xla_model as xm - -from transformers import AutoConfig, AutoTokenizer, {model_class} -from transformers.trainer_utils import set_seed - -import optimum -from optimum.neuron.utils.training_utils import set_neuron_cc_optlevel_for_model -from optimum.neuron.distributed import ParallelizersManager, lazy_load_for_parallelism - -from utils import gather_along_dim, generate_dummy_labels, create_static_seed_patcher - - -if os.environ.get("TORCHELASTIC_RUN_ID"): - import torch_xla.distributed.xla_backend as xbn - - if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): - torch.distributed.init_process_group(backend="xla") - -SEED = 42 - -from_config = os.environ["from_config"] == "true" -lazy_load = os.environ["lazy_load"] == "true" -is_parallel = os.environ["is_parallel"] == "true" -config_overwrite = os.environ.get("config_overwrite", "") -parallelize_embeddings = is_parallel and os.environ["parallelize_embeddings"] == "true" -sequence_parallel_enabled = os.environ["sequence_parallel_enabled"] == "true" -computing_loss_is_supported = os.environ["computing_loss_is_supported"] == "true" - -# This is required to prevent `parallel_cross_entropy` to mutate the logits (which would make them not comparable). 
-if is_parallel and parallelize_embeddings: - optimum.neuron.distributed.parallel_layers._PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT = True - -# Initialize TP -if is_parallel: - neuronx_distributed.parallel_layers.parallel_state.initialize_model_parallel(tensor_model_parallel_size={tp_size}) - - -config = AutoConfig.from_pretrained("{model_name_or_path}") -config_overwrite = config_overwrite.split(",") -for overwrite_info in config_overwrite: - if overwrite_info == "": - continue - attr_name, attr_value = overwrite_info.split("=") - attr_type = type(getattr(config, attr_name)) - setattr(config, attr_name, attr_type(attr_value)) - -if getattr(config, "problem_type", None) is None: - config.problem_type = "single_label_classification" - -if xm.get_ordinal() == 0: - print(config) - -preprocessor = AutoTokenizer.from_pretrained("{model_name_or_path}") - -inputs = preprocessor("This is a test to check that TP is working.", return_tensors="pt") - -if sequence_parallel_enabled: - for name, tensor in inputs.items(): - if tensor.shape[1] % {tp_size} != 0: - tensor = torch.nn.functional.pad( - tensor, pad=(0, tensor.shape[1] % {tp_size}), value=1, - ) - inputs[name] = tensor - -def load_model_with_seed(seed: int, from_config: bool): - set_seed(seed) - if from_config: - model = {model_class}(config) - else: - tp_size = {tp_size} if is_parallel else 1 - ctx = lazy_load_for_parallelism(tensor_parallel_size=tp_size) if lazy_load else nullcontext() - with ctx: - model = {model_class}.from_pretrained("{model_name_or_path}", config=config, ignore_mismatched_sizes=True) - return model - -static_seed_patcher = create_static_seed_patcher({model_class}, SEED) -with static_seed_patcher: - model = load_model_with_seed(SEED, from_config) - - set_neuron_cc_optlevel_for_model(model) - - vocab_size = getattr(model.config, "vocab_size", None) - - if is_parallel: - model = ParallelizersManager.parallelizer_for_model(model).parallelize( - model, - parallelize_embeddings=parallelize_embeddings, - sequence_parallel_enabled=sequence_parallel_enabled, - ) - filename = "parallel.bin" - else: - filename = "original.bin" - -move_model_to_device(model, "xla") -model = model.eval() - -xla_inputs = dict() -sig = signature(model.forward) -for k, v in inputs.items(): - if k not in sig.parameters: - continue - xla_inputs[k] = v.to("xla") - decoder_input_name = "decoder_" + k - if model.config.is_encoder_decoder and decoder_input_name in sig.parameters: - xla_inputs[decoder_input_name] = v.to("xla") - -# We take the shape of the first input to "predict" the shape of the labels. -# Might not work for every tasks. -shape = list(xla_inputs.values())[0].shape - -if computing_loss_is_supported: - xla_inputs.update(generate_dummy_labels(model, shape, vocab_size=vocab_size, device="xla", seed=SEED)) - -model_outputs = model(**xla_inputs, return_dict=True) -xm.mark_step() - -axis_to_gather = dict() -axis_to_gather["default"] = -1 -axis_to_gather["past_key_values"] = 1 - -def gather_output(output, gather_dim): - if isinstance(output, (tuple, list, set)): - output_type = type(output) - gathered_output = [] - for t in output: - gathered_output.append(gather_output(t, gather_dim)) - result = output_type(gathered_output) - else: - result = gather_along_dim(output, gather_dim) - return result - -if is_parallel: - # Because of parallelism (embeddings and sequence parallelism), some outputs need to be gathered. - # Since it is not possible to generically know which one, we save both the "regular" output and the gathered - # version of it. 
We then compare both of them to the original output and fail if both do not match. - gathered_model_outputs = dict() - for name, output in model_outputs.items(): - gathered_model_outputs[name] = output - if name == "loss" or output is None: - gathered_output = output - else: - gathered_output = gather_output(output, axis_to_gather.get(name, axis_to_gather["default"])) - gathered_output_name = "gathered_" + name - gathered_model_outputs[gathered_output_name] = gathered_output - model_outputs = gathered_model_outputs - -xm.save(model_outputs, "{output_path}" + "/" + filename) diff --git a/tests/distributed/test_common.py b/tests/distributed/test_common.py new file mode 100644 index 000000000..4cc99a741 --- /dev/null +++ b/tests/distributed/test_common.py @@ -0,0 +1,415 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""General tests related to distributed training.""" + +from pathlib import Path +from typing import TYPE_CHECKING, Dict + +import pytest +import safetensors +import torch +from transformers import LlamaForCausalLM + +from optimum.neuron.accelerate.optimizer import NeuronAcceleratedOptimizer +from optimum.neuron.accelerate.utils.dataclasses import NeuronDistributedType +from optimum.neuron.distributed.utils import ( + TENSOR_PARALLEL_SHARDS_DIR_NAME, + make_optimizer_constructor_lazy, +) +from optimum.neuron.utils.import_utils import ( + is_neuronx_distributed_available, + is_torch_xla_available, +) +from optimum.neuron.utils.testing_utils import is_trainium_test + +from .distributed import DistributedTest +from .utils import create_accelerator_for_mp, create_static_seed_patcher, get_model, get_model_inputs + + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + +if is_neuronx_distributed_available(): + from neuronx_distributed.parallel_layers.parallel_state import ( + get_data_parallel_rank, + get_pipeline_model_parallel_rank, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + ) + from neuronx_distributed.parallel_layers.utils import move_all_tensor_to_cpu + from neuronx_distributed.pipeline import NxDPPModel + from neuronx_distributed.utils.model_utils import move_model_to_device + +if TYPE_CHECKING: + from transformers import PreTrainedModel + +MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-random" + + +def get_tiny_llama_model( + tp_size: int = 1, + pp_size: int = 1, + lazy_load: bool = False, + from_config: bool = False, + use_static_seed_patcher: bool = False, + add_random_noise: bool = False, +) -> "PreTrainedModel": + return get_model( + LlamaForCausalLM, + MODEL_NAME, + tp_size=tp_size, + pp_size=pp_size, + lazy_load=lazy_load, + from_config=from_config, + use_static_seed_patcher=use_static_seed_patcher, + add_random_noise=add_random_noise, + ) + + +def get_optimizer(model: torch.nn.Module, lazy: bool = False, with_groups: bool = True) -> torch.optim.Optimizer: + adam_cls = torch.optim.AdamW + if lazy: + adam_cls = 
make_optimizer_constructor_lazy(adam_cls) + + if with_groups: + groups = [ + {"params": (p for idx, p in enumerate(model.parameters()) if idx % 2 == 0), "lr": 1e-2}, + {"params": (p for idx, p in enumerate(model.parameters()) if idx % 2 == 1), "lr": 1e-6}, + ] + else: + groups = model.parameters() + + return adam_cls(groups) + + +def move_params_to_cpu(parameters): + parameters = list(parameters) + xm.mark_step() + # `move_all_tensor_to_cpu` only selects `torch.Tensor`, so we need to move the parameters' data. + cpu_params = move_all_tensor_to_cpu([p.data for p in parameters]) + return cpu_params + + +@is_trainium_test +class TestCommonDistributed(DistributedTest): + # TODO: enable dp=4,tp=pp=2 when working on the multi-node training PR. + @pytest.fixture( + scope="class", + params=[[2, 1, 1], [2, 2, 1], [2, 1, 2]], + ids=["dp=2", "tp=2", "pp=2"], + ) + def parallel_sizes(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["no_lazy_load", "lazy_load"]) + def lazy_load(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["from_pretrained", "from_config"]) + def from_config(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["no_lazy_optimizer", "lazy_optimizer"]) + def lazy_optimizer(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["without_groups", "with_groups"]) + def with_groups(self, request): + return request.param + + @pytest.fixture(scope="class", params=[False, True], ids=["no_zero_1", "zero_1"]) + def zero_1(self, request): + return request.param + + @pytest.fixture(scope="class", params=[1, 12], ids=["no_grad_acc", "grad_acc=12"]) + def gradient_accumulation_steps(self, request): + return request.param + + @pytest.fixture(scope="class", params=[None, 0.01], ids=["no_clip_grad_norm", "clip_grad_norm"]) + def max_grad_norm(self, request): + return request.param + + def test_optimizer_parameters_match_model_parameters( + self, lazy_load, lazy_optimizer, with_groups, zero_1, parallel_sizes + ): + num_workers, tp_size, pp_size = parallel_sizes + dp_size = num_workers // (tp_size * pp_size) + if dp_size == 1 and zero_1: + pytest.skip("zero_1 needs to be tested only for dp_size > 1") + + model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, lazy_load=lazy_load) + optimizer = get_optimizer(model, lazy_optimizer, with_groups) + + accelerator = create_accelerator_for_mp(tp_size, pp_size, zero_1=zero_1) + if tp_size > 1 or pp_size > 1: + assert accelerator.state.distributed_type is NeuronDistributedType.MODEL_PARALLELISM + + model = accelerator.prepare(model) + + # Under DDP only setting, the optimizer needs to be created after the model has been moved. 
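+        # (Presumably because preparing the model moves it to the XLA device and can re-create
+        # its parameter tensors, an optimizer built beforehand would otherwise reference
+        # parameters the prepared model no longer owns.)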
+ if tp_size == 1 and pp_size == 1: + optimizer = get_optimizer(model, lazy_optimizer, with_groups) + + optimizer = accelerator.prepare(optimizer) + + assert isinstance(optimizer, NeuronAcceleratedOptimizer) + + if isinstance(model, NxDPPModel): + model_parameters = set(model.local_parameters()) + else: + model_parameters = set(model.parameters()) + optimizer_parameters = {p for group in optimizer.param_groups for p in group["params"]} + + assert model_parameters == optimizer_parameters + + def test_optimizer_step(self, zero_1, gradient_accumulation_steps, max_grad_norm, parallel_sizes): + num_workers, tp_size, pp_size = parallel_sizes + dp_size = num_workers // (tp_size * pp_size) + if dp_size == 1 and zero_1: + pytest.skip("zero_1 needs to be tested only for dp_size > 1") + + # TODO: investigate that with the AWS team to find a solution. + if dp_size > 1 and zero_1 and max_grad_norm is not None: + pytest.skip("Gradient clipping seems to not work properly with ZeRO-1.") + + model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, use_static_seed_patcher=True) + + if tp_size == pp_size == 1: + move_model_to_device(model, xm.xla_device()) + + optimizer = get_optimizer(model, with_groups=False) + + accelerator = create_accelerator_for_mp( + tp_size, pp_size, zero_1=zero_1, gradient_accumulation_steps=gradient_accumulation_steps + ) + + model, optimizer = accelerator.prepare(model, optimizer) + assert isinstance(optimizer, NeuronAcceleratedOptimizer) + + inputs = get_model_inputs(model, MODEL_NAME) + + def move_grads_to_cpu(parameters): + grads = [p.grad for p in parameters] + grads = move_all_tensor_to_cpu(grads) + return grads + + if pp_size == 1: + inputs = {k: v.to(xm.xla_device()) for k, v in inputs.items()} + + current_parameters = move_params_to_cpu( + model.local_parameters() if isinstance(model, NxDPPModel) else model.parameters() + ) + + for step in range(int(1.5 * gradient_accumulation_steps)): + is_optimizer_update_step = (step + 1) % gradient_accumulation_steps == 0 + with accelerator.accumulate(model): + if pp_size > 1: + orig_parameters = current_parameters + loss = model.run_train(**inputs) + xm.mark_step() + + if max_grad_norm is not None: + accelerator.clip_grad_norm_(model.local_parameters(), max_norm=max_grad_norm, norm_type=2) + + # Checking that at least some of the parameters have a gradient. + grads_on_cpu = move_grads_to_cpu(model.local_parameters()) + assert any(torch.all(grad != 0) for grad in grads_on_cpu) + + optimizer.step() + + # Checking only after an actual optimizer step that the norm has been clipped because it happens + # during the optimizer step in some cases. + if is_optimizer_update_step and max_grad_norm is not None: + grads_on_cpu = move_grads_to_cpu(model.local_parameters()) + norms = [torch.linalg.vector_norm(grad, 2) for grad in grads_on_cpu] + total_norm = torch.linalg.vector_norm(torch.stack(norms), 2) + assert total_norm <= max_grad_norm + + optimizer.zero_grad() + + grads_on_cpu = move_grads_to_cpu(model.local_parameters()) + if is_optimizer_update_step: + # At this point, no parameter should have a gradient. 
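+                        # (The wrapped optimizer is expected to zero gradients in place rather
+                        # than set them to None, hence the equality check against zero below.)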
+ assert all(torch.all(grad == 0) for grad in grads_on_cpu) + + current_parameters = move_params_to_cpu(model.local_parameters()) + else: + orig_parameters = current_parameters + outputs = model(**inputs) + loss = outputs["loss"] + xm.mark_step() + loss.backward() + + if max_grad_norm is not None: + accelerator.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm, norm_type=2) + + # Checking that at least some of the parameters have a gradient. + grads_on_cpu = move_grads_to_cpu(model.parameters()) + assert any(torch.all(grad != 0) for grad in grads_on_cpu) + + optimizer.step() + + # Checking only after an actual optimizer step that the norm has been clipped because it happens + # during the optimizer step in some cases. + if is_optimizer_update_step and max_grad_norm is not None: + grads_on_cpu = move_grads_to_cpu(model.parameters()) + norms = [torch.linalg.vector_norm(grad, 2) for grad in grads_on_cpu] + total_norm = torch.linalg.vector_norm(torch.stack(norms), 2) + assert total_norm <= max_grad_norm + + optimizer.zero_grad() + + # At this point, no parameter should have a gradient. + if is_optimizer_update_step: + grads_on_cpu = move_grads_to_cpu(model.parameters()) + assert all(torch.all(grad == 0) for grad in grads_on_cpu) + + current_parameters = move_params_to_cpu(model.parameters()) + + if is_optimizer_update_step: + assert any(torch.any(p1 != p2) for (p1, p2) in zip(orig_parameters, current_parameters)) + else: + assert all(torch.all(p1 == p2) for (p1, p2) in zip(orig_parameters, current_parameters)) + + def test_lazy_load(self, from_config, parallel_sizes): + _, tp_size, pp_size = parallel_sizes + + if from_config and (tp_size > 1 or pp_size > 1): + pytest.skip("It is not easy to compare parameters value in this case because of initialization.") + + model = get_tiny_llama_model( + tp_size=1, pp_size=1, lazy_load=False, from_config=from_config, use_static_seed_patcher=True + ) + + orig_parameters: Dict[str, torch.nn.Parameter] = dict(model.named_parameters()) + + accelerator = create_accelerator_for_mp(tp_size, pp_size) + lazy_model = get_tiny_llama_model( + tp_size=tp_size, pp_size=pp_size, lazy_load=True, from_config=from_config, use_static_seed_patcher=True + ) + static_seed_patcher = create_static_seed_patcher(model.__class__, 42) + with static_seed_patcher: + lazy_model = accelerator.prepare(lazy_model) + + if pp_size > 1: + named_parameters = dict(lazy_model.local_named_parameters()) + else: + named_parameters = dict(lazy_model.named_parameters()) + + xm.mark_step() + + for name, param in named_parameters.items(): + orig = orig_parameters[name] + if orig.shape != param.shape: + if orig.dim() == 1: + gather_dim = 0 + elif orig.dim() == 2: + gather_dim = 1 if orig.shape[0] == param.shape[0] else 0 + else: + raise ValueError(f"The case where the weight as a rank of {orig.dim()} is not supported.") + gathered = [torch.empty(param.shape) for _ in range(tp_size)] + torch.distributed.all_gather(gathered, param, group=get_tensor_model_parallel_group()) + gathered_param = torch.cat(gathered, dim=gather_dim) + else: + gathered_param = param + + orig = orig.to("cpu") + gathered_param = gathered_param.to("cpu") + xm.mark_step() + + print(f"Comparing parameter named {name}") + torch.testing.assert_close(orig, gathered_param) + + def test_save_model_and_load_model(self, parallel_sizes, tmpdir, monkeypatch): + _, tp_size, pp_size = parallel_sizes + dp_rank = get_data_parallel_rank() + tp_rank = get_tensor_model_parallel_rank() + pp_rank = get_pipeline_model_parallel_rank() + + tmpdir 
= Path(tmpdir) + + model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, lazy_load=False, add_random_noise=True) + + accelerator = create_accelerator_for_mp(tp_size, pp_size) + model = accelerator.prepare(model) + accelerator.save_state(tmpdir.as_posix()) + accelerator.state._reset_state(reset_partial_state=True) + del accelerator + + if pp_size > 1: + # We need to disable `NxDPPModel._set_distributed` since it is already done during the creation of the + # first model, otherwise creating new `NxDPPModel`s will fail. + monkeypatch.setattr(NxDPPModel, "_set_distributed", lambda _: _) + + tmpdir_content = [path.name for path in tmpdir.glob("**/*")] + pytorch_checkpoint_exists = "pytorch_model.bin" in tmpdir_content + safetensors_checkpoint_exists = "model.safetensors" in tmpdir_content + + if tp_size > 1 or pp_size > 1: + ref_data_file_name = f"tp_rank_{tp_rank:02d}_pp_rank_{pp_rank:02d}" + tensors_directory = f"{ref_data_file_name}.tensors" + assert not pytorch_checkpoint_exists + assert not safetensors_checkpoint_exists + assert TENSOR_PARALLEL_SHARDS_DIR_NAME in tmpdir_content + assert ref_data_file_name in tmpdir_content + assert tensors_directory in tmpdir_content + else: + assert pytorch_checkpoint_exists or safetensors_checkpoint_exists + + # Making sure that we end-up with a different model when starting over. + new_model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, lazy_load=False, add_random_noise=True) + new_accelerator = create_accelerator_for_mp(tp_size, pp_size) + new_model = new_accelerator.prepare(new_model) + new_accelerator.state._reset_state(reset_partial_state=True) + del new_accelerator + + if pp_size == 1: + model_parameters = move_params_to_cpu(model.parameters()) + new_model_parameters = move_params_to_cpu(new_model.parameters()) + else: + model_parameters = move_params_to_cpu(model.local_parameters()) + new_model_parameters = move_params_to_cpu(new_model.local_parameters()) + + assert any( + torch.all(p1 == 0.0) or torch.all(p1 == 1.0) or torch.all(p1 != p2) + for p1, p2 in zip(model_parameters, new_model_parameters) + ) + + # Checking that when providing a checkpoint, we end-up with the same model as the original. + new_model = get_tiny_llama_model(tp_size=tp_size, pp_size=pp_size, lazy_load=False, add_random_noise=True) + new_accelerator = create_accelerator_for_mp(tp_size, pp_size, checkpoint_dir=tmpdir) + new_model = new_accelerator.prepare(new_model) + + # If there is no model parallelism, the checkpoint weights will not be loaded automatically since we do not + # call parallelize, so we do it manually. 
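+        # For reference, the layouts asserted above are roughly:
+        #   - tp_size == pp_size == 1: a single `pytorch_model.bin` or `model.safetensors` file;
+        #   - model parallelism: a `TENSOR_PARALLEL_SHARDS_DIR_NAME` directory containing a
+        #     `tp_rank_XX_pp_rank_XX` file and matching `.tensors` directory per model-parallel rank.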
+ if tp_size == pp_size == 1: + if pytorch_checkpoint_exists: + filename = "pytorch_model.bin" + checkpoint_path = tmpdir / filename + new_model.load_state_dict(torch.load(checkpoint_path)) + else: + filename = "model.safetensors" + checkpoint_path = tmpdir / filename + new_model.load_state_dict(safetensors.torch.load_file(checkpoint_path)) + + if pp_size == 1: + model_parameters = move_params_to_cpu(model.parameters()) + new_model_parameters = move_params_to_cpu(new_model.parameters()) + else: + model_parameters = move_params_to_cpu(model.local_parameters()) + new_model_parameters = move_params_to_cpu(new_model.local_parameters()) + + if dp_rank == 0: + assert all(torch.all(p1 == p2) for p1, p2 in zip(model_parameters, new_model_parameters)) diff --git a/tests/distributed/test_model_parallelization.py b/tests/distributed/test_model_parallelization.py index 2d64e4b28..a7097dc4c 100644 --- a/tests/distributed/test_model_parallelization.py +++ b/tests/distributed/test_model_parallelization.py @@ -14,49 +14,65 @@ # limitations under the License. """Tests validating that models can be parallelized correctly.""" -import os -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union -from unittest import TestCase +from typing import TYPE_CHECKING, List, Optional, Type, Union import pytest import torch -from parameterized import parameterized +import torch.utils._pytree as pytree +from transformers import LlamaForCausalLM +from transformers.models.auto.configuration_auto import CONFIG_MAPPING from transformers.models.auto.modeling_auto import ( - MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_BACKBONE_MAPPING_NAMES, - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_CTC_MAPPING_NAMES, - MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, - MODEL_FOR_MASKED_LM_MAPPING_NAMES, - MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES, - MODEL_FOR_PRETRAINING_MAPPING_NAMES, - MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, + MODEL_FOR_BACKBONE_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_CTC_MAPPING, + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, + MODEL_FOR_MASKED_LM_MAPPING, + MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + MODEL_FOR_PRETRAINING_MAPPING, + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING, ) +import optimum +from optimum.neuron.accelerate.accelerator import NeuronAccelerator +from optimum.neuron.distributed.parallelizers_manager import ParallelizersManager from optimum.neuron.utils.cache_utils import ( get_num_neuron_cores, - set_neuron_cache_path, ) -from optimum.neuron.utils.import_utils import is_neuronx_available -from optimum.neuron.utils.runner import run_command_with_realtime_output +from 
optimum.neuron.utils.import_utils import ( + is_neuronx_available, + is_neuronx_distributed_available, + is_torch_xla_available, +) +from optimum.neuron.utils.testing_utils import is_trainium_test +from optimum.neuron.utils.training_utils import set_neuron_cc_optlevel_for_model + +from .distributed import DistributedTest +from .utils import SEED, create_accelerator_for_mp, get_model, get_model_inputs -from ..test_utils import is_trainium_test -from ..utils import TrainiumTestMixin +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + +if is_neuronx_distributed_available(): + from neuronx_distributed.parallel_layers.parallel_state import ( + get_pipeline_model_parallel_rank, + get_tensor_model_parallel_group, + get_tensor_model_parallel_size, + ) + from neuronx_distributed.parallel_layers.utils import move_all_tensor_to_cpu + from neuronx_distributed.utils.model_utils import move_model_to_device if TYPE_CHECKING: - from transformers import PretrainedConfig + from transformers import PreTrainedModel TEMPLATE_FILE_NAME = "model_parallel_test_template.txt" @@ -71,46 +87,47 @@ ] -def _generate_supported_model_class_names( - model_name: Type["PretrainedConfig"], +def _generate_supported_model_classes( + model_type: str, supported_tasks: Optional[Union[str, List[str]]] = None, -) -> List[str]: +) -> List[Type["PreTrainedModel"]]: task_mapping = { # TODO: enable that when base models are supported. - # "default": MODEL_MAPPING_NAMES, - "pretraining": MODEL_FOR_PRETRAINING_MAPPING_NAMES, - "next-sentence-prediction": MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES, - "masked-lm": MODEL_FOR_MASKED_LM_MAPPING_NAMES, - "causal-lm": MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - "seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - "speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, + # "default": MODEL_MAPPING, + "pretraining": MODEL_FOR_PRETRAINING_MAPPING, + "next-sentence-prediction": MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + "masked-lm": MODEL_FOR_MASKED_LM_MAPPING, + "causal-lm": MODEL_FOR_CAUSAL_LM_MAPPING, + "seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + "speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, # Those architectures are more painful to deal with because the input is different. 
- # "multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, - "document-question-answering": MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, - "question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, - "sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, - "masked-image-modeling": MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, - "image-classification": MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - "zero-shot-image-classification": MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES, - "ctc": MODEL_FOR_CTC_MAPPING_NAMES, - "audio-classification": MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, - "semantic-segmentation": MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, - "backbone": MODEL_FOR_BACKBONE_MAPPING_NAMES, + # "multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + "document-question-answering": MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + "question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING, + "sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + "masked-image-modeling": MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, + "image-classification": MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + "zero-shot-image-classification": MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING, + "ctc": MODEL_FOR_CTC_MAPPING, + "audio-classification": MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, + "semantic-segmentation": MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, + "backbone": MODEL_FOR_BACKBONE_MAPPING, } if supported_tasks is None: - supported_tasks = task_mapping.keys() + supported_tasks = list(task_mapping.keys()) if isinstance(supported_tasks, str): supported_tasks = [supported_tasks] - model_class_names = [] + model_classes = [] for task in supported_tasks: - class_name = task_mapping[task].get(model_name, None) - if class_name is not None and class_name not in CLASSES_TO_IGNORE: - model_class_names.append(class_name) + config_class = CONFIG_MAPPING[model_type] + model_class = task_mapping[task].get(config_class, None) + if model_class is not None and model_class not in CLASSES_TO_IGNORE: + model_classes.append(model_class) - return list(set(model_class_names)) + return list(set(model_classes)) MODEL_TYPES_TO_TEST = [ @@ -125,10 +142,13 @@ def _generate_supported_model_class_names( ), ( "gpt_neox", - "hf-tiny-model-private/tiny-random-GPTNeoXModel", - {"num_hidden_layers": "2", "intermediate_size": "36"}, + "michaelbenayoun/gpt-neox-tiny-4layers-random", + {"num_hidden_layers": "2"}, + ), + ( + "llama", + "michaelbenayoun/llama-2-tiny-16layers-random", ), - ("llama", "yujiepan/llama-2-tiny-3layers-random", {"num_hidden_layers": "2"}), ( "t5", "hf-internal-testing/tiny-random-T5Model", @@ -141,390 +161,305 @@ def _generate_supported_model_class_names( for entry in MODEL_TYPES_TO_TEST: if len(entry) == 2: model_type, model_name_or_path = entry - config_overwrite = {} + config_overwrite = None else: model_type, model_name_or_path, config_overwrite = entry - for model_class_name in _generate_supported_model_class_names(model_type): - MODELS_TO_TEST.append((model_class_name, model_name_or_path, config_overwrite)) + for model_class in _generate_supported_model_classes(model_type): + entry = (model_type, model_class, model_name_or_path, config_overwrite) + if entry not in MODELS_TO_TEST: + MODELS_TO_TEST.append(entry) + + +MODEL_CLASSES_TO_IGNORE = [ + "BertForPreTraining", # There is a compilation issue, and testing 
TP for BertForPretraining is not really important. + # TODO + # GPTNeo's attention mechanism is broken in transformers==4.36.2, this should be re-enabled once there is a release + # containing this PR: https://github.com/huggingface/transformers/pull/28533 + "GPTNeoForSequenceClassification", + "GPTNeoForTokenClassification", + "GPTNeoForQuestionAnswering", + "GPTNeoForCausalLM", +] -# When doing from pretrained + lazy loading, it is not always easy to initiliazed the remaining weights in a similar -# fashion than in the regular model. So we do not check for them under this specific setting. It does not mean that -# parallelization does not work for them, only that some weights cannot be initialized exactly the same way. -MODEL_CLASSES_TO_IGNORE_ON_LAZY_LOAD_FOR_FROM_PRETRAINED = [ - "T5ForQuestionAnswering", -] +LLAMA_GQA_VARIANTS_TO_TEST = { + "MHA-setup": ( + 8, + 2, + 1, + { + "num_hidden_layers": "2", + "num_attention_heads": "8", + "num_key_value_heads": "8", + }, + ), + "num_key_value_heads > tp_size": ( + 8, + 2, + 1, + { + "num_hidden_layers": "2", + "num_attention_heads": "8", + "num_key_value_heads": "4", + }, + ), + "num_key_value_heads = tp_size": ( + 8, + 8, + 1, + { + "num_hidden_layers": "2", + "hidden_size": "32", + "num_attention_heads": "16", + "num_key_value_heads": "8", + }, + ), + "num_key_value_heads < tp_size": ( + 8, + 8, + 1, + { + "num_hidden_layers": "2", + "hidden_size": "32", + "num_attention_heads": "16", + "num_key_value_heads": "2", + }, + ), + "MQA-setup": ( + 8, + 8, + 1, + { + "num_hidden_layers": "2", + "hidden_size": "32", + "num_attention_heads": "16", + "num_key_value_heads": "1", + }, + ), +} +# LLAMA_V2_MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-32kv-heads-random" +LLAMA_V2_MODEL_NAME = "anushehchaudry/llama-2-tiny-random" +# LLAMA_V2_MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-random" +# LLAMA_V2_MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-32kv-heads-random" @is_trainium_test -class ModelParallelizationTestCase(TrainiumTestMixin, TestCase): +class TestModelParallelization(DistributedTest): OUTPUTS_TO_IGNORE = { # It might not match in the sequence parallel setting because of mistmatched shapes. # Since these outputs are not needed during training, we do not want to perform an expensive gather for them. 
"encoder_last_hidden_state", } - def _check_output(self, name: str, original_output, output, lazy_load: bool): + @pytest.fixture(scope="class", params=[[2, 2, 1], [2, 1, 2], [16, 2, 2]], ids=["tp=2", "pp=2", "dp=4,tp=pp=2"]) + def parallel_sizes(self, request): + return request.param + + @pytest.fixture(scope="class", params=MODELS_TO_TEST, ids=[specs[1].__name__ for specs in MODELS_TO_TEST]) + def model_specs(self, request): + return request.param + + def early_skip(self, fixtures_kwargs): + pp_size = fixtures_kwargs.get("pp_size", None) + parallel_sizes = fixtures_kwargs.get("parallel_sizes", None) + if pp_size is None and parallel_sizes is not None: + pp_size = parallel_sizes[-1] + model_specs = fixtures_kwargs.get("model_specs", None) + + if pp_size > 1 and model_specs is not None: + model_type = model_specs[0] + manager = ParallelizersManager.parallelizer_for_model(model_type) + if not manager.supports_pipeline_parallelism(): + pytest.skip(f"Pipeline parallelism is not supported for {model_class.__name__}.") + + return super().early_skip(fixtures_kwargs) + + def _check_output(self, name: str, original_output, output): assert type(original_output) is type(output) if isinstance(original_output, (tuple, list, set)): for idx, orig_output in enumerate(original_output): new_name = f"{name}.{idx}" - self._check_output(new_name, orig_output, output[idx], lazy_load) + self._check_output(new_name, orig_output, output[idx]) elif isinstance(original_output, dict): for output_name in original_output: new_name = f"{name}.{output_name}" - self._check_output(new_name, original_output[name], output[name], lazy_load) + self._check_output(new_name, original_output[name], output[name]) elif isinstance(original_output, torch.Tensor): - print(f"Original {name}:\nShape: {original_output.shape}\nValue: {original_output}") - print(f"Parallel {name}:\nShape: {output.shape}\nValue: {output}") + xm.master_print(f"Comparing output named {name}") + tp_size = get_tensor_model_parallel_size() + if original_output.shape != output.shape: + gather_dim = min( + idx for idx in range(original_output.dim()) if original_output.shape[idx] != output.shape[idx] + ) + output = output.to(xm.xla_device()) + gathered = [torch.empty_like(output) for _ in range(tp_size)] + torch.distributed.all_gather(gathered, output, group=get_tensor_model_parallel_group()) + gathered_output = torch.cat(gathered, dim=gather_dim) + xm.mark_step() + output = gathered_output.to("cpu") torch.testing.assert_close(original_output, output) else: assert original_output == output, f"Output named {name} do not match." - def _test_model_parallel( + def _parallel_model_matches_original_model( self, - tp_size: int, - model_class_name: str, - model_name_or_path: str, - from_config: bool, - with_lazy_load: bool, - parallelize_embeddings: bool, - sequence_parallel_enabled: bool, - num_neuron_cores: int = NUM_NEURON_CORES_AVAILABLE, - run_test_in_parallel: bool = False, - overwrite_model_config: Optional[Dict[str, str]] = None, + model_class, + model_name_or_path, + config_overwrite, + parallel_sizes, + from_pretrained, + lazy_load, + sequence_parallel_enabled, + parallelize_embeddings, ): - if "GPTNeoX" in model_class_name: - self.skipTest("GPTNeoX test is flaky, needs to be fixed.") - - if num_neuron_cores < tp_size: - raise ValueError( - "The number of Neuron cores available is lower than the TP size, failing since the test might not be " - "testing what is expected." 
- ) - - if run_test_in_parallel and (NUM_NEURON_CORES_AVAILABLE // num_neuron_cores) < 2: - raise ValueError( - "The test cannot be run in parallel because there is not enough Neuron cores available to preserve the " - f"number of Neuron cores requested ({NUM_NEURON_CORES_AVAILABLE} cores available and {num_neuron_cores} " - "were requested)" - ) - - template_content = None - current_directory = Path(__file__).parent.resolve() - template_file_path = current_directory / TEMPLATE_FILE_NAME - with open(template_file_path, "r") as fp: - template_content = fp.read() - - specialization_env = { - "from_config": "true" if from_config else "false", - "lazy_load": "true" if with_lazy_load else "false", - "parallelize_embeddings": "true" if parallelize_embeddings else "false", - "sequence_parallel_enabled": "true" if sequence_parallel_enabled else "false", - "computing_loss_is_supported": "true", - **os.environ, - } - - # Updating the Python path to be able to use `tests/distributed/utils.py`. - python_path = specialization_env.get("PYTHONPATH", "") - python_path = f"{current_directory}:{python_path}" - specialization_env["PYTHONPATH"] = python_path - - if overwrite_model_config is not None: - specialization_env["config_overwrite"] = ",".join( - f"{key}={value}" for key, value in overwrite_model_config.items() - ) - - with TemporaryDirectory() as tmpdirname: - specialization_data = { - "model_class": model_class_name, - "model_name_or_path": model_name_or_path, - "parallelize_embeddings": "True" if parallelize_embeddings else "False", - "tp_size": tp_size, - "output_path": tmpdirname, - } - specialized_content = template_content.format(**specialization_data) - with open(f"{tmpdirname}/code.py", "w") as fp: - fp.write(specialized_content) - - cmd = ["torchrun", f"--nproc_per_node={num_neuron_cores}", f"{tmpdirname}/code.py"] - - # When running the test in parallel, we need 2 rendez-vous endpoints: one for the script running the - # original model and one for the script running the parallel model. - rdzv_endpoint_host = "localhost" - rdzv_endpoint_port = 29400 - - orig_neuron_cc_flags = os.environ.get("NEURON_CC_FLAGS", "") - set_neuron_cache_path(tmpdirname) - neuron_cc_flags = os.environ["NEURON_CC_FLAGS"] - os.environ["NEURON_CC_FLAGS"] = orig_neuron_cc_flags - - # Original model. - env = {"is_parallel": "false", **specialization_env, "NEURON_CC_FLAGS": neuron_cc_flags} - if run_test_in_parallel: - # Setting the rendez-vous endpoint for the original model process. - cmd.insert(1, f"--rdzv_endpoint={rdzv_endpoint_host}:{rdzv_endpoint_port}") - env["NEURON_RT_VISIBLE_CORES"] = f"0-{num_neuron_cores - 1}" - - # When running tests in parallel, synchronization is done after both processes started. - if not run_test_in_parallel: - p_original_returncode, stdout = run_command_with_realtime_output(cmd, env=env) - else: - p_original = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) + if model_class.__name__ in MODEL_CLASSES_TO_IGNORE: + pytest.skip(f"Skipping test for {model_class.__name__} since it is buggy or a special case.") + + world_size, tp_size, pp_size = parallel_sizes + dp_size = world_size // (tp_size * pp_size) + pp_rank = get_pipeline_model_parallel_rank() + + orig_model = get_model( + model_class, + model_name_or_path, + from_config=not from_pretrained, + config_overwrite=config_overwrite, + use_static_seed_patcher=True, + ) + orig_model = NeuronAccelerator.patch_model_for_neuron(orig_model) - # Parallel model. 
- env = {"is_parallel": "true", **specialization_env, "NEURON_CC_FLAGS": neuron_cc_flags} - if run_test_in_parallel: - # Updating the rendez-vous endpoint for the parallel model process. - cmd[1] = f"--rdzv_endpoint={rdzv_endpoint_host}:{rdzv_endpoint_port + 1}" - env["NEURON_RT_VISIBLE_CORES"] = f"{num_neuron_cores}-{2 * num_neuron_cores - 1}" + set_neuron_cc_optlevel_for_model(orig_model) - p_parallel = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) + move_model_to_device(orig_model, xm.xla_device()) + orig_model = orig_model.eval() - stdout, _ = p_original.communicate() - p_original_returncode = p_original.returncode - stdout = stdout.decode("utf-8") - full_output = f"Original model standard output:\n{stdout}" - print(full_output) + manager = ParallelizersManager.parallelizer_for_model(orig_model) - stdout, _ = p_parallel.communicate() - p_parallel_returncode = p_parallel.returncode - stdout = stdout.decode("utf-8") - full_output = f"Parallel model standard output:\n{stdout}" - print(full_output) + if pp_size > 1 and not manager.supports_pipeline_parallelism(): + pytest.skip(f"Pipeline parallelism is not supported for {model_class.__name__}.") - else: - p_parallel_returncode, stdout = run_command_with_realtime_output(cmd, env=env) - - assert p_original_returncode == 0 - assert p_parallel_returncode == 0 - - temporary_dir = Path(tmpdirname) - original_model_outputs = torch.load(temporary_dir / "original.bin") - parallel_model_outputs = torch.load(temporary_dir / "parallel.bin") - - if ( - not from_config - and with_lazy_load - and model_class_name in MODEL_CLASSES_TO_IGNORE_ON_LAZY_LOAD_FOR_FROM_PRETRAINED - ): - self.skipTest( - f"Cannot compare outputs for {model_class_name} when doing from_pretrained + lazy loading." - ) + if sequence_parallel_enabled and not manager.supports_sequence_parallelism(): + pytest.skip(f"Sequence parallelism is not supported for {model_class.__name__}.") - for name, t in original_model_outputs.items(): - if name in self.OUTPUTS_TO_IGNORE: - continue - print(f"Testing that {name} match.") - regular_parallel_outputs_error_msg = None - gathered_parallel_outputs_error_msg = None - try: - self._check_output(name, t, parallel_model_outputs[name], with_lazy_load) - except AssertionError as e: - regular_parallel_outputs_error_msg = str(e) - if regular_parallel_outputs_error_msg is not None: - print("Regular output did not match, testing with the gathered output...") - try: - self._check_output(name, t, parallel_model_outputs[f"gathered_{name}"], with_lazy_load) - except AssertionError as e: - gathered_parallel_outputs_error_msg = str(e) - if regular_parallel_outputs_error_msg is not None and gathered_parallel_outputs_error_msg is not None: - msg = ( - "Output did not matched.\nTest with non-gathered parallel outputs error:\n" - f"{regular_parallel_outputs_error_msg}\nTest with gathered parallel outputs error:\n" - f"{gathered_parallel_outputs_error_msg}" - ) - raise AssertionError(msg) - print("Ok!") - - @parameterized.expand(MODELS_TO_TEST) - def test_model_parallel_from_config_no_lazy_load( - self, model_class_name: str, model_name_or_path: str, config_overwrite: Dict[str, str] - ): - # In this test, we: - # 1. Test parallelism when initializing from a config. - # 2. Do not enable embedding parallelization => while behaviour could differ between a model initialized - # lazily or not, the risk is minimal. This feature is tested on the next test with lazy loading. - # 3. 
Do not enable sequence parallelism => this feature should not depend on whether the model is initialized - # lazily or not. - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name=model_class_name, - model_name_or_path=model_name_or_path, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config=config_overwrite, + pad_to_multiple_of = None if not sequence_parallel_enabled else tp_size + inputs = get_model_inputs( + orig_model, model_name_or_path, batch_size=dp_size, pad_to_multiple_of=pad_to_multiple_of ) - @parameterized.expand(MODELS_TO_TEST) - def test_model_parallel_from_config_lazy_load( - self, model_class_name: str, model_name_or_path: str, config_overwrite: Dict[str, str] - ): - # In this test, we: - # 1. Test parallelism when initializing lazily from a config. - # 2. Enable embedding parallelization. - # 3. Enable sequence parallelism. - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name=model_class_name, - model_name_or_path=model_name_or_path, - from_config=True, - with_lazy_load=True, - parallelize_embeddings=True, - sequence_parallel_enabled=True, - overwrite_model_config=config_overwrite, + xla_inputs = {k: v.to(xm.xla_device()) for k, v in inputs.items()} + xm.mark_step() + + with torch.no_grad(): + orig_model_outputs = orig_model(**xla_inputs) + + xm.mark_step() + + # The parallel model needs to be defined after the forward pass of the first model because there is a + # global monkey patching of the `torch.nn.CrossEntropyLoss` class when doing sequence parallelism. + model = get_model( + model_class, + model_name_or_path, + tp_size=tp_size, + pp_size=pp_size, + lazy_load=lazy_load, + from_config=not from_pretrained, + config_overwrite=config_overwrite, + use_static_seed_patcher=True, + ) + + accelerator = create_accelerator_for_mp( + tp_size, + pp_size, + parallelize_embeddings=parallelize_embeddings, + sequence_parallel_enabled=sequence_parallel_enabled, ) - @parameterized.expand(MODELS_TO_TEST) - def test_model_parallel_from_pretrained_no_lazy_load( - self, model_class_name: str, model_name_or_path: str, config_overwrite: Dict[str, str] + from .utils import create_static_seed_patcher + + static_seed_patcher = create_static_seed_patcher(model.__class__, SEED) + with static_seed_patcher: + model = accelerator.prepare(model) + + xm.mark_step() + + model = accelerator.patch_model_for_neuron(model) + with torch.no_grad(): + if pp_size == 1: + model = model.eval() + model_outputs = model(**xla_inputs) + else: + loss = model.run_eval(**inputs) + model_outputs = {"loss": loss} + + xm.mark_step() + + outputs_to_consider = [ + output_name for output_name in orig_model_outputs if output_name not in self.OUTPUTS_TO_IGNORE + ] + + if pp_size > 1: + outputs_to_consider = ["loss"] + + outputs_to_check = [ + (orig_model_outputs[output_name], model_outputs[output_name]) for output_name in outputs_to_consider + ] + outputs_to_check = pytree.tree_map(move_all_tensor_to_cpu, outputs_to_check) + + for output_name, outputs in zip(outputs_to_consider, outputs_to_check): + if all(output is None for output in outputs): + continue + if pp_size == 1 or pp_rank == pp_size - 1: + self._check_output(output_name, outputs[0], outputs[1]) + + def test_parallel_model_matches_original_model_from_pretrained_with_parallel_embeddings_and_sequence_parallel( + self, + model_specs, + parallel_sizes, + monkeypatch, ): - # 
In this test, we: - # 1. Test parallelism when initializing from pretrained weights. - # 2. Do not enable embedding parallelization => while behaviour could differ between a model initialized - # lazily or not, the risk is minimal. This feature is tested on the next test with lazy loading. - # 3. Do not enable sequence parallelism => this feature should not depend on whether the model is initialized - # lazily or not. - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name=model_class_name, - model_name_or_path=model_name_or_path, - from_config=False, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config=config_overwrite, + _, model_class, model_name_or_path, config_overwrite = model_specs + monkeypatch.setattr( + optimum.neuron.distributed.parallel_layers, "_PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT", True + ) + return self._parallel_model_matches_original_model( + model_class, model_name_or_path, config_overwrite, parallel_sizes, True, True, True, True ) - @parameterized.expand(MODELS_TO_TEST) - def test_model_parallel_from_pretrained_lazy_load( - self, model_class_name: str, model_name_or_path: str, config_overwrite: Dict[str, str] + @pytest.mark.skip("Model parallelism from config is not fully supported yet.") + def test_parallel_model_matches_original_model_from_config( + self, + model_specs, + parallel_sizes, + monkeypatch, ): - # In this test, we: - # 1. Test parallelism when initializing lazily from pretrained weights. - # 2. Enable embedding parallelization. - # 3. Enable sequence parallelism. - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name=model_class_name, - model_name_or_path=model_name_or_path, - from_config=False, - with_lazy_load=True, - parallelize_embeddings=True, - sequence_parallel_enabled=True, - overwrite_model_config=config_overwrite, + _, model_class, model_name_or_path, config_overwrite = model_specs + monkeypatch.setattr( + optimum.neuron.distributed.parallel_layers, "_PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT", True + ) + return self._parallel_model_matches_original_model( + model_class, model_name_or_path, config_overwrite, parallel_sizes, False, True, False, False ) @pytest.mark.skipif( NUM_NEURON_CORES_AVAILABLE < 32, reason=f"This test requires 32 Neuron cores, but only {NUM_NEURON_CORES_AVAILABLE} are available", ) - def test_llama_v2_gqa_variants(self): - llama_v2_model_name = "anushehchaudry/llama-2-tiny-random" - # MHA setup - # TP size = 2, num_attention_heads = 8, num_key_value_heads = 8 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "num_attention_heads": "8", - "num_key_value_heads": "8", - }, - ) - - # GQA setup with num_key_value_heads > tp_size. 
- # TP size = 2, num_attention_heads = 8, num_key_value_heads = 4 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=2, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "num_attention_heads": "8", - "num_key_value_heads": "4", - }, - ) - - # GQA setup with num_key_value_heads = tp_size. - # TP size = 8, num_attention_heads = 16, num_key_value_heads = 8 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=8, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "hidden_size": "32", - "num_attention_heads": "16", - "num_key_value_heads": "8", - }, - ) - - # GQA setup with num_key_value_heads < tp_size. - # TP size = 8, num_attention_heads = 16, num_key_value_heads = 2 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=8, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "hidden_size": "32", - "num_attention_heads": "16", - "num_key_value_heads": "2", - }, + @pytest.mark.parametrize( + "world_size,tp_size,pp_size,config_overwrite", + LLAMA_GQA_VARIANTS_TO_TEST.values(), + ids=LLAMA_GQA_VARIANTS_TO_TEST.keys(), + ) + def test_llama_v2_gqa_variants(self, world_size, tp_size, pp_size, config_overwrite, monkeypatch): + monkeypatch.setattr( + optimum.neuron.distributed.parallel_layers, "_PARALLEL_CROSS_ENTROPY_SHOULD_PRESERVE_INPUT", True ) - - # MQA setup - # TP size = 8, num_attention_heads = 16, num_key_value_heads = 1 - self._test_model_parallel( - num_neuron_cores=8, - tp_size=8, - run_test_in_parallel=True, - model_class_name="LlamaForCausalLM", - model_name_or_path=llama_v2_model_name, - from_config=True, - with_lazy_load=False, - parallelize_embeddings=False, - sequence_parallel_enabled=False, - overwrite_model_config={ - "num_hidden_layers": "2", - "hidden_size": "32", - "num_attention_heads": "16", - "num_key_value_heads": "1", - }, + return self._parallel_model_matches_original_model( + LlamaForCausalLM, + LLAMA_V2_MODEL_NAME, + config_overwrite, + (world_size, tp_size, pp_size), + False, + False, + False, + False, ) diff --git a/tests/distributed/test_training.py b/tests/distributed/test_training.py index f0bfc7351..9067495c3 100644 --- a/tests/distributed/test_training.py +++ b/tests/distributed/test_training.py @@ -14,118 +14,154 @@ # limitations under the License. 
"""Tests related to training with `neuronx_distributed`.""" -import os +import json from pathlib import Path -from tempfile import TemporaryDirectory -from unittest import TestCase -from huggingface_hub import HfFolder +import pytest +from datasets import load_dataset +from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer -from optimum.neuron.utils.cache_utils import ( - delete_custom_cache_repo_name_from_hf_home, - load_custom_cache_repo_name_from_hf_home, - set_custom_cache_repo_name_in_hf_home, -) -from optimum.neuron.utils.runner import ExampleRunner +from optimum.neuron.training_args import NeuronTrainingArguments from optimum.neuron.utils.testing_utils import is_trainium_test +from .distributed import DistributedTest -_TINY_BERT_MODEL_NAME = "hf-internal-testing/tiny-random-bert" + +MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-random" @is_trainium_test -class DistributedTrainingTestCase(TestCase): +class TestDistributedTraining(DistributedTest): CACHE_REPO_NAME = "optimum-internal-testing/optimum-neuron-cache-for-testing" - @classmethod - def setUpClass(cls): - orig_token = HfFolder.get_token() - orig_cache_repo = load_custom_cache_repo_name_from_hf_home() - ci_token = os.environ.get("HF_TOKEN_OPTIMUM_NEURON_CI", None) - if ci_token is not None: - HfFolder.save_token(ci_token) - set_custom_cache_repo_name_in_hf_home(cls.CACHE_REPO_NAME) - cls._token = orig_token - cls._cache_repo = orig_cache_repo - cls._env = dict(os.environ) - - @classmethod - def tearDownClass(cls): - os.environ = cls._env - if cls._token is not None: - HfFolder.save_token(cls._token) - if cls._cache_repo is not None: - set_custom_cache_repo_name_in_hf_home(cls._cache_repo) - else: - delete_custom_cache_repo_name_from_hf_home() - - def test_tp_save_and_resume_from_checkpoint(self): - num_cores = 8 - precision = "bf16" - tensor_parallel_size = 2 + @pytest.fixture( + scope="class", + params=[[2, 1, 1], [2, 2, 1], [2, 1, 2]], + ids=["dp=2", "tp=2", "pp=2"], + ) + def parallel_sizes(self, request): + return request.param + + def test_save_and_resume_from_checkpoint(self, parallel_sizes, tmpdir): + from optimum.neuron.trainers import NeuronTrainer + + tmpdir = Path(tmpdir) + _, tp_size, pp_size = parallel_sizes train_batch_size = 2 eval_batch_size = 2 - sequence_length = 16 max_steps = 10 - save_steps = 2 do_eval = True + max_train_samples = 100 max_eval_samples = 16 - with TemporaryDirectory() as tmpdirname: - output_dir = Path(tmpdirname) + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + tokenizer.pad_token = tokenizer.eos_token - runner = ExampleRunner(_TINY_BERT_MODEL_NAME, "text-classification") - - first_output_dir = output_dir / "first_run" - returncode, _ = runner.run( - num_cores, - precision, - train_batch_size, - eval_batch_size=eval_batch_size, - sequence_length=sequence_length, - tensor_parallel_size=tensor_parallel_size, + def create_training_args(output_dir, resume_from_checkpoint=None, max_steps=max_steps): + if isinstance(output_dir, Path): + output_dir = output_dir.as_posix() + if isinstance(resume_from_checkpoint, Path): + resume_from_checkpoint = resume_from_checkpoint.as_posix() + args = NeuronTrainingArguments( + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + bf16=True, + per_device_train_batch_size=train_batch_size, + per_device_eval_batch_size=eval_batch_size, max_steps=max_steps, - save_steps=save_steps, + logging_steps=1, + save_steps=2, do_eval=do_eval, - max_eval_samples=max_eval_samples, - output_dir=first_output_dir, - 
print_outputs=True, + output_dir=output_dir, + resume_from_checkpoint=resume_from_checkpoint, + skip_cache_push=True, ) - assert returncode == 0, "First run failed." - - # Case 1: Resuming from checkpoint by specifying a checkpoint directory. - second_output_dir = output_dir / "second_run" - returncode, _ = runner.run( - num_cores, - precision, - train_batch_size, - eval_batch_size=eval_batch_size, - sequence_length=sequence_length, - tensor_parallel_size=tensor_parallel_size, - max_steps=max_steps, - save_steps=save_steps, - do_eval=do_eval, - max_eval_samples=max_eval_samples, - output_dir=second_output_dir, - resume_from_checkpoint=first_output_dir / "checkpoint-4", - print_outputs=True, + return args + + def create_model(): + config = AutoConfig.from_pretrained(MODEL_NAME) + config.num_hidden_layers = 2 * max(1, pp_size) + config.num_attention_heads = 2 + config.num_key_value_heads = 2 + config.problem_type = "single_label_classification" + # config.use_cache = False + model = AutoModelForSequenceClassification.from_pretrained( + MODEL_NAME, config=config, ignore_mismatched_sizes=True ) - assert returncode == 0, "Second run failed." - - # Case 2: Resuming from checkpoint by specifying a boolean, in this case it should look inside the output - # directory. - returncode, _ = runner.run( - num_cores, - precision, - train_batch_size, - eval_batch_size=eval_batch_size, - sequence_length=sequence_length, - tensor_parallel_size=tensor_parallel_size, - max_steps=max_steps + 10, # So that it makes more steps since we are restauring from the third run. - save_steps=save_steps, - do_eval=do_eval, - max_eval_samples=max_eval_samples, - output_dir=second_output_dir, - print_outputs=True, + return model + + # First run setting. + first_output_dir = tmpdir / "first_run" + args = create_training_args(first_output_dir) + model = create_model() + + # Dataset preprocessing + raw_datasets = load_dataset("glue", "sst2") + sentence1_key = "sentence" + sentence2_key = None + label_to_id = None + max_seq_length = 32 + padding = "max_length" + + def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) + if sentence2_key is None + else (examples[sentence1_key], examples[sentence2_key]) ) - assert returncode == 0, "Third run failed." + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + + # Map labels to IDs (not necessary for GLUE tasks) + if label_to_id is not None and "label" in examples: + result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] + return result + + with args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map(preprocess_function, batched=True) + train_dataset = raw_datasets["train"] + train_dataset = train_dataset.select(range(max_train_samples)) + eval_dataset = raw_datasets["validation"] + eval_dataset = eval_dataset.select(range(max_eval_samples)) + + trainer = NeuronTrainer( + model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer + ) + + train_result = trainer.train() + trainer.evaluate() + trainer.save_metrics("train", train_result.metrics) + + with open(first_output_dir / "train_results.json") as fp: + first_training_report = json.load(fp) + + # Case 1: Resuming from checkpoint by specifying a checkpoint directory. 
+        second_output_dir = tmpdir / "second_run"
+        resume_from_checkpoint = first_output_dir / "checkpoint-4"
+        args = create_training_args(second_output_dir, resume_from_checkpoint=resume_from_checkpoint)
+        model = create_model()
+        trainer = NeuronTrainer(
+            model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer
+        )
+
+        train_result = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+        trainer.evaluate()
+        trainer.save_metrics("train", train_result.metrics)
+
+        with open(first_output_dir / "train_results.json") as fp:
+            second_training_report = json.load(fp)
+
+        assert first_training_report["train_loss"] == second_training_report["train_loss"]
+
+        # Case 2: Resuming from checkpoint by specifying an output_dir with checkpoints.
+        # max_steps + 10 so that this run performs more training steps than the previous one.
+        second_output_dir = first_output_dir
+        args = create_training_args(second_output_dir, max_steps=max_steps + 10)
+        model = create_model()
+
+        trainer = NeuronTrainer(
+            model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer
+        )
+
+        trainer.train(resume_from_checkpoint=True)
+        trainer.evaluate()
diff --git a/tests/distributed/utils.py b/tests/distributed/utils.py
index b021ae4aa..8cd35f214 100644
--- a/tests/distributed/utils.py
+++ b/tests/distributed/utils.py
@@ -14,12 +14,14 @@
 # limitations under the License.
 """Utilities for tests distributed."""
 
+import contextlib
 import functools
 import inspect
-from contextlib import contextmanager
+from pathlib import Path
 from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Type, Union
 
 import torch
+from transformers import AutoConfig, AutoTokenizer
 from transformers.models.auto import get_values
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
@@ -39,6 +41,8 @@
     MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
 )
 
+from optimum.neuron import ModelParallelismPlugin, NeuronAccelerator
+from optimum.neuron.distributed import lazy_load_for_parallelism
 from optimum.neuron.utils.patching import DynamicPatch, Patcher
 from optimum.neuron.utils.require_utils import requires_neuronx_distributed, requires_torch_xla
 
@@ -47,6 +51,10 @@
     from transformers import PreTrainedModel
 
 
+SEED = 42
+
+
+@requires_neuronx_distributed
 def generate_dummy_labels(
     model: "PreTrainedModel",
     shape: List[int],
@@ -55,8 +63,13 @@
     device: Optional[Union[str, torch.device]] = None,
 ) -> Dict[str, torch.Tensor]:
     """Generates dummy labels."""
+    from neuronx_distributed.pipeline import NxDPPModel
+
+    if isinstance(model, NxDPPModel):
+        model_class_name = model.original_torch_module.__class__.__name__
+    else:
+        model_class_name = model.__class__.__name__
-    model_class_name = model.__class__.__name__
 
     labels = {}
     batch_size = shape[0]
 
@@ -99,10 +112,9 @@
                 f', or "multi_label_classification", but "{model.config.problem_type}" was provided.'
) labels["labels"] = torch.zeros(*labels_shape, dtype=labels_dtype, device=device) - elif model_class_name in [ - *get_values(MODEL_FOR_PRETRAINING_MAPPING_NAMES), *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES), + *get_values(MODEL_FOR_PRETRAINING_MAPPING_NAMES), *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES), *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES), @@ -113,12 +125,16 @@ def generate_dummy_labels( ]: if vocab_size is None: raise ValueError( - "The vocabulary size needs to be specified to generte dummy labels for language-modeling tasks." + "The vocabulary size needs to be specified to generate dummy labels for language-modeling tasks." ) if seed is not None: orig_seed = torch.seed() torch.manual_seed(seed) - random_labels = torch.randint(0, vocab_size, shape, dtype=torch.long) + if model_class_name in get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES): + max_value = model.config.num_labels + else: + max_value = vocab_size + random_labels = torch.randint(0, max_value, shape, dtype=torch.long) if device is not None: random_labels = random_labels.to(device) labels["labels"] = random_labels @@ -211,7 +227,7 @@ def wrapper(*args, **kwargs): return wrapper -@contextmanager +@contextlib.contextmanager def create_static_seed_patcher(model_class: Type["PreTrainedModel"], seed: int): """ Context manager that resets the seed to a given value for every initialization function. @@ -220,14 +236,14 @@ def create_static_seed_patcher(model_class: Type["PreTrainedModel"], seed: int): """ specialized_static_initializer_seed = functools.partial(static_initializer_seed, seed=seed) - class_module_name = inspect.getmodule(model_class).__name__ - fully_qualified_method_name = f"{class_module_name}.{model_class.__name__}._init_weights" + inspect.getmodule(model_class).__name__ dynamic_patch = DynamicPatch(specialized_static_initializer_seed) patcher = Patcher( [ - (fully_qualified_method_name, dynamic_patch), + # (fully_qualified_method_name, dynamic_patch), ("torch.nn.Embedding.reset_parameters", dynamic_patch), ("torch.nn.Linear.reset_parameters", dynamic_patch), + ("torch.Tensor.normal_", dynamic_patch), ("neuronx_distributed.parallel_layers.layers.ColumnParallelLinear.init_weight_cpu", dynamic_patch), ("neuronx_distributed.parallel_layers.layers.RowParallelLinear.init_weight_cpu", dynamic_patch), ] @@ -237,3 +253,116 @@ def create_static_seed_patcher(model_class: Type["PreTrainedModel"], seed: int): yield finally: pass + + +def get_model( + model_class: Type["PreTrainedModel"], + model_name_or_path: str, + tp_size: int = 1, + pp_size: int = 1, + lazy_load: bool = False, + from_config: bool = False, + use_static_seed_patcher: bool = False, + add_random_noise: bool = False, + config_overwrite: Optional[Dict[str, str]] = None, +) -> "PreTrainedModel": + if lazy_load: + ctx = lazy_load_for_parallelism(tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size) + else: + ctx = contextlib.nullcontext() + if use_static_seed_patcher: + seed_patcher = create_static_seed_patcher(model_class, SEED) + else: + seed_patcher = contextlib.nullcontext() + with ctx: + with seed_patcher: + config = AutoConfig.from_pretrained(model_name_or_path) + if config_overwrite is not None: + for key, value in config_overwrite.items(): + attr_type = type(getattr(config, key)) + setattr(config, key, attr_type(value)) + if from_config: + model = model_class(config) + else: + model = model_class.from_pretrained(model_name_or_path, config=config, 
ignore_mismatched_sizes=True) + + if getattr(model.config, "problem_type", None) is None: + model.config.problem_type = "single_label_classification" + + if add_random_noise: + for param in model.parameters(): + param.data.add_(torch.randn_like(param)) + + return model + + +def get_model_inputs( + model: "PreTrainedModel", + model_name_or_path: str, + include_labels: bool = True, + random_labels: bool = True, + batch_size: int = 1, + pad_to_multiple_of: Optional[int] = None, +): + input_str = "Hello there, I'm Michael and I live in Paris!" + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + + inputs = tokenizer(input_str, return_tensors="pt") + + if model.config.is_encoder_decoder: + sig = inspect.signature(model.forward) + for input_name in inputs: + decoder_input_name = f"decoder_{input_name}" + if decoder_input_name in sig.parameters: + inputs[decoder_input_name] = inputs[input_name].clone() + + if include_labels: + if random_labels: + labels = generate_dummy_labels(model, inputs["input_ids"].shape, vocab_size=model.config.vocab_size) + inputs.update(**labels) + else: + labels = tokenizer(input_str, return_tensors="pt")["input_ids"] + inputs["labels"] = labels + + if batch_size > 1: + for name, tensor in inputs.items(): + repeat = [batch_size] + [1] * (tensor.dim() - 1) + tensor = tensor.repeat(*repeat) + inputs[name] = tensor + + if pad_to_multiple_of is not None: + pad_token_id = getattr(model.config, "pad_token_id", 1) + for name, tensor in inputs.items(): + if tensor.dim() == 2 and tensor.shape[1] % pad_to_multiple_of != 0: + if "attention_mask" not in name: + pad_value = pad_token_id + else: + pad_value = 1 + tensor = torch.nn.functional.pad( + tensor, + pad=(0, pad_to_multiple_of - tensor.shape[1] % pad_to_multiple_of), + value=pad_value, + ) + inputs[name] = tensor + return inputs + + +def create_accelerator_for_mp( + tp_size: int, + pp_size: int, + zero_1: bool = False, + gradient_accumulation_steps: int = 1, + parallelize_embeddings: bool = True, + sequence_parallel_enabled: bool = True, + checkpoint_dir: Optional[Union[Path, str]] = None, +) -> NeuronAccelerator: + mp_plugin = ModelParallelismPlugin( + tensor_parallel_size=tp_size, + parallelize_embeddings=parallelize_embeddings, + sequence_parallel_enabled=sequence_parallel_enabled, + pipeline_parallel_size=pp_size, + checkpoint_dir=checkpoint_dir, + ) + return NeuronAccelerator( + mp_plugin=mp_plugin, zero_1=zero_1, gradient_accumulation_steps=gradient_accumulation_steps + ) diff --git a/tests/test_cache_utils.py b/tests/test_cache_utils.py index f92dba1d1..ffd2c2e7d 100644 --- a/tests/test_cache_utils.py +++ b/tests/test_cache_utils.py @@ -24,8 +24,9 @@ from typing import List from unittest import TestCase +import huggingface_hub import torch -from huggingface_hub import HfApi, HfFolder, create_repo, delete_repo, hf_hub_download +from huggingface_hub import HfApi, create_repo, delete_repo, get_token, hf_hub_download, login from transformers import BertConfig, BertModel, set_seed from transformers.testing_utils import TOKEN as TRANSFORMERS_TOKEN from transformers.testing_utils import USER as TRANSFORMERS_USER @@ -246,8 +247,8 @@ def test_list_in_registry_dict(self): @is_staging_test class StagingNeuronUtilsTestCase(StagingTestMixin, TestCase): def test_set_custom_cache_repo_name_in_hf_home(self): - orig_token = HfFolder.get_token() - HfFolder.save_token(TOKEN) + orig_token = get_token() + login(TOKEN) repo_name = f"blablabla-{self.seed}" repo_id = f"{USER}/{repo_name}" @@ -262,7 +263,7 @@ def remove_repo(): 
except ValueError as e: remove_repo() if orig_token: - HfFolder.save_token(orig_token) + login(orig_token) self.fail(str(e)) with open(f"{tmpdirname}/{CACHE_REPO_FILENAME}", "r") as fp: @@ -276,20 +277,25 @@ def remove_repo(): remove_repo() if orig_token: - HfFolder.save_token(orig_token) + login(orig_token) def test_has_write_access_to_repo(self): + orig_token = get_token() + wrong_token = "random_string" - HfFolder.save_token(wrong_token) + path = Path(huggingface_hub.constants.HF_TOKEN_PATH) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(wrong_token) self.assertFalse(has_write_access_to_repo(self.CUSTOM_CACHE_REPO)) self.assertFalse(has_write_access_to_repo(self.CUSTOM_PRIVATE_CACHE_REPO)) - HfFolder.save_token(self._staging_token) + login(orig_token) self.assertTrue(has_write_access_to_repo(self.CUSTOM_CACHE_REPO)) self.assertTrue(has_write_access_to_repo(self.CUSTOM_PRIVATE_CACHE_REPO)) + @is_trainium_test def test_list_in_registry(self): def _test_list_in_registry(use_private_cache_repo: bool): if use_private_cache_repo: @@ -341,6 +347,7 @@ def _test_list_in_registry(use_private_cache_repo: bool): _test_list_in_registry(True) +@is_trainium_test class NeuronHashTestCase(TestCase): def test_neuron_hash_is_not_mutable(self): bert_model = BertModel(BertConfig()) diff --git a/tests/test_examples.py b/tests/test_examples.py index 149486e65..38e1d23a1 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -24,7 +24,7 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar, Union from unittest import TestCase -import huggingface_hub +from huggingface_hub import get_token from transformers import ( CONFIG_MAPPING, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, @@ -40,7 +40,9 @@ ) from transformers.testing_utils import slow +from optimum.neuron.distributed.parallelizers_manager import ParallelizersManager from optimum.neuron.utils.cache_utils import load_custom_cache_repo_name_from_hf_home +from optimum.neuron.utils.import_utils import is_neuronx_distributed_available from optimum.neuron.utils.misc import string_to_bool from optimum.neuron.utils.runner import ExampleRunner from optimum.neuron.utils.testing_utils import is_trainium_test @@ -56,7 +58,9 @@ TypeOrDictOfType = Union[T, Dict[str, T]] -TOKEN = huggingface_hub.get_token() +TOKEN = get_token() +if os.environ.get("HF_TOKEN", None) is not None: + TOKEN = os.environ.get("HF_TOKEN") DEFAULT_CACHE_REPO = "optimum-internal-testing/optimum-neuron-cache-for-testing" SAVED_CUSTOM_CACHE_REPO = load_custom_cache_repo_name_from_hf_home() @@ -267,7 +271,7 @@ def __new__(cls, name, bases, attrs, example_name=None): for model_type, model_name_or_path, tp_support, config_overrides in models_to_test: # Regular training. attrs[f"test_{example_name}_{model_type}"] = cls._create_test( - model_type, model_name_or_path, 1, True, False, config_overrides + model_type, model_name_or_path, 1, 1, True, False, config_overrides ) # Training with ZeRO-1. @@ -277,13 +281,21 @@ def __new__(cls, name, bases, attrs, example_name=None): # ) tensor_parallel_size = 2 if tp_support is not TPSupport.NONE else 1 + + if not is_neuronx_distributed_available(): + pp_support = False + else: + pp_support = ParallelizersManager.parallelizer_for_model(model_type).supports_pipeline_parallelism() + pipeline_parallel_size = 4 if pp_support else 1 + disable_embedding_parallelization = tp_support is TPSupport.PARTIAL if tensor_parallel_size > 1: # Training with TP if supported. 
- attrs[f"test_{example_name}_{model_type}_with_tp"] = cls._create_test( + attrs[f"test_{example_name}_{model_type}_with_tp_only"] = cls._create_test( model_type, model_name_or_path, tensor_parallel_size, + 1, # No pipeline parallelism in this test. disable_embedding_parallelization, False, config_overrides, @@ -294,6 +306,40 @@ def __new__(cls, name, bases, attrs, example_name=None): # model_type, # model_name_or_path, # tensor_parallel_size, + # 1, # No pipeline parallelism in this test. + # disable_embedding_parallelization, + # True, + # config_overrides, + # ) + + if pipeline_parallel_size > 1: + # Training with PP if supported. + attrs[f"test_{example_name}_{model_type}_with_pp_only"] = cls._create_test( + model_type, + model_name_or_path, + 1, # No tensor parallelism in this test. + pipeline_parallel_size, + disable_embedding_parallelization, + False, + config_overrides, + ) + + if tensor_parallel_size > 1 and pipeline_parallel_size > 1: + attrs[f"test_{example_name}_{model_type}_with_tp_and_pp"] = cls._create_test( + model_type, + model_name_or_path, + tensor_parallel_size, + pipeline_parallel_size, + disable_embedding_parallelization, + False, + config_overrides, + ) + # TODO: enable when working on the multi-node training PR. + # attrs[f"test_{example_name}_{model_type}_with_tp_and_pp_and_zero1"] = cls._create_test( + # model_type, + # model_name_or_path, + # tensor_parallel_size, + # pipeline_parallel_size, # disable_embedding_parallelization, # True, # config_overrides, @@ -344,6 +390,7 @@ def _create_test( model_type: str, model_name_or_path: str, tensor_parallel_size: int, + pipeline_parallel_size: int, disable_embedding_parallelization: bool, zero_1: bool, config_overrides: Optional[Dict[str, Any]] = None, @@ -351,9 +398,6 @@ def _create_test( """ Creates a test function that runs an example for a model_name. - Args: - model_name (`str`): the model_name_or_path. - Returns: `Callable[[ExampleTesterBase], None]`: The test function that runs the example. 
""" @@ -395,6 +439,7 @@ def test(self): save_total_limit=1, learning_rate=self.LEARNING_RATE, tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, disable_embedding_parallelization=disable_embedding_parallelization, zero_1=zero_1, output_dir=tmpdirname, diff --git a/tests/test_runner.py b/tests/test_runner.py index 56c18dc38..56a2a3e19 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -17,7 +17,7 @@ import os from unittest import TestCase -import huggingface_hub +from huggingface_hub import get_token, login from parameterized import parameterized from optimum.neuron.utils.cache_utils import ( @@ -58,14 +58,22 @@ class TestExampleRunner(TestCase): @classmethod def setUpClass(cls): - cls._token = huggingface_hub.get_token() + cls._token = get_token() cls._cache_repo = load_custom_cache_repo_name_from_hf_home() cls._env = dict(os.environ) - set_custom_cache_repo_name_in_hf_home(cls.CACHE_REPO_NAME) + if os.environ.get("HF_TOKEN", None) is not None: + token = os.environ.get("HF_TOKEN") + + login(token) + set_custom_cache_repo_name_in_hf_home(cls.CACHE_REPO_NAME) + else: + raise RuntimeError("Please specify the token via the HF_TOKEN environment variable.") @classmethod def tearDownClass(cls): os.environ = cls._env + if cls._token is not None: + login(cls._token) if cls._cache_repo is not None: try: set_custom_cache_repo_name_in_hf_home(cls._cache_repo) diff --git a/tests/test_utils.py b/tests/test_utils.py index 4fc002bee..d10082ccf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -21,7 +21,7 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset from transformers import BertConfig, BertForSequenceClassification, PreTrainedModel, Wav2Vec2Config, Wav2Vec2Model -from optimum.neuron.trainers import MODEL_PATCHING_SPECS +from optimum.neuron.accelerate.accelerator import MODEL_PATCHING_SPECS from optimum.neuron.utils import ModelPatcher from optimum.neuron.utils.testing_utils import is_trainium_test from optimum.neuron.utils.training_utils import FirstAndLastDataset, is_model_officially_supported diff --git a/tests/utils.py b/tests/utils.py index 2b6caf8e8..f4b584e8c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -24,7 +24,7 @@ import torch from datasets import Dataset, DatasetDict -from huggingface_hub import CommitOperationDelete, HfApi, HfFolder, create_repo, delete_repo, logout +from huggingface_hub import CommitOperationDelete, HfApi, create_repo, delete_repo, get_token, login, logout from huggingface_hub.utils import RepositoryNotFoundError from transformers import PretrainedConfig, PreTrainedModel from transformers.testing_utils import ENDPOINT_STAGING @@ -135,7 +135,7 @@ def create_tiny_pretrained_model( class TrainiumTestMixin: @classmethod def setUpClass(cls): - cls._token = HfFolder.get_token() + cls._token = get_token() cls._cache_repo = load_custom_cache_repo_name_from_hf_home() cls._env = dict(os.environ) @@ -143,7 +143,7 @@ def setUpClass(cls): def tearDownClass(cls): os.environ = cls._env if cls._token is not None: - HfFolder.save_token(cls._token) + login(cls._token) if cls._cache_repo is not None: try: set_custom_cache_repo_name_in_hf_home(cls._cache_repo) @@ -161,10 +161,11 @@ class StagingTestMixin: MAX_NUM_LINEARS = 20 @classmethod - def set_hf_hub_token(cls, token: str) -> str: - orig_token = HfFolder.get_token() + def set_hf_hub_token(cls, token: Optional[str]) -> Optional[str]: + orig_token = get_token() + login(token=token) if token is not None: - HfFolder.save_token(token) + 
login(token=token)
         else:
             logout()
         cls._env = dict(os.environ, HF_ENDPOINT=ENDPOINT_STAGING)
@@ -214,8 +215,8 @@ def remove_all_files_in_repo(self, repo_id: str):
         except RepositoryNotFoundError:
             pass
 
-    def tearDown(self) -> None:
-        HfFolder.save_token(TOKEN)
+    def tearDown(self):
+        login(TOKEN)
         self.remove_all_files_in_repo(self.CUSTOM_CACHE_REPO)
         self.remove_all_files_in_repo(self.CUSTOM_PRIVATE_CACHE_REPO)
 
diff --git a/tools/create_examples_from_transformers.py b/tools/create_examples_from_transformers.py
index 61d25030d..c95b6a7c9 100755
--- a/tools/create_examples_from_transformers.py
+++ b/tools/create_examples_from_transformers.py
@@ -177,7 +177,10 @@ def wrap_with_lazy_load_for_parallelism(file_content: str) -> str:
         # Adding one tab to indent from the lazy_load_for_parallelism context manager.
         number_of_spaces += 4
         model_loading_content = " " * number_of_spaces + model_loading_content
-        new_content = f"with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size):\n{model_loading_content}\n"
+        new_content = (
+            "with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size, "
+            f"pipeline_parallel_size=training_args.pipeline_parallel_size):\n{model_loading_content}\n"
+        )
         file_content = file_content[:start] + new_content + file_content[position + 1 :]
         shift += len(new_content) - initial_length
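A minimal sketch of how the new helpers in `tests/distributed/utils.py` fit together, mirroring the flow of `_parallel_model_matches_original_model`: it assumes a Trainium host, relative imports from within the `tests/distributed` package (as the tests themselves use), and placeholder parallel sizes; the checkpoint is the tiny test model already referenced in `test_training.py`.

```python
# Sketch only: compose the helpers added in tests/distributed/utils.py.
from transformers import LlamaForCausalLM

from .utils import create_accelerator_for_mp, get_model, get_model_inputs

MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-random"
tp_size, pp_size, dp_size = 2, 2, 1  # placeholder parallel sizes

# Instantiate lazily so weights are only materialized once the parallel layout is
# known; the static seed patcher keeps every weight initialization deterministic.
model = get_model(
    LlamaForCausalLM,
    MODEL_NAME,
    tp_size=tp_size,
    pp_size=pp_size,
    lazy_load=True,
    use_static_seed_patcher=True,
)

# Dummy inputs (with labels), padded so that sequence parallelism can split the
# sequence dimension evenly across the tensor-parallel group.
inputs = get_model_inputs(model, MODEL_NAME, batch_size=dp_size, pad_to_multiple_of=tp_size)

# The accelerator wraps a ModelParallelismPlugin built from the same tp/pp sizes;
# prepare() performs the actual parallelization of the model.
accelerator = create_accelerator_for_mp(tp_size, pp_size)
model = accelerator.prepare(model)
```

In the tests above, the `accelerator.prepare(model)` call is additionally wrapped in the static seed patcher so that the parallel and original models start from identical weights before their outputs are compared.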