diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml
index fe788bc34..7410502b0 100644
--- a/.github/workflows/check_code_quality.yml
+++ b/.github/workflows/check_code_quality.yml
@@ -48,4 +48,5 @@ jobs:
     - name: Check style with ruff
       run: |
         source venv/bin/activate
+        ruff format . --diff
         ruff check .
diff --git a/Makefile b/Makefile
index 12febb89c..c9444e23f 100644
--- a/Makefile
+++ b/Makefile
@@ -60,9 +60,11 @@ transformers_examples:
 # Run code quality checks
 style_check:
 	ruff check .
+	ruff format . --diff
 
 style:
 	ruff check . --fix
+	ruff format .
 
 # Utilities to release to PyPi
 build_dist_install_tools:
diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py
index bedf48ec9..8b7134647 100755
--- a/examples/language-modeling/run_clm.py
+++ b/examples/language-modeling/run_clm.py
@@ -462,7 +462,7 @@ def main():
             model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
 
         n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
-        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+        logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")
 
     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
     # on a small vocab and want a smaller embedding size, remove this test.
diff --git a/examples/question-answering/trainer_qa.py b/examples/question-answering/trainer_qa.py
index 8243448a0..1e1119de3 100644
--- a/examples/question-answering/trainer_qa.py
+++ b/examples/question-answering/trainer_qa.py
@@ -15,6 +15,7 @@
 """
 A subclass of `Trainer` specific to Question-Answering tasks
 """
+
 import math
 import time
 
diff --git a/examples/question-answering/trainer_seq2seq_qa.py b/examples/question-answering/trainer_seq2seq_qa.py
index 6e04bf3f6..2a3dbe5ca 100644
--- a/examples/question-answering/trainer_seq2seq_qa.py
+++ b/examples/question-answering/trainer_seq2seq_qa.py
@@ -15,6 +15,7 @@
 """
 A subclass of `Trainer` specific to Question-Answering tasks
 """
+
 import math
 import time
 from typing import Dict, List, Optional
diff --git a/examples/question-answering/utils_qa.py b/examples/question-answering/utils_qa.py
index 23a46370d..79497dbb8 100644
--- a/examples/question-answering/utils_qa.py
+++ b/examples/question-answering/utils_qa.py
@@ -15,6 +15,7 @@
 """
 Post-processing utilities for question answering.
 """
+
 import collections
 import json
 import logging
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
index 5a442c075..90be3c604 100755
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -525,9 +525,9 @@ def main():
         return
 
     if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
-        assert (
-            data_args.lang is not None
-        ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
+        assert data_args.lang is not None, (
+            f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
+        )
 
         tokenizer.src_lang = data_args.lang
         tokenizer.tgt_lang = data_args.lang
diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py
index 75b321be0..e9f1fb6f0 100755
--- a/examples/text-classification/run_glue.py
+++ b/examples/text-classification/run_glue.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for sequence classification on GLUE."""
+"""Finetuning the library models for sequence classification on GLUE."""
 # You can also adapt this script on your own text classification task. Pointers for this are left as comments.
 
 import logging
@@ -158,9 +158,9 @@ def __post_init__(self):
             train_extension = self.train_file.split(".")[-1]
             assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
             validation_extension = self.validation_file.split(".")[-1]
-            assert (
-                validation_extension == train_extension
-            ), "`validation_file` should have the same extension (csv or json) as `train_file`."
+            assert validation_extension == train_extension, (
+                "`validation_file` should have the same extension (csv or json) as `train_file`."
+            )
 
 
 @dataclass
@@ -329,9 +329,9 @@ def main():
             if data_args.test_file is not None:
                 train_extension = data_args.train_file.split(".")[-1]
                 test_extension = data_args.test_file.split(".")[-1]
-                assert (
-                    test_extension == train_extension
-                ), "`test_file` should have the same extension (csv or json) as `train_file`."
+                assert test_extension == train_extension, (
+                    "`test_file` should have the same extension (csv or json) as `train_file`."
+                )
                 data_files["test"] = data_args.test_file
             else:
                 raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
diff --git a/examples/text-classification/run_xnli.py b/examples/text-classification/run_xnli.py
index 4b06d2653..23f79a5bb 100755
--- a/examples/text-classification/run_xnli.py
+++ b/examples/text-classification/run_xnli.py
@@ -14,8 +14,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning multi-lingual models on XNLI (e.g. Bert, DistilBERT, XLM).
-    Adapted from `examples/text-classification/run_glue.py`"""
+"""Finetuning multi-lingual models on XNLI (e.g. Bert, DistilBERT, XLM).
+Adapted from `examples/text-classification/run_glue.py`"""
 
 import logging
 import os
diff --git a/notebooks/sentence-transformers/getting-started.ipynb b/notebooks/sentence-transformers/getting-started.ipynb
index 148022fe8..17aaadb74 100644
--- a/notebooks/sentence-transformers/getting-started.ipynb
+++ b/notebooks/sentence-transformers/getting-started.ipynb
@@ -103,14 +103,14 @@
     "\n",
     "# Run inference\n",
     "prompt = \"I like to eat apples\"\n",
-    "encoded_input = tokenizer(prompt, return_tensors='pt')\n",
+    "encoded_input = tokenizer(prompt, return_tensors=\"pt\")\n",
     "outputs = model(**encoded_input)\n",
     "\n",
     "token_embeddings = outputs.token_embeddings\n",
     "sentence_embedding = outputs.sentence_embedding\n",
     "\n",
-    "print(f\"token embeddings: {token_embeddings.shape}\") # torch.Size([1, 7, 384])\n",
-    "print(f\"sentence_embedding: {sentence_embedding.shape}\") # torch.Size([1, 384])"
+    "print(f\"token embeddings: {token_embeddings.shape}\")  # torch.Size([1, 7, 384])\n",
+    "print(f\"sentence_embedding: {sentence_embedding.shape}\")  # torch.Size([1, 384])"
    ]
   },
   {
diff --git a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb
index 775fd254c..cba09eb74 100644
--- a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb
+++ b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb
@@ -86,7 +86,9 @@
    "outputs": [],
    "source": [
     "# Push and share your model to the HuggingFace hub\n",
-    "repository_id = \"your-username/your-awesome-model\"  # Replace with your repo id, eg. \"Jingya/stable-diffusion-2-1-neuronx\".\n",
+    "repository_id = (\n",
+    "    \"your-username/your-awesome-model\"  # Replace with your repo id, eg. \"Jingya/stable-diffusion-2-1-neuronx\".\n",
+    ")\n",
     "stable_diffusion.push_to_hub(save_directory, repository_id=repository_id, use_auth_token=True)"
    ]
   },
@@ -659,7 +661,7 @@
     "    \"engineers eating lunch at the opera\",\n",
     "    \"panda eating bamboo on a plane\",\n",
     "    \"A digital illustration of a steampunk flying machine in the sky with cogs and mechanisms, 4k, detailed, trending in artstation, fantasy vivid colors\",\n",
-    "    \"kids playing soccer at the FIFA World Cup\"\n",
+    "    \"kids playing soccer at the FIFA World Cup\",\n",
     "]\n",
     "\n",
     "\n",
@@ -675,7 +677,7 @@
     "    print(f\"[Inference Time] {np.round(inf_time, 2)} seconds.\")\n",
     "    image.save(\"image.png\")\n",
     "    image = mpimg.imread(\"image.png\")\n",
-    "    #clear_output(wait=True)\n",
+    "    # clear_output(wait=True)\n",
     "    plt.imshow(image)\n",
     "    plt.show()"
    ]
diff --git a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb
index c8fafda7c..abbd735ce 100644
--- a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb
+++ b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb
@@ -87,7 +87,9 @@
    "outputs": [],
    "source": [
     "# Push and share your model to the HuggingFace hub\n",
-    "repository_id = \"your-username/your-awesome-model\"  # Replace with your repo id, eg. \"Jingya/stable-diffusion-xl-base-1.0-neuronx\".\n",
+    "repository_id = (\n",
+    "    \"your-username/your-awesome-model\"  # Replace with your repo id, eg. \"Jingya/stable-diffusion-xl-base-1.0-neuronx\".\n",
+    ")\n",
     "stable_diffusion_xl.push_to_hub(save_directory, repository_id=repository_id, use_auth_token=True)"
    ]
   },
@@ -708,7 +710,7 @@
     "    \"engineers eating lunch at the opera\",\n",
     "    \"panda eating bamboo on a plane\",\n",
     "    \"A digital illustration of a steampunk flying machine in the sky with cogs and mechanisms, 4k, detailed, trending in artstation, fantasy vivid colors\",\n",
-    "    \"kids playing soccer at the FIFA World Cup\"\n",
+    "    \"kids playing soccer at the FIFA World Cup\",\n",
     "]\n",
     "\n",
     "\n",
@@ -724,7 +726,7 @@
     "    print(f\"[Inference Time] {np.round(inf_time, 2)} seconds.\")\n",
     "    image.save(\"image.png\")\n",
     "    image = mpimg.imread(\"image.png\")\n",
-    "    #clear_output(wait=True)\n",
+    "    # clear_output(wait=True)\n",
     "    plt.imshow(image)\n",
     "    plt.show()"
    ]
diff --git a/notebooks/text-classification/notebook.ipynb b/notebooks/text-classification/notebook.ipynb
index b03ac1502..86e1feac1 100644
--- a/notebooks/text-classification/notebook.ipynb
+++ b/notebooks/text-classification/notebook.ipynb
@@ -118,8 +118,8 @@
     "from random import randrange\n",
     "\n",
     "\n",
-    "random_id = randrange(len(raw_dataset['train']))\n",
-    "raw_dataset['train'][random_id]\n",
+    "random_id = randrange(len(raw_dataset[\"train\"]))\n",
+    "raw_dataset[\"train\"][random_id]\n",
     "# {'text': 'i feel isolated and alone in my trade', 'label': 0}"
    ]
   },
@@ -152,18 +152,20 @@
     "# Load Tokenizer\n",
     "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
     "\n",
+    "\n",
     "# Tokenize helper function\n",
     "def tokenize(batch):\n",
-    "    return tokenizer(batch['text'], padding='max_length', truncation=True,return_tensors=\"pt\")\n",
+    "    return tokenizer(batch[\"text\"], padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
+    "\n",
     "\n",
     "# Tokenize dataset\n",
-    "raw_dataset =  raw_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n",
+    "raw_dataset = raw_dataset.rename_column(\"label\", \"labels\")  # to match Trainer\n",
     "tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n",
     "tokenized_dataset = tokenized_dataset.with_format(\"torch\")\n",
     "\n",
     "# save dataset to disk\n",
-    "tokenized_dataset[\"train\"].save_to_disk(os.path.join(save_dataset_path,\"train\"))\n",
-    "tokenized_dataset[\"test\"].save_to_disk(os.path.join(save_dataset_path,\"eval\"))"
+    "tokenized_dataset[\"train\"].save_to_disk(os.path.join(save_dataset_path, \"train\"))\n",
+    "tokenized_dataset[\"test\"].save_to_disk(os.path.join(save_dataset_path, \"eval\"))"
    ]
   },
   {
diff --git a/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb b/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb
index f3ebf98fc..318da4ef5 100644
--- a/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb
+++ b/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb
@@ -97,8 +97,9 @@
     "from optimum.neuron import pipeline\n",
     "\n",
     "\n",
-    "p = pipeline('text-generation', 'aws-neuron/CodeLlama-7b-hf-neuron-8xlarge')\n",
-    "p(\"import socket\\n\\ndef ping_exponential_backoff(host: str):\",\n",
+    "p = pipeline(\"text-generation\", \"aws-neuron/CodeLlama-7b-hf-neuron-8xlarge\")\n",
+    "p(\n",
+    "    \"import socket\\n\\ndef ping_exponential_backoff(host: str):\",\n",
     "    do_sample=True,\n",
     "    top_k=10,\n",
     "    temperature=0.1,\n",
@@ -191,10 +192,12 @@
     "from optimum.neuron import NeuronModelForCausalLM\n",
     "\n",
     "\n",
-    "#num_cores should be changed based on the instance.  inf2.24xlarge has 6 neuron processors (they have two cores each) so 12 total\n",
-    "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": 'fp16'}\n",
+    "# num_cores should be changed based on the instance.  inf2.24xlarge has 6 neuron processors (they have two cores each) so 12 total\n",
+    "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": \"fp16\"}\n",
     "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n",
-    "model = NeuronModelForCausalLM.from_pretrained(\"codellama/CodeLlama-7b-hf\", export=True, **compiler_args, **input_shapes)"
+    "model = NeuronModelForCausalLM.from_pretrained(\n",
+    "    \"codellama/CodeLlama-7b-hf\", export=True, **compiler_args, **input_shapes\n",
+    ")"
    ]
   },
   {
@@ -214,7 +217,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model.save_pretrained(\"CodeLlama-7b-hf-neuron-8xlarge\")\n"
+    "model.save_pretrained(\"CodeLlama-7b-hf-neuron-8xlarge\")"
    ]
   },
   {
@@ -255,7 +258,7 @@
     "from huggingface_hub.hf_api import HfFolder\n",
     "\n",
     "\n",
-    "HfFolder.save_token('MY_HUGGINGFACE_TOKEN_HERE')"
+    "HfFolder.save_token(\"MY_HUGGINGFACE_TOKEN_HERE\")"
    ]
   },
   {
diff --git a/notebooks/text-generation/llama2-13b-chatbot.ipynb b/notebooks/text-generation/llama2-13b-chatbot.ipynb
index 788ee756f..38e7abdec 100644
--- a/notebooks/text-generation/llama2-13b-chatbot.ipynb
+++ b/notebooks/text-generation/llama2-13b-chatbot.ipynb
@@ -103,13 +103,11 @@
     "from optimum.neuron import NeuronModelForCausalLM\n",
     "\n",
     "\n",
-    "compiler_args = {\"num_cores\": 24, \"auto_cast_type\": 'fp16'}\n",
+    "compiler_args = {\"num_cores\": 24, \"auto_cast_type\": \"fp16\"}\n",
     "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n",
     "model = NeuronModelForCausalLM.from_pretrained(\n",
-    "        \"NousResearch/Llama-2-13b-chat-hf\",\n",
-    "        export=True,\n",
-    "        **compiler_args,\n",
-    "        **input_shapes)"
+    "    \"NousResearch/Llama-2-13b-chat-hf\", export=True, **compiler_args, **input_shapes\n",
+    ")"
    ]
   },
   {
@@ -177,7 +175,7 @@
     "from huggingface_hub import whoami\n",
     "\n",
     "\n",
-    "org = whoami()['name']\n",
+    "org = whoami()[\"name\"]\n",
     "\n",
     "repo_id = f\"{org}/llama-2-13b-chat-neuron\"\n",
     "\n",
@@ -245,7 +243,7 @@
     "    model\n",
     "except NameError:\n",
     "    # Edit this to use another base model\n",
-    "    model = NeuronModelForCausalLM.from_pretrained('aws-neuron/Llama-2-13b-chat-hf-neuron-latency')"
+    "    model = NeuronModelForCausalLM.from_pretrained(\"aws-neuron/Llama-2-13b-chat-hf-neuron-latency\")"
    ]
   },
   {
@@ -290,12 +288,7 @@
    "outputs": [],
    "source": [
     "inputs = tokenizer(\"What is deep-learning ?\", return_tensors=\"pt\")\n",
-    "outputs = model.generate(**inputs,\n",
-    "                         max_new_tokens=128,\n",
-    "                         do_sample=True,\n",
-    "                         temperature=0.9,\n",
-    "                         top_k=50,\n",
-    "                         top_p=0.9)\n",
+    "outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.9, top_k=50, top_p=0.9)\n",
     "tokenizer.batch_decode(outputs, skip_special_tokens=True)"
    ]
   },
@@ -323,7 +316,7 @@
    "outputs": [],
    "source": [
     "def format_chat_prompt(message, history, max_tokens):\n",
-    "    \"\"\" Convert a history of messages to a chat prompt\n",
+    "    \"\"\"Convert a history of messages to a chat prompt\n",
     "    Args:\n",
     "        message(str): the new user message.\n",
     "        history (List[str]): the list of user messages and assistant responses.\n",
@@ -334,10 +327,10 @@
     "    chat = []\n",
     "    # Convert all messages in history to chat interactions\n",
     "    for interaction in history:\n",
-    "        chat.append({\"role\": \"user\", \"content\" : interaction[0]})\n",
-    "        chat.append({\"role\": \"assistant\", \"content\" : interaction[1]})\n",
+    "        chat.append({\"role\": \"user\", \"content\": interaction[0]})\n",
+    "        chat.append({\"role\": \"assistant\", \"content\": interaction[1]})\n",
     "    # Add the new message\n",
-    "    chat.append({\"role\": \"user\", \"content\" : message})\n",
+    "    chat.append({\"role\": \"user\", \"content\": message})\n",
     "    # Generate the prompt, verifying that we don't go beyond the maximum number of tokens\n",
     "    for i in range(0, len(chat), 2):\n",
     "        # Generate candidate prompt with the last n-i entries\n",
@@ -372,19 +365,17 @@
     "history = []\n",
     "max_tokens = 1024\n",
     "\n",
+    "\n",
     "def chat(message, history, max_tokens):\n",
     "    prompt = format_chat_prompt(message, history, max_tokens)\n",
     "    # Uncomment the line below to see what the formatted prompt looks like\n",
-    "    #print(prompt)\n",
+    "    # print(prompt)\n",
     "    inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
-    "    outputs = model.generate(**inputs,\n",
-    "                             max_length=2048,\n",
-    "                             do_sample=True,\n",
-    "                             temperature=0.9,\n",
-    "                             top_k=50,\n",
-    "                             repetition_penalty=1.2)\n",
+    "    outputs = model.generate(\n",
+    "        **inputs, max_length=2048, do_sample=True, temperature=0.9, top_k=50, repetition_penalty=1.2\n",
+    "    )\n",
     "    # Do not include the input tokens\n",
-    "    outputs = outputs[0, inputs.input_ids.size(-1):]\n",
+    "    outputs = outputs[0, inputs.input_ids.size(-1) :]\n",
     "    response = tokenizer.decode(outputs, skip_special_tokens=True)\n",
     "    history.append([message, response])\n",
     "    return response"
diff --git a/notebooks/text-generation/llama2-7b-fine-tuning.ipynb b/notebooks/text-generation/llama2-7b-fine-tuning.ipynb
index 08e41a58b..6456cacbf 100644
--- a/notebooks/text-generation/llama2-7b-fine-tuning.ipynb
+++ b/notebooks/text-generation/llama2-7b-fine-tuning.ipynb
@@ -238,7 +238,7 @@
     "\n",
     "\n",
     "# Hugging Face model id\n",
-    "model_id = \"philschmid/Llama-2-7b-hf\" # ungated\n",
+    "model_id = \"philschmid/Llama-2-7b-hf\"  # ungated\n",
     "# model_id = \"meta-llama/Llama-2-7b-hf\" # gated\n",
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(model_id)"
@@ -266,7 +266,7 @@
     "from random import randint\n",
     "\n",
     "\n",
-    "sys.path.append(\"./scripts/utils\") # make sure you change this to the correct path\n",
+    "sys.path.append(\"./scripts/utils\")  # make sure you change this to the correct path\n",
     "from pack_dataset import pack_dataset\n",
     "\n",
     "\n",
@@ -275,18 +275,17 @@
     "    sample[\"text\"] = f\"{format_dolly(sample)}{tokenizer.eos_token}\"\n",
     "    return sample\n",
     "\n",
+    "\n",
     "# apply prompt template per sample\n",
     "dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))\n",
     "# print random sample\n",
     "print(dataset[randint(0, len(dataset))][\"text\"])\n",
     "\n",
     "# tokenize dataset\n",
-    "dataset = dataset.map(\n",
-    "    lambda sample: tokenizer(sample[\"text\"]), batched=True, remove_columns=list(dataset.features)\n",
-    ")\n",
+    "dataset = dataset.map(lambda sample: tokenizer(sample[\"text\"]), batched=True, remove_columns=list(dataset.features))\n",
     "\n",
     "# chunk dataset\n",
-    "lm_dataset = pack_dataset(dataset, chunk_length=2048) # We use 2048 as the maximum length for packing"
+    "lm_dataset = pack_dataset(dataset, chunk_length=2048)  # We use 2048 as the maximum length for packing"
    ]
   },
   {
@@ -466,15 +465,11 @@
     "from optimum.neuron import NeuronModelForCausalLM\n",
     "\n",
     "\n",
-    "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": 'fp16'}\n",
+    "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": \"fp16\"}\n",
     "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n",
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"dolly_llama\")\n",
-    "model = NeuronModelForCausalLM.from_pretrained(\n",
-    "        \"dolly_llama\",\n",
-    "        export=True,\n",
-    "        **compiler_args,\n",
-    "        **input_shapes)\n"
+    "model = NeuronModelForCausalLM.from_pretrained(\"dolly_llama\", export=True, **compiler_args, **input_shapes)"
    ]
   },
   {
@@ -519,13 +514,8 @@
     "def generate(sample):\n",
     "    prompt = format_dolly_infernece(sample)\n",
     "    inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
-    "    outputs = model.generate(**inputs,\n",
-    "                         max_new_tokens=512,\n",
-    "                         do_sample=True,\n",
-    "                         temperature=0.9,\n",
-    "                         top_k=50,\n",
-    "                         top_p=0.9)\n",
-    "    return tokenizer.decode(outputs[0], skip_special_tokens=False)[len(prompt):]"
+    "    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.9, top_k=50, top_p=0.9)\n",
+    "    return tokenizer.decode(outputs[0], skip_special_tokens=False)[len(prompt) :]"
    ]
   },
   {
@@ -543,9 +533,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "prompt = {\n",
-    "  \"instruction\": \"Can you tell me something about AWS?\"\n",
-    "}\n",
+    "prompt = {\"instruction\": \"Can you tell me something about AWS?\"}\n",
     "res = generate(prompt)\n",
     "\n",
     "print(res)"
@@ -572,8 +560,8 @@
    "outputs": [],
    "source": [
     "prompt = {\n",
-    "  \"instruction\": \"How can train models on AWS Trainium?\",\n",
-    "  \"context\": \"🤗 Optimum Neuron is the interface between the 🤗 Transformers library and AWS Accelerators including [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/?nc1=h_ls) and [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/?nc1=h_ls). It provides a set of tools enabling easy model loading, training and inference on single- and multi-Accelerator settings for different downstream tasks.\"\n",
+    "    \"instruction\": \"How can train models on AWS Trainium?\",\n",
+    "    \"context\": \"🤗 Optimum Neuron is the interface between the 🤗 Transformers library and AWS Accelerators including [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/?nc1=h_ls) and [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/?nc1=h_ls). It provides a set of tools enabling easy model loading, training and inference on single- and multi-Accelerator settings for different downstream tasks.\",\n",
     "}\n",
     "res = generate(prompt)\n",
     "\n",
diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py
index 3e2d00ef6..5a549a697 100644
--- a/optimum/exporters/neuron/__main__.py
+++ b/optimum/exporters/neuron/__main__.py
@@ -662,7 +662,7 @@ def main_export(
             )
 
             logger.info(
-                f"The {NEURON_COMPILER} export succeeded and the exported model was saved at: " f"{output.as_posix()}"
+                f"The {NEURON_COMPILER} export succeeded and the exported model was saved at: {output.as_posix()}"
             )
         except ShapeError as e:
             raise e
@@ -678,8 +678,7 @@ def main_export(
             )
         except Exception as e:
             logger.error(
-                f"An error occured with the error message: {e}.\n The exported model was saved at: "
-                f"{output.as_posix()}"
+                f"An error occured with the error message: {e}.\n The exported model was saved at: {output.as_posix()}"
             )
 
 
diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py
index 82e842954..80958f70b 100644
--- a/optimum/exporters/neuron/config.py
+++ b/optimum/exporters/neuron/config.py
@@ -16,6 +16,7 @@
 Common Neuron configuration classes that handle most of the features for building model specific
 configurations.
 """
+
 from typing import List
 
 from ...utils import (
diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py
index b9b1a79c5..a30a7ba47 100644
--- a/optimum/exporters/neuron/convert.py
+++ b/optimum/exporters/neuron/convert.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Neuron compiled model check and export functions."""
+
 import copy
 import time
 from collections import OrderedDict
@@ -282,7 +283,7 @@ def validate_model_outputs(
 
     if shape_failures:
         msg = "\n".join(f"- {t[0]}: got {t[1]} (reference) and {t[2]} (neuron)" for t in shape_failures)
-        raise ShapeError("Output shapes do not match between reference model and the Neuron exported model:\n" "{msg}")
+        raise ShapeError(f"Output shapes do not match between reference model and the Neuron exported model:\n{msg}")
 
     if value_failures:
         msg = "\n".join(f"- {t[0]}: max diff = {t[1]}" for t in value_failures)
diff --git a/optimum/exporters/neuron/model_configs/decoder_configs.py b/optimum/exporters/neuron/model_configs/decoder_configs.py
index 30ddc808e..e2273610f 100644
--- a/optimum/exporters/neuron/model_configs/decoder_configs.py
+++ b/optimum/exporters/neuron/model_configs/decoder_configs.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Neuron export configurations for models using transformers_neuronx."""
 
-
 from optimum.exporters.tasks import TasksManager
 
 from ....neuron.models.granite.model import GraniteForSampling
diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py
index f1fb2995e..2e1c15639 100644
--- a/optimum/exporters/neuron/model_wrappers.py
+++ b/optimum/exporters/neuron/model_wrappers.py
@@ -247,13 +247,13 @@ def forward(self, input_ids, attention_mask):
         batch_size = input_ids.shape[0]
         sequence_length = input_ids.shape[1]
         if self.sequence_length is not None:
-            assert (
-                self.sequence_length
-            ), f"Different sequence length for the parallel partition({self.sequence_length}) and for dummy inputs({sequence_length}). Make sure that they have the same value."
+            assert self.sequence_length == sequence_length, (
+                f"Different sequence length for the parallel partition({self.sequence_length}) and for dummy inputs({sequence_length}). Make sure that they have the same value."
+            )
         if self.batch_size is not None:
-            assert (
-                self.batch_size
-            ), f"Different batch size for the parallel partition({self.batch_size}) and for dummy inputs({batch_size}). Make sure that they have the same value."
+            assert self.batch_size == batch_size, (
+                f"Different batch size for the parallel partition({self.batch_size}) and for dummy inputs({batch_size}). Make sure that they have the same value."
+            )
 
         encoder_output = self.model.encoder(
             input_ids=input_ids, attention_mask=attention_mask, output_attentions=False, output_hidden_states=False
diff --git a/optimum/neuron/accelerate/utils/operations.py b/optimum/neuron/accelerate/utils/operations.py
index 11345ca10..9e241dd6e 100644
--- a/optimum/neuron/accelerate/utils/operations.py
+++ b/optimum/neuron/accelerate/utils/operations.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Custom operations related to accelerate for Neuron."""
 
-
 import torch
 from accelerate.utils.operations import recursively_apply
 
diff --git a/optimum/neuron/distributed/encoder_decoder_models.py b/optimum/neuron/distributed/encoder_decoder_models.py
index 0af70494c..b86df32a8 100644
--- a/optimum/neuron/distributed/encoder_decoder_models.py
+++ b/optimum/neuron/distributed/encoder_decoder_models.py
@@ -268,7 +268,7 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
             if past_key_value is not None:
                 if len(past_key_value) != 2:
                     raise ValueError(
-                        f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
+                        f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
                     )
                 real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
 
diff --git a/optimum/neuron/distributed/parallel_layers.py b/optimum/neuron/distributed/parallel_layers.py
index bbba16c88..fd489e4eb 100644
--- a/optimum/neuron/distributed/parallel_layers.py
+++ b/optimum/neuron/distributed/parallel_layers.py
@@ -108,7 +108,7 @@ def prepare_parallel_layer_specific_kwargs(cls, **parallel_layer_specific_kwargs
                 name for name in parallel_layer_specific_kwargs if name not in default_parallel_layer_specific_kwargs
             ]
             logger.debug(
-                f'The following arguments are not allowed for {cls.__name__}: {", ".join(wrong_argument_names)}, they '
+                f"The following arguments are not allowed for {cls.__name__}: {', '.join(wrong_argument_names)}, they "
                 "will be ignored."
             )
 
diff --git a/optimum/neuron/distributed/parallelizers_manager.py b/optimum/neuron/distributed/parallelizers_manager.py
index 5f3cc2df3..a0a4fce89 100644
--- a/optimum/neuron/distributed/parallelizers_manager.py
+++ b/optimum/neuron/distributed/parallelizers_manager.py
@@ -28,7 +28,7 @@
 
 
 def parallelizer_classes_resolver(
-    model_type_to_parallelizer_class_name: Dict[str, str]
+    model_type_to_parallelizer_class_name: Dict[str, str],
 ) -> Dict[str, Type[Parallelizer]]:
     modules = []
     for module_name in _PARALLELIZER_CLASSES_MODULE_NAMES:
diff --git a/optimum/neuron/modeling.py b/optimum/neuron/modeling.py
index fa5681d26..7edbf1c61 100644
--- a/optimum/neuron/modeling.py
+++ b/optimum/neuron/modeling.py
@@ -179,9 +179,7 @@ def forward(
             # last_hidden_state -> (batch_size, sequencen_len, hidden_size)
             last_hidden_state = self.remove_padding(
                 [outputs[0]], dims=[0, 1], indices=[input_ids.shape[0], input_ids.shape[1]]
-            )[
-                0
-            ]  # Remove padding on batch_size(0), and sequence_length(1)
+            )[0]  # Remove padding on batch_size(0), and sequence_length(1)
             if len(outputs) > 1:
                 # pooler_output -> (batch_size, hidden_size)
                 pooler_output = self.remove_padding([outputs[1]], dims=[0], indices=[input_ids.shape[0]])[
@@ -264,9 +262,7 @@ def forward(
                 # token_embeddings -> (batch_size, sequencen_len, hidden_size)
                 token_embeddings = self.remove_padding(
                     [outputs[0]], dims=[0, 1], indices=[input_ids.shape[0], input_ids.shape[1]]
-                )[
-                    0
-                ]  # Remove padding on batch_size(0), and sequence_length(1)
+                )[0]  # Remove padding on batch_size(0), and sequence_length(1)
                 # sentence_embedding -> (batch_size, hidden_size)
                 sentence_embedding = self.remove_padding([outputs[1]], dims=[0], indices=[input_ids.shape[0]])[
                     0
diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py
index 53f2df76e..944d76ebe 100644
--- a/optimum/neuron/modeling_diffusion.py
+++ b/optimum/neuron/modeling_diffusion.py
@@ -1183,7 +1183,9 @@ def forward(
         if output_hidden_states:
             assert (
                 self.config.output_hidden_states or self.config.neuron.get("output_hidden_states")
-            ) == output_hidden_states, "output_hidden_states is expected to be False since the model was compiled without hidden_states as output."
+            ) == output_hidden_states, (
+                "output_hidden_states is expected to be False since the model was compiled without hidden_states as output."
+            )
 
         input_ids = input_ids.to(torch.long)  # dummy generator uses long int for tracing
         inputs = (input_ids,)
diff --git a/optimum/neuron/models/granite/hlo.py b/optimum/neuron/models/granite/hlo.py
index d66f12b8d..59330b438 100644
--- a/optimum/neuron/models/granite/hlo.py
+++ b/optimum/neuron/models/granite/hlo.py
@@ -35,7 +35,6 @@ def scale_mul(t, scale):
 
 
 class GraniteForSamplingNoEmbeddingHlo:
-
     def __init__(self, config: GraniteConfig, neuron_config: Optional[NeuronConfig] = None):
         self.config = config
         self.neuron_config = neuron_config
@@ -324,9 +323,9 @@ def layer(
             tp_degree=self.config.tp_degree,
         )
         if self.neuron_config.fuse_mlp:
-            assert all(
-                (not (x) for x in [in0_weight, in1_weight, out_weight, in0_scales, in1_scales, out_scales])
-            ), "in0, in1 and out weights have to be None"
+            assert all((not (x) for x in [in0_weight, in1_weight, out_weight, in0_scales, in1_scales, out_scales])), (
+                "in0, in1 and out weights have to be None"
+            )
             in0_weight, in0_scales = mlp_in_weight, mlp_in_scales
             out_weight, out_scales = mlp_out_weight, mlp_out_scales
 
@@ -688,7 +687,6 @@ def attention(
 
         # Single Token Generation ("Prefetch"-style) ans speculative forward
         if active_mask is not None:
-
             n_active_tokens = key.sizes[1] if bsh_cache_layout else key.sizes[0]
             if n_active_tokens > 1 and self.neuron_config and self.neuron_config.continuous_batching:
                 # For speculative forward + continuous batching, slice out samples in the batch size
diff --git a/optimum/neuron/models/granite/model.py b/optimum/neuron/models/granite/model.py
index ddd3aecf2..7b706ef19 100644
--- a/optimum/neuron/models/granite/model.py
+++ b/optimum/neuron/models/granite/model.py
@@ -159,9 +159,9 @@ def load_weights(self):
 
             # Note: Automatic MLP padding is safe since zeros are *only* introduced to intermediary state
             if self.neuron_config.fuse_mlp:
-                assert all(
-                    getattr(mlp, attr, None) for attr in ["gate_proj", "up_proj"]
-                ), "fuse_mlp need to have gate and up proj weights"
+                assert all(getattr(mlp, attr, None) for attr in ["gate_proj", "up_proj"]), (
+                    "fuse_mlp need to have gate and up proj weights"
+                )
                 assert all(
                     getattr(mlp, attr, None).weight.shape[0] % self.config.tp_degree == 0
                     for attr in ["gate_proj", "up_proj"]
diff --git a/optimum/neuron/models/granite/modules.py b/optimum/neuron/models/granite/modules.py
index 4cbbcc9f3..84b36a38d 100644
--- a/optimum/neuron/models/granite/modules.py
+++ b/optimum/neuron/models/granite/modules.py
@@ -18,7 +18,6 @@
 
 
 class GraniteForCausalLM(module.PretrainedModel):
-
     def __init__(self, config: GraniteConfig):
         super().__init__()
         dtype, _, _ = utils.parse_amp(config.amp)
@@ -34,7 +33,6 @@ def get_base_model(self):
 
 
 class GraniteModel(module.LowMemoryModule):
-
     def __init__(self, config: GraniteConfig):
         super().__init__()
         self.embed_tokens = module.LowMemoryEmbedding(config.vocab_size, config.hidden_size)
@@ -45,14 +43,12 @@ def __init__(self, config: GraniteConfig):
 
 
 class GraniteRMSNorm(module.LowMemoryModule):
-
     def __init__(self, config: GraniteConfig) -> None:
         super().__init__()
         self.weight = module.UninitializedParameter()
 
 
 class GraniteDecoderLayer(module.LowMemoryModule):
-
     def __init__(self, config: GraniteConfig):
         super().__init__()
         self.self_attn = GraniteAttention(config)
@@ -62,7 +58,6 @@ def __init__(self, config: GraniteConfig):
 
 
 class GraniteAttention(module.LowMemoryModule):
-
     def __init__(self, config: GraniteConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -77,7 +72,6 @@ def __init__(self, config: GraniteConfig):
 
 
 class GraniteMLP(module.LowMemoryModule):
-
     def __init__(self, config: GraniteConfig):
         super().__init__()
         dtype, _, _ = utils.parse_amp(config.amp)
diff --git a/optimum/neuron/models/qwen2/model.py b/optimum/neuron/models/qwen2/model.py
index 8ee60d9b4..8396a8fba 100644
--- a/optimum/neuron/models/qwen2/model.py
+++ b/optimum/neuron/models/qwen2/model.py
@@ -156,9 +156,9 @@ def load_weights(self):
 
             # Note: Automatic MLP padding is safe since zeros are *only* introduced to intermediary state
             if self.neuron_config.fuse_mlp:
-                assert all(
-                    getattr(mlp, attr, None) for attr in ["gate_proj", "up_proj"]
-                ), "fuse_mlp need to have gate and up proj weights"
+                assert all(getattr(mlp, attr, None) for attr in ["gate_proj", "up_proj"]), (
+                    "fuse_mlp need to have gate and up proj weights"
+                )
                 assert all(
                     getattr(mlp, attr, None).weight.shape[0] % self.config.tp_degree == 0
                     for attr in ["gate_proj", "up_proj"]
diff --git a/optimum/neuron/utils/cache_utils.py b/optimum/neuron/utils/cache_utils.py
index 28845713c..84a760988 100644
--- a/optimum/neuron/utils/cache_utils.py
+++ b/optimum/neuron/utils/cache_utils.py
@@ -62,7 +62,7 @@
 
 
 def load_custom_cache_repo_name_from_hf_home(
-    hf_home_cache_repo_file: Union[str, Path] = HF_HOME_CACHE_REPO_FILE
+    hf_home_cache_repo_file: Union[str, Path] = HF_HOME_CACHE_REPO_FILE,
 ) -> Optional[str]:
     if Path(hf_home_cache_repo_file).exists():
         with open(hf_home_cache_repo_file, "r") as fp:
diff --git a/optimum/neuron/utils/peft_utils.py b/optimum/neuron/utils/peft_utils.py
index 7780ff7ed..4866669ac 100644
--- a/optimum/neuron/utils/peft_utils.py
+++ b/optimum/neuron/utils/peft_utils.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Utilities related to the PEFT library and support."""
+
 import collections
 import functools
 import os
diff --git a/optimum/neuron/utils/version_utils.py b/optimum/neuron/utils/version_utils.py
index 818e2bc1e..368c4d186 100644
--- a/optimum/neuron/utils/version_utils.py
+++ b/optimum/neuron/utils/version_utils.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Version utilities."""
+
 from typing import Optional
 
 from packaging import version
diff --git a/tests/cli/test_neuron_cache_cli.py b/tests/cli/test_neuron_cache_cli.py
index 0d7887445..c8b6da574 100644
--- a/tests/cli/test_neuron_cache_cli.py
+++ b/tests/cli/test_neuron_cache_cli.py
@@ -50,18 +50,18 @@ def _optimum_neuron_cache_create(self, cache_repo_id: Optional[str] = None, publ
         try:
             repo_id = cache_repo_id if cache_repo_id is not None else CACHE_REPO_NAME
             info = HfApi().repo_info(repo_id, repo_type="model")
-            assert info.private == (
-                not public
-            ), "The privacy of the repo should match the presence of the --public flag."
+            assert info.private == (not public), (
+                "The privacy of the repo should match the presence of the --public flag."
+            )
 
         except RepositoryNotFoundError:
             pytest.fail("The repo was not created.")
         finally:
             delete_repo(repo_id)
 
-        assert (
-            repo_id == load_custom_cache_repo_name_from_hf_home()
-        ), f"Saved local Neuron cache name should be equal to {repo_id}."
+        assert repo_id == load_custom_cache_repo_name_from_hf_home(), (
+            f"Saved local Neuron cache name should be equal to {repo_id}."
+        )
 
     def test_optimum_neuron_cache_create_with_custom_name(self, hub_test):
         seed = random.randint(0, 100)
@@ -79,9 +79,9 @@ def test_optimum_neuron_cache_set(self, hub_test):
         p = subprocess.Popen(command)
         returncode = p.wait()
         assert returncode == 0
-        assert (
-            repo_id == load_custom_cache_repo_name_from_hf_home()
-        ), f"Saved local Neuron cache name should be equal to {repo_id}."
+        assert repo_id == load_custom_cache_repo_name_from_hf_home(), (
+            f"Saved local Neuron cache name should be equal to {repo_id}."
+        )
 
     def test_optimum_neuron_cache_add(self, hub_test):
         with TemporaryDirectory() as tmpdir:
diff --git a/tests/test_generate.py b/tests/test_generate.py
index 706e3538b..a4b054240 100644
--- a/tests/test_generate.py
+++ b/tests/test_generate.py
@@ -104,12 +104,12 @@ def test_greedy_decoding(self, model_name, use_cache, decoder_only, compiler_fla
 
         cpu_samples = _test_generative_decoding(model_name=model_name, device="cpu", decoder_only=decoder_only)
 
-        assert np.array_equal(
-            cpu_samples, xla_neuron_samples_fp32
-        ), "XLA Neuron FP32 output doesn't match CPU only output"
-        assert np.array_equal(
-            cpu_samples, xla_neuron_samples_bf16
-        ), "XLA Neuron bf16 output doesn't match CPU only output"
+        assert np.array_equal(cpu_samples, xla_neuron_samples_fp32), (
+            "XLA Neuron FP32 output doesn't match CPU only output"
+        )
+        assert np.array_equal(cpu_samples, xla_neuron_samples_bf16), (
+            "XLA Neuron bf16 output doesn't match CPU only output"
+        )
 
     @parameterized.expand(BEAM_SEARCH_TESTDATA)
     @pytest.mark.skip("Remove once generate fix (#262) has been merged.")
@@ -130,9 +130,9 @@ def test_beam_search_decoding(self, model_name, use_cache, decoder_only, compile
             model_name=model_name, device="cpu", decoder_only=decoder_only, generation_config_update=config_update
         )
 
-        assert np.array_equal(
-            cpu_samples, xla_neuron_samples_fp32
-        ), "XLA Neuron FP32 output doesn't match CPU only output"
-        assert np.array_equal(
-            cpu_samples, xla_neuron_samples_bf16
-        ), "XLA Neuron bf16 output doesn't match CPU only output"
+        assert np.array_equal(cpu_samples, xla_neuron_samples_fp32), (
+            "XLA Neuron FP32 output doesn't match CPU only output"
+        )
+        assert np.array_equal(cpu_samples, xla_neuron_samples_bf16), (
+            "XLA Neuron bf16 output doesn't match CPU only output"
+        )
diff --git a/tests/test_trainers.py b/tests/test_trainers.py
index 17f79248c..0d1650362 100644
--- a/tests/test_trainers.py
+++ b/tests/test_trainers.py
@@ -267,12 +267,12 @@ def test_train_and_eval_use_remote_cache(self, hub_test_with_local_cache, tmpdir
 
         # TODO: investigate that, not urgent.
         assert files_in_repo == last_files_in_repo, "No file should have been added to the Hub after first training."
-        assert (
-            files_in_cache == last_files_in_cache
-        ), "No file should have been added to the cache after first training."
-        assert (
-            second_training_duration < first_training_duration
-        ), "Second training should be faster because cached graphs can be used."
+        assert files_in_cache == last_files_in_cache, (
+            "No file should have been added to the cache after first training."
+        )
+        assert second_training_duration < first_training_duration, (
+            "Second training should be faster because cached graphs can be used."
+        )
 
     @pytest.mark.skip("Test in later release")
     def test_save_and_resume_from_checkpoint(self, parallel_sizes, tmpdir):
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 2b84d7a52..4614f4944 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -53,9 +53,9 @@ def test_patch_model():
             pass
 
     wav2vec2_model = Wav2Vec2Model(Wav2Vec2Config())
-    assert (
-        wav2vec2_model.config.layerdrop > 0
-    ), "Default Wav2vec2Config layerdrop value is already 0 so the test will not check anything."
+    assert wav2vec2_model.config.layerdrop > 0, (
+        "Default Wav2vec2Config layerdrop value is already 0 so the test will not check anything."
+    )
     patching_specs = []
     for spec in MODEL_PATCHING_SPECS:
         patching_specs.append((wav2vec2_model,) + spec)
diff --git a/text-generation-inference/server/text_generation_server/model.py b/text-generation-inference/server/text_generation_server/model.py
index c4a692c95..e8cb34ee1 100644
--- a/text-generation-inference/server/text_generation_server/model.py
+++ b/text-generation-inference/server/text_generation_server/model.py
@@ -56,7 +56,7 @@ def log_cache_size():
     if os.path.exists(path):
         usage = shutil.disk_usage(path)
         gb = 2**30
-        logger.info(f"Cache disk [{path}]: total = {usage.total/gb:.2f} G, free = {usage.free/gb:.2f} G")
+        logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G")
     else:
         raise ValueError(f"The cache directory ({path}) does not exist.")
 
diff --git a/text-generation-inference/tgi_env.py b/text-generation-inference/tgi_env.py
index 6855b468a..ff647c988 100755
--- a/text-generation-inference/tgi_env.py
+++ b/text-generation-inference/tgi_env.py
@@ -50,7 +50,7 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
     args = parser.parse_known_args(argv)[0]
 
     if not args.model_id:
-        raise Exception("No model id provided ! Either specify it using --model-id cmdline " "or MODEL_ID env var")
+        raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var")
 
     # Override env with cmdline params
     os.environ["MODEL_ID"] = args.model_id
@@ -109,7 +109,7 @@ def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Op
 
     if not all_compatible:
         logger.debug(
-            "No compatible cached entry found for model %s, env %s, available cores %s, " "neuronxcc version %s",
+            "No compatible cached entry found for model %s, env %s, available cores %s, neuronxcc version %s",
             model_id,
             get_env_dict(),
             available_cores,
@@ -139,7 +139,7 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che
 
     if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version:
         logger.debug(
-            "Compiler version conflict, the local one " "(%s) differs from the one used to compile the model (%s)",
+            "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
             neuronxcc_version,
             neuron_config["compiler_version"],
         )
@@ -163,7 +163,7 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che
         sequence_length = neuron_config["sequence_length"]
         if max_input_tokens >= sequence_length:
             logger.debug(
-                "Specified max input tokens is not compatible with config sequence length " "( %s >= %s)",
+                "Specified max input tokens is not compatible with config sequence length ( %s >= %s)",
                 max_input_tokens,
                 sequence_length,
             )
@@ -205,7 +205,7 @@ def main():
         if not compatible:
             env_dict = get_env_dict()
             msg = (
-                "Invalid neuron config and env. Config {}, env {}, available cores {}, " "neuronxcc version {}"
+                "Invalid neuron config and env. Config {}, env {}, available cores {}, neuronxcc version {}"
             ).format(neuron_config, env_dict, available_cores, neuronxcc_version)
             logger.error(msg)
             raise Exception(msg)
@@ -213,9 +213,9 @@ def main():
         neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)
 
     if not neuron_config:
-        msg = (
-            "No compatible neuron config found. Provided env {}, " "available cores {}, neuronxcc version {}"
-        ).format(get_env_dict(), available_cores, neuronxcc_version)
+        msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
+            get_env_dict(), available_cores, neuronxcc_version
+        )
         logger.error(msg)
         raise Exception(msg)
 
diff --git a/tools/auto_fill_inference_cache.py b/tools/auto_fill_inference_cache.py
index faa1e6e2d..e1772d784 100644
--- a/tools/auto_fill_inference_cache.py
+++ b/tools/auto_fill_inference_cache.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Script to cache models for inference."""
+
 import argparse
 import json
 import logging