diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml index fe788bc34..7410502b0 100644 --- a/.github/workflows/check_code_quality.yml +++ b/.github/workflows/check_code_quality.yml @@ -48,4 +48,5 @@ jobs: - name: Check style with ruff run: | source venv/bin/activate + ruff format . --diff ruff check . diff --git a/Makefile b/Makefile index 12febb89c..c9444e23f 100644 --- a/Makefile +++ b/Makefile @@ -60,9 +60,11 @@ transformers_examples: # Run code quality checks style_check: ruff check . + ruff format . --diff style: ruff check . --fix + ruff format . # Utilities to release to PyPi build_dist_install_tools: diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index bedf48ec9..8b7134647 100755 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -462,7 +462,7 @@ def main(): model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params") # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. diff --git a/examples/question-answering/trainer_qa.py b/examples/question-answering/trainer_qa.py index 8243448a0..1e1119de3 100644 --- a/examples/question-answering/trainer_qa.py +++ b/examples/question-answering/trainer_qa.py @@ -15,6 +15,7 @@ """ A subclass of `Trainer` specific to Question-Answering tasks """ + import math import time diff --git a/examples/question-answering/trainer_seq2seq_qa.py b/examples/question-answering/trainer_seq2seq_qa.py index 6e04bf3f6..2a3dbe5ca 100644 --- a/examples/question-answering/trainer_seq2seq_qa.py +++ b/examples/question-answering/trainer_seq2seq_qa.py @@ -15,6 +15,7 @@ """ A subclass of `Trainer` specific to Question-Answering tasks """ + import math import time from typing import Dict, List, Optional diff --git a/examples/question-answering/utils_qa.py b/examples/question-answering/utils_qa.py index 23a46370d..79497dbb8 100644 --- a/examples/question-answering/utils_qa.py +++ b/examples/question-answering/utils_qa.py @@ -15,6 +15,7 @@ """ Post-processing utilities for question answering. """ + import collections import json import logging diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 5a442c075..90be3c604 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -525,9 +525,9 @@ def main(): return if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): - assert ( - data_args.lang is not None - ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument" + assert data_args.lang is not None, ( + f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument" + ) tokenizer.src_lang = data_args.lang tokenizer.tgt_lang = data_args.lang diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 75b321be0..e9f1fb6f0 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" +"""Finetuning the library models for sequence classification on GLUE.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. import logging @@ -158,9 +158,9 @@ def __post_init__(self): train_extension = self.train_file.split(".")[-1] assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." validation_extension = self.validation_file.split(".")[-1] - assert ( - validation_extension == train_extension - ), "`validation_file` should have the same extension (csv or json) as `train_file`." + assert validation_extension == train_extension, ( + "`validation_file` should have the same extension (csv or json) as `train_file`." + ) @dataclass @@ -329,9 +329,9 @@ def main(): if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] - assert ( - test_extension == train_extension - ), "`test_file` should have the same extension (csv or json) as `train_file`." + assert test_extension == train_extension, ( + "`test_file` should have the same extension (csv or json) as `train_file`." + ) data_files["test"] = data_args.test_file else: raise ValueError("Need either a GLUE task or a test file for `do_predict`.") diff --git a/examples/text-classification/run_xnli.py b/examples/text-classification/run_xnli.py index 4b06d2653..23f79a5bb 100755 --- a/examples/text-classification/run_xnli.py +++ b/examples/text-classification/run_xnli.py @@ -14,8 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning multi-lingual models on XNLI (e.g. Bert, DistilBERT, XLM). - Adapted from `examples/text-classification/run_glue.py`""" +"""Finetuning multi-lingual models on XNLI (e.g. Bert, DistilBERT, XLM). +Adapted from `examples/text-classification/run_glue.py`""" import logging import os diff --git a/notebooks/sentence-transformers/getting-started.ipynb b/notebooks/sentence-transformers/getting-started.ipynb index 148022fe8..17aaadb74 100644 --- a/notebooks/sentence-transformers/getting-started.ipynb +++ b/notebooks/sentence-transformers/getting-started.ipynb @@ -103,14 +103,14 @@ "\n", "# Run inference\n", "prompt = \"I like to eat apples\"\n", - "encoded_input = tokenizer(prompt, return_tensors='pt')\n", + "encoded_input = tokenizer(prompt, return_tensors=\"pt\")\n", "outputs = model(**encoded_input)\n", "\n", "token_embeddings = outputs.token_embeddings\n", "sentence_embedding = outputs.sentence_embedding\n", "\n", - "print(f\"token embeddings: {token_embeddings.shape}\") # torch.Size([1, 7, 384])\n", - "print(f\"sentence_embedding: {sentence_embedding.shape}\") # torch.Size([1, 384])" + "print(f\"token embeddings: {token_embeddings.shape}\") # torch.Size([1, 7, 384])\n", + "print(f\"sentence_embedding: {sentence_embedding.shape}\") # torch.Size([1, 384])" ] }, { diff --git a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb index 775fd254c..cba09eb74 100644 --- a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb +++ b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb @@ -86,7 +86,9 @@ "outputs": [], "source": [ "# Push and share your model to the HuggingFace hub\n", - "repository_id = \"your-username/your-awesome-model\" # Replace with your repo id, eg. \"Jingya/stable-diffusion-2-1-neuronx\".\n", + "repository_id = (\n", + " \"your-username/your-awesome-model\" # Replace with your repo id, eg. \"Jingya/stable-diffusion-2-1-neuronx\".\n", + ")\n", "stable_diffusion.push_to_hub(save_directory, repository_id=repository_id, use_auth_token=True)" ] }, @@ -659,7 +661,7 @@ " \"engineers eating lunch at the opera\",\n", " \"panda eating bamboo on a plane\",\n", " \"A digital illustration of a steampunk flying machine in the sky with cogs and mechanisms, 4k, detailed, trending in artstation, fantasy vivid colors\",\n", - " \"kids playing soccer at the FIFA World Cup\"\n", + " \"kids playing soccer at the FIFA World Cup\",\n", "]\n", "\n", "\n", @@ -675,7 +677,7 @@ " print(f\"[Inference Time] {np.round(inf_time, 2)} seconds.\")\n", " image.save(\"image.png\")\n", " image = mpimg.imread(\"image.png\")\n", - " #clear_output(wait=True)\n", + " # clear_output(wait=True)\n", " plt.imshow(image)\n", " plt.show()" ] diff --git a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb index c8fafda7c..abbd735ce 100644 --- a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb +++ b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb @@ -87,7 +87,9 @@ "outputs": [], "source": [ "# Push and share your model to the HuggingFace hub\n", - "repository_id = \"your-username/your-awesome-model\" # Replace with your repo id, eg. \"Jingya/stable-diffusion-xl-base-1.0-neuronx\".\n", + "repository_id = (\n", + " \"your-username/your-awesome-model\" # Replace with your repo id, eg. \"Jingya/stable-diffusion-xl-base-1.0-neuronx\".\n", + ")\n", "stable_diffusion_xl.push_to_hub(save_directory, repository_id=repository_id, use_auth_token=True)" ] }, @@ -708,7 +710,7 @@ " \"engineers eating lunch at the opera\",\n", " \"panda eating bamboo on a plane\",\n", " \"A digital illustration of a steampunk flying machine in the sky with cogs and mechanisms, 4k, detailed, trending in artstation, fantasy vivid colors\",\n", - " \"kids playing soccer at the FIFA World Cup\"\n", + " \"kids playing soccer at the FIFA World Cup\",\n", "]\n", "\n", "\n", @@ -724,7 +726,7 @@ " print(f\"[Inference Time] {np.round(inf_time, 2)} seconds.\")\n", " image.save(\"image.png\")\n", " image = mpimg.imread(\"image.png\")\n", - " #clear_output(wait=True)\n", + " # clear_output(wait=True)\n", " plt.imshow(image)\n", " plt.show()" ] diff --git a/notebooks/text-classification/notebook.ipynb b/notebooks/text-classification/notebook.ipynb index b03ac1502..86e1feac1 100644 --- a/notebooks/text-classification/notebook.ipynb +++ b/notebooks/text-classification/notebook.ipynb @@ -118,8 +118,8 @@ "from random import randrange\n", "\n", "\n", - "random_id = randrange(len(raw_dataset['train']))\n", - "raw_dataset['train'][random_id]\n", + "random_id = randrange(len(raw_dataset[\"train\"]))\n", + "raw_dataset[\"train\"][random_id]\n", "# {'text': 'i feel isolated and alone in my trade', 'label': 0}" ] }, @@ -152,18 +152,20 @@ "# Load Tokenizer\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "\n", + "\n", "# Tokenize helper function\n", "def tokenize(batch):\n", - " return tokenizer(batch['text'], padding='max_length', truncation=True,return_tensors=\"pt\")\n", + " return tokenizer(batch[\"text\"], padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n", + "\n", "\n", "# Tokenize dataset\n", - "raw_dataset = raw_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n", + "raw_dataset = raw_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n", "tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n", "tokenized_dataset = tokenized_dataset.with_format(\"torch\")\n", "\n", "# save dataset to disk\n", - "tokenized_dataset[\"train\"].save_to_disk(os.path.join(save_dataset_path,\"train\"))\n", - "tokenized_dataset[\"test\"].save_to_disk(os.path.join(save_dataset_path,\"eval\"))" + "tokenized_dataset[\"train\"].save_to_disk(os.path.join(save_dataset_path, \"train\"))\n", + "tokenized_dataset[\"test\"].save_to_disk(os.path.join(save_dataset_path, \"eval\"))" ] }, { diff --git a/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb b/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb index f3ebf98fc..318da4ef5 100644 --- a/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb +++ b/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb @@ -97,8 +97,9 @@ "from optimum.neuron import pipeline\n", "\n", "\n", - "p = pipeline('text-generation', 'aws-neuron/CodeLlama-7b-hf-neuron-8xlarge')\n", - "p(\"import socket\\n\\ndef ping_exponential_backoff(host: str):\",\n", + "p = pipeline(\"text-generation\", \"aws-neuron/CodeLlama-7b-hf-neuron-8xlarge\")\n", + "p(\n", + " \"import socket\\n\\ndef ping_exponential_backoff(host: str):\",\n", " do_sample=True,\n", " top_k=10,\n", " temperature=0.1,\n", @@ -191,10 +192,12 @@ "from optimum.neuron import NeuronModelForCausalLM\n", "\n", "\n", - "#num_cores should be changed based on the instance. inf2.24xlarge has 6 neuron processors (they have two cores each) so 12 total\n", - "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": 'fp16'}\n", + "# num_cores should be changed based on the instance. inf2.24xlarge has 6 neuron processors (they have two cores each) so 12 total\n", + "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": \"fp16\"}\n", "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n", - "model = NeuronModelForCausalLM.from_pretrained(\"codellama/CodeLlama-7b-hf\", export=True, **compiler_args, **input_shapes)" + "model = NeuronModelForCausalLM.from_pretrained(\n", + " \"codellama/CodeLlama-7b-hf\", export=True, **compiler_args, **input_shapes\n", + ")" ] }, { @@ -214,7 +217,7 @@ "metadata": {}, "outputs": [], "source": [ - "model.save_pretrained(\"CodeLlama-7b-hf-neuron-8xlarge\")\n" + "model.save_pretrained(\"CodeLlama-7b-hf-neuron-8xlarge\")" ] }, { @@ -255,7 +258,7 @@ "from huggingface_hub.hf_api import HfFolder\n", "\n", "\n", - "HfFolder.save_token('MY_HUGGINGFACE_TOKEN_HERE')" + "HfFolder.save_token(\"MY_HUGGINGFACE_TOKEN_HERE\")" ] }, { diff --git a/notebooks/text-generation/llama2-13b-chatbot.ipynb b/notebooks/text-generation/llama2-13b-chatbot.ipynb index 788ee756f..38e7abdec 100644 --- a/notebooks/text-generation/llama2-13b-chatbot.ipynb +++ b/notebooks/text-generation/llama2-13b-chatbot.ipynb @@ -103,13 +103,11 @@ "from optimum.neuron import NeuronModelForCausalLM\n", "\n", "\n", - "compiler_args = {\"num_cores\": 24, \"auto_cast_type\": 'fp16'}\n", + "compiler_args = {\"num_cores\": 24, \"auto_cast_type\": \"fp16\"}\n", "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n", "model = NeuronModelForCausalLM.from_pretrained(\n", - " \"NousResearch/Llama-2-13b-chat-hf\",\n", - " export=True,\n", - " **compiler_args,\n", - " **input_shapes)" + " \"NousResearch/Llama-2-13b-chat-hf\", export=True, **compiler_args, **input_shapes\n", + ")" ] }, { @@ -177,7 +175,7 @@ "from huggingface_hub import whoami\n", "\n", "\n", - "org = whoami()['name']\n", + "org = whoami()[\"name\"]\n", "\n", "repo_id = f\"{org}/llama-2-13b-chat-neuron\"\n", "\n", @@ -245,7 +243,7 @@ " model\n", "except NameError:\n", " # Edit this to use another base model\n", - " model = NeuronModelForCausalLM.from_pretrained('aws-neuron/Llama-2-13b-chat-hf-neuron-latency')" + " model = NeuronModelForCausalLM.from_pretrained(\"aws-neuron/Llama-2-13b-chat-hf-neuron-latency\")" ] }, { @@ -290,12 +288,7 @@ "outputs": [], "source": [ "inputs = tokenizer(\"What is deep-learning ?\", return_tensors=\"pt\")\n", - "outputs = model.generate(**inputs,\n", - " max_new_tokens=128,\n", - " do_sample=True,\n", - " temperature=0.9,\n", - " top_k=50,\n", - " top_p=0.9)\n", + "outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.9, top_k=50, top_p=0.9)\n", "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, @@ -323,7 +316,7 @@ "outputs": [], "source": [ "def format_chat_prompt(message, history, max_tokens):\n", - " \"\"\" Convert a history of messages to a chat prompt\n", + " \"\"\"Convert a history of messages to a chat prompt\n", " Args:\n", " message(str): the new user message.\n", " history (List[str]): the list of user messages and assistant responses.\n", @@ -334,10 +327,10 @@ " chat = []\n", " # Convert all messages in history to chat interactions\n", " for interaction in history:\n", - " chat.append({\"role\": \"user\", \"content\" : interaction[0]})\n", - " chat.append({\"role\": \"assistant\", \"content\" : interaction[1]})\n", + " chat.append({\"role\": \"user\", \"content\": interaction[0]})\n", + " chat.append({\"role\": \"assistant\", \"content\": interaction[1]})\n", " # Add the new message\n", - " chat.append({\"role\": \"user\", \"content\" : message})\n", + " chat.append({\"role\": \"user\", \"content\": message})\n", " # Generate the prompt, verifying that we don't go beyond the maximum number of tokens\n", " for i in range(0, len(chat), 2):\n", " # Generate candidate prompt with the last n-i entries\n", @@ -372,19 +365,17 @@ "history = []\n", "max_tokens = 1024\n", "\n", + "\n", "def chat(message, history, max_tokens):\n", " prompt = format_chat_prompt(message, history, max_tokens)\n", " # Uncomment the line below to see what the formatted prompt looks like\n", - " #print(prompt)\n", + " # print(prompt)\n", " inputs = tokenizer(prompt, return_tensors=\"pt\")\n", - " outputs = model.generate(**inputs,\n", - " max_length=2048,\n", - " do_sample=True,\n", - " temperature=0.9,\n", - " top_k=50,\n", - " repetition_penalty=1.2)\n", + " outputs = model.generate(\n", + " **inputs, max_length=2048, do_sample=True, temperature=0.9, top_k=50, repetition_penalty=1.2\n", + " )\n", " # Do not include the input tokens\n", - " outputs = outputs[0, inputs.input_ids.size(-1):]\n", + " outputs = outputs[0, inputs.input_ids.size(-1) :]\n", " response = tokenizer.decode(outputs, skip_special_tokens=True)\n", " history.append([message, response])\n", " return response" diff --git a/notebooks/text-generation/llama2-7b-fine-tuning.ipynb b/notebooks/text-generation/llama2-7b-fine-tuning.ipynb index 08e41a58b..6456cacbf 100644 --- a/notebooks/text-generation/llama2-7b-fine-tuning.ipynb +++ b/notebooks/text-generation/llama2-7b-fine-tuning.ipynb @@ -238,7 +238,7 @@ "\n", "\n", "# Hugging Face model id\n", - "model_id = \"philschmid/Llama-2-7b-hf\" # ungated\n", + "model_id = \"philschmid/Llama-2-7b-hf\" # ungated\n", "# model_id = \"meta-llama/Llama-2-7b-hf\" # gated\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)" @@ -266,7 +266,7 @@ "from random import randint\n", "\n", "\n", - "sys.path.append(\"./scripts/utils\") # make sure you change this to the correct path\n", + "sys.path.append(\"./scripts/utils\") # make sure you change this to the correct path\n", "from pack_dataset import pack_dataset\n", "\n", "\n", @@ -275,18 +275,17 @@ " sample[\"text\"] = f\"{format_dolly(sample)}{tokenizer.eos_token}\"\n", " return sample\n", "\n", + "\n", "# apply prompt template per sample\n", "dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))\n", "# print random sample\n", "print(dataset[randint(0, len(dataset))][\"text\"])\n", "\n", "# tokenize dataset\n", - "dataset = dataset.map(\n", - " lambda sample: tokenizer(sample[\"text\"]), batched=True, remove_columns=list(dataset.features)\n", - ")\n", + "dataset = dataset.map(lambda sample: tokenizer(sample[\"text\"]), batched=True, remove_columns=list(dataset.features))\n", "\n", "# chunk dataset\n", - "lm_dataset = pack_dataset(dataset, chunk_length=2048) # We use 2048 as the maximum length for packing" + "lm_dataset = pack_dataset(dataset, chunk_length=2048) # We use 2048 as the maximum length for packing" ] }, { @@ -466,15 +465,11 @@ "from optimum.neuron import NeuronModelForCausalLM\n", "\n", "\n", - "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": 'fp16'}\n", + "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": \"fp16\"}\n", "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"dolly_llama\")\n", - "model = NeuronModelForCausalLM.from_pretrained(\n", - " \"dolly_llama\",\n", - " export=True,\n", - " **compiler_args,\n", - " **input_shapes)\n" + "model = NeuronModelForCausalLM.from_pretrained(\"dolly_llama\", export=True, **compiler_args, **input_shapes)" ] }, { @@ -519,13 +514,8 @@ "def generate(sample):\n", " prompt = format_dolly_infernece(sample)\n", " inputs = tokenizer(prompt, return_tensors=\"pt\")\n", - " outputs = model.generate(**inputs,\n", - " max_new_tokens=512,\n", - " do_sample=True,\n", - " temperature=0.9,\n", - " top_k=50,\n", - " top_p=0.9)\n", - " return tokenizer.decode(outputs[0], skip_special_tokens=False)[len(prompt):]" + " outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.9, top_k=50, top_p=0.9)\n", + " return tokenizer.decode(outputs[0], skip_special_tokens=False)[len(prompt) :]" ] }, { @@ -543,9 +533,7 @@ "metadata": {}, "outputs": [], "source": [ - "prompt = {\n", - " \"instruction\": \"Can you tell me something about AWS?\"\n", - "}\n", + "prompt = {\"instruction\": \"Can you tell me something about AWS?\"}\n", "res = generate(prompt)\n", "\n", "print(res)" @@ -572,8 +560,8 @@ "outputs": [], "source": [ "prompt = {\n", - " \"instruction\": \"How can train models on AWS Trainium?\",\n", - " \"context\": \"🤗 Optimum Neuron is the interface between the 🤗 Transformers library and AWS Accelerators including [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/?nc1=h_ls) and [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/?nc1=h_ls). It provides a set of tools enabling easy model loading, training and inference on single- and multi-Accelerator settings for different downstream tasks.\"\n", + " \"instruction\": \"How can train models on AWS Trainium?\",\n", + " \"context\": \"🤗 Optimum Neuron is the interface between the 🤗 Transformers library and AWS Accelerators including [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/?nc1=h_ls) and [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/?nc1=h_ls). It provides a set of tools enabling easy model loading, training and inference on single- and multi-Accelerator settings for different downstream tasks.\",\n", "}\n", "res = generate(prompt)\n", "\n", diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 3e2d00ef6..5a549a697 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -662,7 +662,7 @@ def main_export( ) logger.info( - f"The {NEURON_COMPILER} export succeeded and the exported model was saved at: " f"{output.as_posix()}" + f"The {NEURON_COMPILER} export succeeded and the exported model was saved at: {output.as_posix()}" ) except ShapeError as e: raise e @@ -678,8 +678,7 @@ def main_export( ) except Exception as e: logger.error( - f"An error occured with the error message: {e}.\n The exported model was saved at: " - f"{output.as_posix()}" + f"An error occured with the error message: {e}.\n The exported model was saved at: {output.as_posix()}" ) diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py index 82e842954..80958f70b 100644 --- a/optimum/exporters/neuron/config.py +++ b/optimum/exporters/neuron/config.py @@ -16,6 +16,7 @@ Common Neuron configuration classes that handle most of the features for building model specific configurations. """ + from typing import List from ...utils import ( diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index b9b1a79c5..a30a7ba47 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Neuron compiled model check and export functions.""" + import copy import time from collections import OrderedDict @@ -282,7 +283,7 @@ def validate_model_outputs( if shape_failures: msg = "\n".join(f"- {t[0]}: got {t[1]} (reference) and {t[2]} (neuron)" for t in shape_failures) - raise ShapeError("Output shapes do not match between reference model and the Neuron exported model:\n" "{msg}") + raise ShapeError("Output shapes do not match between reference model and the Neuron exported model:\n{msg}") if value_failures: msg = "\n".join(f"- {t[0]}: max diff = {t[1]}" for t in value_failures) diff --git a/optimum/exporters/neuron/model_configs/decoder_configs.py b/optimum/exporters/neuron/model_configs/decoder_configs.py index 30ddc808e..e2273610f 100644 --- a/optimum/exporters/neuron/model_configs/decoder_configs.py +++ b/optimum/exporters/neuron/model_configs/decoder_configs.py @@ -14,7 +14,6 @@ # limitations under the License. """Neuron export configurations for models using transformers_neuronx.""" - from optimum.exporters.tasks import TasksManager from ....neuron.models.granite.model import GraniteForSampling diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index f1fb2995e..2e1c15639 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -247,13 +247,13 @@ def forward(self, input_ids, attention_mask): batch_size = input_ids.shape[0] sequence_length = input_ids.shape[1] if self.sequence_length is not None: - assert ( - self.sequence_length - ), f"Different sequence length for the parallel partition({self.sequence_length}) and for dummy inputs({sequence_length}). Make sure that they have the same value." + assert self.sequence_length, ( + f"Different sequence length for the parallel partition({self.sequence_length}) and for dummy inputs({sequence_length}). Make sure that they have the same value." + ) if self.batch_size is not None: - assert ( - self.batch_size - ), f"Different batch size for the parallel partition({self.batch_size}) and for dummy inputs({batch_size}). Make sure that they have the same value." + assert self.batch_size, ( + f"Different batch size for the parallel partition({self.batch_size}) and for dummy inputs({batch_size}). Make sure that they have the same value." + ) encoder_output = self.model.encoder( input_ids=input_ids, attention_mask=attention_mask, output_attentions=False, output_hidden_states=False diff --git a/optimum/neuron/accelerate/utils/operations.py b/optimum/neuron/accelerate/utils/operations.py index 11345ca10..9e241dd6e 100644 --- a/optimum/neuron/accelerate/utils/operations.py +++ b/optimum/neuron/accelerate/utils/operations.py @@ -14,7 +14,6 @@ # limitations under the License. """Custom operations related to accelerate for Neuron.""" - import torch from accelerate.utils.operations import recursively_apply diff --git a/optimum/neuron/distributed/encoder_decoder_models.py b/optimum/neuron/distributed/encoder_decoder_models.py index 0af70494c..b86df32a8 100644 --- a/optimum/neuron/distributed/encoder_decoder_models.py +++ b/optimum/neuron/distributed/encoder_decoder_models.py @@ -268,7 +268,7 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): if past_key_value is not None: if len(past_key_value) != 2: raise ValueError( - f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" + f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states" ) real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length diff --git a/optimum/neuron/distributed/parallel_layers.py b/optimum/neuron/distributed/parallel_layers.py index bbba16c88..fd489e4eb 100644 --- a/optimum/neuron/distributed/parallel_layers.py +++ b/optimum/neuron/distributed/parallel_layers.py @@ -108,7 +108,7 @@ def prepare_parallel_layer_specific_kwargs(cls, **parallel_layer_specific_kwargs name for name in parallel_layer_specific_kwargs if name not in default_parallel_layer_specific_kwargs ] logger.debug( - f'The following arguments are not allowed for {cls.__name__}: {", ".join(wrong_argument_names)}, they ' + f"The following arguments are not allowed for {cls.__name__}: {', '.join(wrong_argument_names)}, they " "will be ignored." ) diff --git a/optimum/neuron/distributed/parallelizers_manager.py b/optimum/neuron/distributed/parallelizers_manager.py index 5f3cc2df3..a0a4fce89 100644 --- a/optimum/neuron/distributed/parallelizers_manager.py +++ b/optimum/neuron/distributed/parallelizers_manager.py @@ -28,7 +28,7 @@ def parallelizer_classes_resolver( - model_type_to_parallelizer_class_name: Dict[str, str] + model_type_to_parallelizer_class_name: Dict[str, str], ) -> Dict[str, Type[Parallelizer]]: modules = [] for module_name in _PARALLELIZER_CLASSES_MODULE_NAMES: diff --git a/optimum/neuron/modeling.py b/optimum/neuron/modeling.py index fa5681d26..7edbf1c61 100644 --- a/optimum/neuron/modeling.py +++ b/optimum/neuron/modeling.py @@ -179,9 +179,7 @@ def forward( # last_hidden_state -> (batch_size, sequencen_len, hidden_size) last_hidden_state = self.remove_padding( [outputs[0]], dims=[0, 1], indices=[input_ids.shape[0], input_ids.shape[1]] - )[ - 0 - ] # Remove padding on batch_size(0), and sequence_length(1) + )[0] # Remove padding on batch_size(0), and sequence_length(1) if len(outputs) > 1: # pooler_output -> (batch_size, hidden_size) pooler_output = self.remove_padding([outputs[1]], dims=[0], indices=[input_ids.shape[0]])[ @@ -264,9 +262,7 @@ def forward( # token_embeddings -> (batch_size, sequencen_len, hidden_size) token_embeddings = self.remove_padding( [outputs[0]], dims=[0, 1], indices=[input_ids.shape[0], input_ids.shape[1]] - )[ - 0 - ] # Remove padding on batch_size(0), and sequence_length(1) + )[0] # Remove padding on batch_size(0), and sequence_length(1) # sentence_embedding -> (batch_size, hidden_size) sentence_embedding = self.remove_padding([outputs[1]], dims=[0], indices=[input_ids.shape[0]])[ 0 diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 53f2df76e..944d76ebe 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -1183,7 +1183,9 @@ def forward( if output_hidden_states: assert ( self.config.output_hidden_states or self.config.neuron.get("output_hidden_states") - ) == output_hidden_states, "output_hidden_states is expected to be False since the model was compiled without hidden_states as output." + ) == output_hidden_states, ( + "output_hidden_states is expected to be False since the model was compiled without hidden_states as output." + ) input_ids = input_ids.to(torch.long) # dummy generator uses long int for tracing inputs = (input_ids,) diff --git a/optimum/neuron/models/granite/hlo.py b/optimum/neuron/models/granite/hlo.py index d66f12b8d..59330b438 100644 --- a/optimum/neuron/models/granite/hlo.py +++ b/optimum/neuron/models/granite/hlo.py @@ -35,7 +35,6 @@ def scale_mul(t, scale): class GraniteForSamplingNoEmbeddingHlo: - def __init__(self, config: GraniteConfig, neuron_config: Optional[NeuronConfig] = None): self.config = config self.neuron_config = neuron_config @@ -324,9 +323,9 @@ def layer( tp_degree=self.config.tp_degree, ) if self.neuron_config.fuse_mlp: - assert all( - (not (x) for x in [in0_weight, in1_weight, out_weight, in0_scales, in1_scales, out_scales]) - ), "in0, in1 and out weights have to be None" + assert all((not (x) for x in [in0_weight, in1_weight, out_weight, in0_scales, in1_scales, out_scales])), ( + "in0, in1 and out weights have to be None" + ) in0_weight, in0_scales = mlp_in_weight, mlp_in_scales out_weight, out_scales = mlp_out_weight, mlp_out_scales @@ -688,7 +687,6 @@ def attention( # Single Token Generation ("Prefetch"-style) ans speculative forward if active_mask is not None: - n_active_tokens = key.sizes[1] if bsh_cache_layout else key.sizes[0] if n_active_tokens > 1 and self.neuron_config and self.neuron_config.continuous_batching: # For speculative forward + continuous batching, slice out samples in the batch size diff --git a/optimum/neuron/models/granite/model.py b/optimum/neuron/models/granite/model.py index ddd3aecf2..7b706ef19 100644 --- a/optimum/neuron/models/granite/model.py +++ b/optimum/neuron/models/granite/model.py @@ -159,9 +159,9 @@ def load_weights(self): # Note: Automatic MLP padding is safe since zeros are *only* introduced to intermediary state if self.neuron_config.fuse_mlp: - assert all( - getattr(mlp, attr, None) for attr in ["gate_proj", "up_proj"] - ), "fuse_mlp need to have gate and up proj weights" + assert all(getattr(mlp, attr, None) for attr in ["gate_proj", "up_proj"]), ( + "fuse_mlp need to have gate and up proj weights" + ) assert all( getattr(mlp, attr, None).weight.shape[0] % self.config.tp_degree == 0 for attr in ["gate_proj", "up_proj"] diff --git a/optimum/neuron/models/granite/modules.py b/optimum/neuron/models/granite/modules.py index 4cbbcc9f3..84b36a38d 100644 --- a/optimum/neuron/models/granite/modules.py +++ b/optimum/neuron/models/granite/modules.py @@ -18,7 +18,6 @@ class GraniteForCausalLM(module.PretrainedModel): - def __init__(self, config: GraniteConfig): super().__init__() dtype, _, _ = utils.parse_amp(config.amp) @@ -34,7 +33,6 @@ def get_base_model(self): class GraniteModel(module.LowMemoryModule): - def __init__(self, config: GraniteConfig): super().__init__() self.embed_tokens = module.LowMemoryEmbedding(config.vocab_size, config.hidden_size) @@ -45,14 +43,12 @@ def __init__(self, config: GraniteConfig): class GraniteRMSNorm(module.LowMemoryModule): - def __init__(self, config: GraniteConfig) -> None: super().__init__() self.weight = module.UninitializedParameter() class GraniteDecoderLayer(module.LowMemoryModule): - def __init__(self, config: GraniteConfig): super().__init__() self.self_attn = GraniteAttention(config) @@ -62,7 +58,6 @@ def __init__(self, config: GraniteConfig): class GraniteAttention(module.LowMemoryModule): - def __init__(self, config: GraniteConfig): super().__init__() self.hidden_size = config.hidden_size @@ -77,7 +72,6 @@ def __init__(self, config: GraniteConfig): class GraniteMLP(module.LowMemoryModule): - def __init__(self, config: GraniteConfig): super().__init__() dtype, _, _ = utils.parse_amp(config.amp) diff --git a/optimum/neuron/models/qwen2/model.py b/optimum/neuron/models/qwen2/model.py index 8ee60d9b4..8396a8fba 100644 --- a/optimum/neuron/models/qwen2/model.py +++ b/optimum/neuron/models/qwen2/model.py @@ -156,9 +156,9 @@ def load_weights(self): # Note: Automatic MLP padding is safe since zeros are *only* introduced to intermediary state if self.neuron_config.fuse_mlp: - assert all( - getattr(mlp, attr, None) for attr in ["gate_proj", "up_proj"] - ), "fuse_mlp need to have gate and up proj weights" + assert all(getattr(mlp, attr, None) for attr in ["gate_proj", "up_proj"]), ( + "fuse_mlp need to have gate and up proj weights" + ) assert all( getattr(mlp, attr, None).weight.shape[0] % self.config.tp_degree == 0 for attr in ["gate_proj", "up_proj"] diff --git a/optimum/neuron/utils/cache_utils.py b/optimum/neuron/utils/cache_utils.py index 28845713c..84a760988 100644 --- a/optimum/neuron/utils/cache_utils.py +++ b/optimum/neuron/utils/cache_utils.py @@ -62,7 +62,7 @@ def load_custom_cache_repo_name_from_hf_home( - hf_home_cache_repo_file: Union[str, Path] = HF_HOME_CACHE_REPO_FILE + hf_home_cache_repo_file: Union[str, Path] = HF_HOME_CACHE_REPO_FILE, ) -> Optional[str]: if Path(hf_home_cache_repo_file).exists(): with open(hf_home_cache_repo_file, "r") as fp: diff --git a/optimum/neuron/utils/peft_utils.py b/optimum/neuron/utils/peft_utils.py index 7780ff7ed..4866669ac 100644 --- a/optimum/neuron/utils/peft_utils.py +++ b/optimum/neuron/utils/peft_utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Utilities related to the PEFT library and support.""" + import collections import functools import os diff --git a/optimum/neuron/utils/version_utils.py b/optimum/neuron/utils/version_utils.py index 818e2bc1e..368c4d186 100644 --- a/optimum/neuron/utils/version_utils.py +++ b/optimum/neuron/utils/version_utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Version utilities.""" + from typing import Optional from packaging import version diff --git a/tests/cli/test_neuron_cache_cli.py b/tests/cli/test_neuron_cache_cli.py index 0d7887445..c8b6da574 100644 --- a/tests/cli/test_neuron_cache_cli.py +++ b/tests/cli/test_neuron_cache_cli.py @@ -50,18 +50,18 @@ def _optimum_neuron_cache_create(self, cache_repo_id: Optional[str] = None, publ try: repo_id = cache_repo_id if cache_repo_id is not None else CACHE_REPO_NAME info = HfApi().repo_info(repo_id, repo_type="model") - assert info.private == ( - not public - ), "The privacy of the repo should match the presence of the --public flag." + assert info.private == (not public), ( + "The privacy of the repo should match the presence of the --public flag." + ) except RepositoryNotFoundError: pytest.fail("The repo was not created.") finally: delete_repo(repo_id) - assert ( - repo_id == load_custom_cache_repo_name_from_hf_home() - ), f"Saved local Neuron cache name should be equal to {repo_id}." + assert repo_id == load_custom_cache_repo_name_from_hf_home(), ( + f"Saved local Neuron cache name should be equal to {repo_id}." + ) def test_optimum_neuron_cache_create_with_custom_name(self, hub_test): seed = random.randint(0, 100) @@ -79,9 +79,9 @@ def test_optimum_neuron_cache_set(self, hub_test): p = subprocess.Popen(command) returncode = p.wait() assert returncode == 0 - assert ( - repo_id == load_custom_cache_repo_name_from_hf_home() - ), f"Saved local Neuron cache name should be equal to {repo_id}." + assert repo_id == load_custom_cache_repo_name_from_hf_home(), ( + f"Saved local Neuron cache name should be equal to {repo_id}." + ) def test_optimum_neuron_cache_add(self, hub_test): with TemporaryDirectory() as tmpdir: diff --git a/tests/test_generate.py b/tests/test_generate.py index 706e3538b..a4b054240 100644 --- a/tests/test_generate.py +++ b/tests/test_generate.py @@ -104,12 +104,12 @@ def test_greedy_decoding(self, model_name, use_cache, decoder_only, compiler_fla cpu_samples = _test_generative_decoding(model_name=model_name, device="cpu", decoder_only=decoder_only) - assert np.array_equal( - cpu_samples, xla_neuron_samples_fp32 - ), "XLA Neuron FP32 output doesn't match CPU only output" - assert np.array_equal( - cpu_samples, xla_neuron_samples_bf16 - ), "XLA Neuron bf16 output doesn't match CPU only output" + assert np.array_equal(cpu_samples, xla_neuron_samples_fp32), ( + "XLA Neuron FP32 output doesn't match CPU only output" + ) + assert np.array_equal(cpu_samples, xla_neuron_samples_bf16), ( + "XLA Neuron bf16 output doesn't match CPU only output" + ) @parameterized.expand(BEAM_SEARCH_TESTDATA) @pytest.mark.skip("Remove once generate fix (#262) has been merged.") @@ -130,9 +130,9 @@ def test_beam_search_decoding(self, model_name, use_cache, decoder_only, compile model_name=model_name, device="cpu", decoder_only=decoder_only, generation_config_update=config_update ) - assert np.array_equal( - cpu_samples, xla_neuron_samples_fp32 - ), "XLA Neuron FP32 output doesn't match CPU only output" - assert np.array_equal( - cpu_samples, xla_neuron_samples_bf16 - ), "XLA Neuron bf16 output doesn't match CPU only output" + assert np.array_equal(cpu_samples, xla_neuron_samples_fp32), ( + "XLA Neuron FP32 output doesn't match CPU only output" + ) + assert np.array_equal(cpu_samples, xla_neuron_samples_bf16), ( + "XLA Neuron bf16 output doesn't match CPU only output" + ) diff --git a/tests/test_trainers.py b/tests/test_trainers.py index 17f79248c..0d1650362 100644 --- a/tests/test_trainers.py +++ b/tests/test_trainers.py @@ -267,12 +267,12 @@ def test_train_and_eval_use_remote_cache(self, hub_test_with_local_cache, tmpdir # TODO: investigate that, not urgent. assert files_in_repo == last_files_in_repo, "No file should have been added to the Hub after first training." - assert ( - files_in_cache == last_files_in_cache - ), "No file should have been added to the cache after first training." - assert ( - second_training_duration < first_training_duration - ), "Second training should be faster because cached graphs can be used." + assert files_in_cache == last_files_in_cache, ( + "No file should have been added to the cache after first training." + ) + assert second_training_duration < first_training_duration, ( + "Second training should be faster because cached graphs can be used." + ) @pytest.mark.skip("Test in later release") def test_save_and_resume_from_checkpoint(self, parallel_sizes, tmpdir): diff --git a/tests/test_utils.py b/tests/test_utils.py index 2b84d7a52..4614f4944 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -53,9 +53,9 @@ def test_patch_model(): pass wav2vec2_model = Wav2Vec2Model(Wav2Vec2Config()) - assert ( - wav2vec2_model.config.layerdrop > 0 - ), "Default Wav2vec2Config layerdrop value is already 0 so the test will not check anything." + assert wav2vec2_model.config.layerdrop > 0, ( + "Default Wav2vec2Config layerdrop value is already 0 so the test will not check anything." + ) patching_specs = [] for spec in MODEL_PATCHING_SPECS: patching_specs.append((wav2vec2_model,) + spec) diff --git a/text-generation-inference/server/text_generation_server/model.py b/text-generation-inference/server/text_generation_server/model.py index c4a692c95..e8cb34ee1 100644 --- a/text-generation-inference/server/text_generation_server/model.py +++ b/text-generation-inference/server/text_generation_server/model.py @@ -56,7 +56,7 @@ def log_cache_size(): if os.path.exists(path): usage = shutil.disk_usage(path) gb = 2**30 - logger.info(f"Cache disk [{path}]: total = {usage.total/gb:.2f} G, free = {usage.free/gb:.2f} G") + logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G") else: raise ValueError(f"The cache directory ({path}) does not exist.") diff --git a/text-generation-inference/tgi_env.py b/text-generation-inference/tgi_env.py index 6855b468a..ff647c988 100755 --- a/text-generation-inference/tgi_env.py +++ b/text-generation-inference/tgi_env.py @@ -50,7 +50,7 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace: args = parser.parse_known_args(argv)[0] if not args.model_id: - raise Exception("No model id provided ! Either specify it using --model-id cmdline " "or MODEL_ID env var") + raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var") # Override env with cmdline params os.environ["MODEL_ID"] = args.model_id @@ -109,7 +109,7 @@ def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Op if not all_compatible: logger.debug( - "No compatible cached entry found for model %s, env %s, available cores %s, " "neuronxcc version %s", + "No compatible cached entry found for model %s, env %s, available cores %s, neuronxcc version %s", model_id, get_env_dict(), available_cores, @@ -139,7 +139,7 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version: logger.debug( - "Compiler version conflict, the local one " "(%s) differs from the one used to compile the model (%s)", + "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)", neuronxcc_version, neuron_config["compiler_version"], ) @@ -163,7 +163,7 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che sequence_length = neuron_config["sequence_length"] if max_input_tokens >= sequence_length: logger.debug( - "Specified max input tokens is not compatible with config sequence length " "( %s >= %s)", + "Specified max input tokens is not compatible with config sequence length ( %s >= %s)", max_input_tokens, sequence_length, ) @@ -205,7 +205,7 @@ def main(): if not compatible: env_dict = get_env_dict() msg = ( - "Invalid neuron config and env. Config {}, env {}, available cores {}, " "neuronxcc version {}" + "Invalid neuron config and env. Config {}, env {}, available cores {}, neuronxcc version {}" ).format(neuron_config, env_dict, available_cores, neuronxcc_version) logger.error(msg) raise Exception(msg) @@ -213,9 +213,9 @@ def main(): neuron_config = lookup_compatible_cached_model(args.model_id, args.revision) if not neuron_config: - msg = ( - "No compatible neuron config found. Provided env {}, " "available cores {}, neuronxcc version {}" - ).format(get_env_dict(), available_cores, neuronxcc_version) + msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format( + get_env_dict(), available_cores, neuronxcc_version + ) logger.error(msg) raise Exception(msg) diff --git a/tools/auto_fill_inference_cache.py b/tools/auto_fill_inference_cache.py index faa1e6e2d..e1772d784 100644 --- a/tools/auto_fill_inference_cache.py +++ b/tools/auto_fill_inference_cache.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Script to cache models for inference.""" + import argparse import json import logging