From 21e5b85e119931c3819775dbe133ab7199ded1cc Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Sat, 10 Feb 2024 20:44:14 -0800 Subject: [PATCH 01/19] phi2 changes for azure --- ...tion.json => dataset-classification.jsonl} | 0 configs/phi-2/finetuning/aml_config.json | 6 + .../docker-contexts/wais_phi2_env/Dockerfile | 31 +++++ .../wais_phi2_env/requirements.txt | 34 +++++ configs/phi-2/finetuning/invoke_olive.py | 98 ++++++++++++++- .../olive-config-azure-template.json | 118 ++++++++++++++++++ configs/phi-2/finetuning/utils.py | 68 ++++++++++ configs/phi-2/setup/requirements.txt | 23 +++- 8 files changed, 370 insertions(+), 8 deletions(-) rename configs/phi-2/dataset/{dataset-classification.json => dataset-classification.jsonl} (100%) create mode 100644 configs/phi-2/finetuning/aml_config.json create mode 100644 configs/phi-2/finetuning/docker-contexts/wais_phi2_env/Dockerfile create mode 100644 configs/phi-2/finetuning/docker-contexts/wais_phi2_env/requirements.txt create mode 100644 configs/phi-2/finetuning/olive-config-azure-template.json create mode 100644 configs/phi-2/finetuning/utils.py diff --git a/configs/phi-2/dataset/dataset-classification.json b/configs/phi-2/dataset/dataset-classification.jsonl similarity index 100% rename from configs/phi-2/dataset/dataset-classification.json rename to configs/phi-2/dataset/dataset-classification.jsonl diff --git a/configs/phi-2/finetuning/aml_config.json b/configs/phi-2/finetuning/aml_config.json new file mode 100644 index 0000000..a81085a --- /dev/null +++ b/configs/phi-2/finetuning/aml_config.json @@ -0,0 +1,6 @@ +{ + "subscription_id": "", + "resource_group": "", + "workspace_name": "", + "aml_compute_name": "", +} \ No newline at end of file diff --git a/configs/phi-2/finetuning/docker-contexts/wais_phi2_env/Dockerfile b/configs/phi-2/finetuning/docker-contexts/wais_phi2_env/Dockerfile new file mode 100644 index 0000000..29c2bd3 --- /dev/null +++ b/configs/phi-2/finetuning/docker-contexts/wais_phi2_env/Dockerfile @@ -0,0 +1,31 @@ +FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:20240109.v1 + +ENV DEBIAN_FRONTEND=noninteractive +ENV ACCEPT_EULA=Y + +# Build python3.9 as the default python +RUN apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y --no-install-recommends \ + software-properties-common gnupg\ + && add-apt-repository -y ppa:deadsnakes \ + && apt-get install -y --no-install-recommends \ + python3.9-venv \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* && \ + python3.9 -m venv /venv + +ENV PATH=/venv/bin:$PATH + +# Install TensorRT +RUN v="8.4.1-1+cuda11.6" &&\ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ + apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ + libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ + python3-libnvinfer=${v} libnvinfer-samples=${v} + +RUN pip install --upgrade pip && pip install --upgrade setuptools +COPY requirements.txt . +RUN pip install -r requirements.txt --no-cache-dir diff --git a/configs/phi-2/finetuning/docker-contexts/wais_phi2_env/requirements.txt b/configs/phi-2/finetuning/docker-contexts/wais_phi2_env/requirements.txt new file mode 100644 index 0000000..4cd3939 --- /dev/null +++ b/configs/phi-2/finetuning/docker-contexts/wais_phi2_env/requirements.txt @@ -0,0 +1,34 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +torch +torchvision +torchaudio +packaging +datasets==2.14.5 +transformers==4.36.2 +accelerate==0.23.0 +bitsandbytes==0.41.1 +peft==0.5.0 +scikit-learn==1.3.1 +sentencepiece==0.1.99 +trl==0.7.2 +protobuf +ipykernel==6.25.2 +wandb==0.15.12 +einops +promptflow==0.1.0b8 +promptflow-tools==0.1.0b10 +gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf diff --git a/configs/phi-2/finetuning/invoke_olive.py b/configs/phi-2/finetuning/invoke_olive.py index 261b160..e653397 100644 --- a/configs/phi-2/finetuning/invoke_olive.py +++ b/configs/phi-2/finetuning/invoke_olive.py @@ -1,6 +1,98 @@ +import argparse +import sys +import json +from utils import get_aml_client, create_dataset +from azure.ai.ml.entities import Environment, BuildContext +import re + from olive.workflows import run as olive_run import os +import argparse +import sys + +def parse_aml_config(aml_config): + """Parse the AML config to make sure the required fields are present""" + with open(aml_config, 'r') as file: + aml_config = json.load(file) + + try: + subscription_id = aml_config["subscription_id"] + resource_group = aml_config["resource_group"] + workspace_name = aml_config["workspace_name"] + aml_compute_name = aml_config["aml_compute_name"] + except KeyError as e: + print(f"KeyError: {e} not found in aml_config.json") + sys.exit(1) + + return aml_config + + +def main(): + """Main function of the script.""" + + # input and output arguments + parser = argparse.ArgumentParser() + parser.add_argument("--azure", required=False, action="store_true", help="runs the training on azure when this option is enabled") + parser.add_argument("--aml_config", type=str, required='--azure' in sys.argv, help="aml config (update and use aml_config.json) for azure subscription/workspace details") + + args = parser.parse_args() + + # Run olive from file locally + if not args.azure: + file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') + olive_run(file_path) + else: + dataset_local_path = "dataset/dataset-classification.jsonl" + dataset_name = "phi2_train_dataset" + dataset_version = "1" + docker_context_path = "finetuning/docker-contexts/wais_phi2_env" + azure_olive_config_template_path = "finetuning/olive-config-azureml_template.json" + azure_olive_config_path = "finetuning/olive-config-azureml.json" + azure_environment_name = "wais_phi2_env" + + aml_config = parse_aml_config(args.aml_config) + # Get the AML client + ml_client = get_aml_client(aml_config["subscription_id"], aml_config["resource_group"], aml_config["workspace_name"]) + + # Create the environment + print("Creating the environment...") + env_docker_context = Environment( + build=BuildContext(path=docker_context_path), # Path to the Docker context + name=azure_environment_name, + description="Environment created from a Docker context for training phi2 model using Olive.", + ) + aml_env = ml_client.environments.create_or_update(env_docker_context) + print("The environment {} was created successfully.".format(aml_env.name)) + + # Create the dataset + print("Creating the dataset...") + + description = "Train dataset for tone classification model." + dataset = create_dataset(ml_client, local_path=dataset_local_path, name=dataset_name, version=dataset_version, description=description) + print("The dataset {} was created successfully.".format(dataset.name)) + dataset_relative_path = re.split("/datastores/.*/paths/", dataset.path)[-1] + + # Update the olive-config-azureml.json + with open(azure_olive_config_template_path, 'r') as file: + olive_config = json.load(file) + try: + olive_config["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["relative_path"] = dataset_relative_path + olive_config["systems"]["aml_system"]["config"]["aml_compute"] = aml_config["aml_compute_name"] + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["name"] = aml_env.name + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["version"] = aml_env.version + except KeyError as e: + print(f"KeyError: {e} not found in olive-config-azureml.json") + sys.exit(1) + + with open(azure_olive_config_path, 'w') as file: + json.dump(olive_config, file, indent=4) + + # Run olive from file for debug. + file_path = os.path.join(os.getcwd(), azure_olive_config_path) + olive_run(file_path) + -# Run olive from file for debug. -file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') -olive_run(file_path) \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/configs/phi-2/finetuning/olive-config-azure-template.json b/configs/phi-2/finetuning/olive-config-azure-template.json new file mode 100644 index 0000000..b695a22 --- /dev/null +++ b/configs/phi-2/finetuning/olive-config-azure-template.json @@ -0,0 +1,118 @@ +{ + "azureml_client": { + "aml_config_path": "", + "read_timeout": 4000, + "max_operation_retries": 4, + "operation_retry_interval": 5 + }, + "systems": { + "local_system": { + "type": "LocalSystem" + }, + "aml_system": { + "type": "AzureML", + "config": { + "aml_compute": "", + "accelerators": [ + "gpu" + ], + "aml_environment_config": { + "name": "", + "version": "" + } + } + } + }, + "input_model": { + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "microsoft/phi-2", + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } + } + } + }, + "data_configs": { + "dataset_default_train": { + "name": "dataset_default", + "type": "HuggingfaceContainer", + "params_config": { +#data_configs_data_files_extension_start + +#data_configs_data_files_extension_end + "": + { + "type": "azureml_datastore", + "config": { + "azureml_client": + { + "aml_config_path": "" + }, + "datastore_name": "workspaceblobstore", + "relative_path": "" + } + }, + "split": "", + "component_kwargs": { + "pre_process_data": { + "dataset_type": "", + "text_cols": , + "text_template": "", + "corpus_strategy": "", + "source_max_len": , + "pad_to_max_len": , + "use_attention_mask": false + } + } + } + } + }, + "passes": { + "qlora": { + "type": "QLoRA", + "config": { + "compute_dtype": "", + "quant_type": "", + "double_quant": , + "lora_r": , + "lora_alpha": , + "lora_dropout": , + "train_data_config": "dataset-default_train", + "eval_dataset_size": , + "training_args": { + "seed": , + "data_seed": , + "per_device_train_batch_size": , + "per_device_eval_batch_size": , + "gradient_accumulation_steps": , + "gradient_checkpointing": , + "learning_rate": , + "num_train_epochs":, + "max_steps": , + "logging_steps": 10, + "evaluation_strategy": "steps", + "eval_steps": 187, + "group_by_length": true, + "adam_beta2": 0.999, + "max_grad_norm": 0.3, + "output_dir": "outputs/" + } + } + } + }, + "engine": { + "log_severity_level": 0, + "search_strategy": false, + "evaluate_input_model": false, + "execution_providers": [ + "CUDAExecutionProvider" + ], + "host": "aml_system", + "cache_dir": "cache", + "output_dir": "outputs/models/qlora", + "target": "aml_system" + } +} diff --git a/configs/phi-2/finetuning/utils.py b/configs/phi-2/finetuning/utils.py new file mode 100644 index 0000000..c36753e --- /dev/null +++ b/configs/phi-2/finetuning/utils.py @@ -0,0 +1,68 @@ +# Handle to the workspace +from azure.ai.ml import MLClient + +# Authentication package +from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential +from azure.ai.ml.entities import Data +from azure.ai.ml.constants import AssetTypes + +from azure.ai.ml import command +from azure.ai.ml import Input, Output + + +def get_aml_client(subscription_id, resource_group_name, workspace_name): + """ + Get an Azure Machine Learning client instance. + + Args: + subscription_id (str): The Azure subscription ID. + resource_group_name (str): The name of the resource group. + workspace_name (str): The name of the Azure Machine Learning workspace. + + Returns: + MLClient: An instance of the Azure Machine Learning client. + """ + credential = DefaultAzureCredential(exclude_interactive_browser_credential=False) + + # Create and return MLClient instance + return MLClient(credential, + subscription_id=subscription_id, + resource_group_name=resource_group_name, + workspace_name=workspace_name + ) + + +def create_dataset(ml_client, local_path, name, version, description=""): + """ + Creates a data asset using the specified ML client, local path, name, version, and optional description. + + Args: + ml_client (MLClient): The ML client used to interact with the ML service. + local_path (str): The local path of the data asset. + name (str): The name of the data asset. + version (str): The version of the data asset. + description (str, optional): The description of the data asset. Defaults to "". + + Returns: + Dataset (Dataset): Registered dataset with the given name and version. + """ + + my_data = Data( + name=name, + version=version, + description=description, + path=local_path, + type=AssetTypes.URI_FILE, + ) + + ## create data asset if it doesn't already exist: + try: + dataset = ml_client.data.get(name=name, version=version) + print( + f"Data asset already exists. Name: {dataset.name}, version: {dataset.version}" + ) + except: + dataset = ml_client.data.create_or_update(my_data) + print(f"Data asset created. Name: {dataset.name}, version: {dataset.version}") + + return dataset \ No newline at end of file diff --git a/configs/phi-2/setup/requirements.txt b/configs/phi-2/setup/requirements.txt index a55d10c..4cd3939 100644 --- a/configs/phi-2/setup/requirements.txt +++ b/configs/phi-2/setup/requirements.txt @@ -2,20 +2,33 @@ torch torchvision torchaudio -transformers==4.34.1 +packaging +datasets==2.14.5 +transformers==4.36.2 accelerate==0.23.0 bitsandbytes==0.41.1 -datasets==2.14.5 peft==0.5.0 scikit-learn==1.3.1 sentencepiece==0.1.99 trl==0.7.2 -protobuf==3.20.3 +protobuf ipykernel==6.25.2 wandb==0.15.12 -onnxruntime-gpu==1.16.1 einops -olive-ai==0.3.3 promptflow==0.1.0b8 promptflow-tools==0.1.0b10 gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf From c7e07a2446a2fccd58519ab9bc4e18c82097c9a6 Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Sat, 10 Feb 2024 21:06:59 -0800 Subject: [PATCH 02/19] Fixing template path --- configs/phi-2/.vscode/launch.json | 2 +- configs/phi-2/finetuning/invoke_olive.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/phi-2/.vscode/launch.json b/configs/phi-2/.vscode/launch.json index 2b2502c..74565fe 100644 --- a/configs/phi-2/.vscode/launch.json +++ b/configs/phi-2/.vscode/launch.json @@ -6,7 +6,7 @@ "configurations": [ { "name": "Python: Current File", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", diff --git a/configs/phi-2/finetuning/invoke_olive.py b/configs/phi-2/finetuning/invoke_olive.py index e653397..40b629c 100644 --- a/configs/phi-2/finetuning/invoke_olive.py +++ b/configs/phi-2/finetuning/invoke_olive.py @@ -46,7 +46,7 @@ def main(): dataset_name = "phi2_train_dataset" dataset_version = "1" docker_context_path = "finetuning/docker-contexts/wais_phi2_env" - azure_olive_config_template_path = "finetuning/olive-config-azureml_template.json" + azure_olive_config_template_path = "finetuning/olive-config-azureml-template.json" azure_olive_config_path = "finetuning/olive-config-azureml.json" azure_environment_name = "wais_phi2_env" From bc8f6cfda281764ea3c3d610f8a2ccad191c1b11 Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 00:20:25 -0800 Subject: [PATCH 03/19] Changing extension from json to jsonl in project-settings and fixing azure template --- configs/phi-2/finetuning/olive-config-azure-template.json | 5 ++--- configs/phi-2/setup/project-settings.json | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/configs/phi-2/finetuning/olive-config-azure-template.json b/configs/phi-2/finetuning/olive-config-azure-template.json index b695a22..74f57c6 100644 --- a/configs/phi-2/finetuning/olive-config-azure-template.json +++ b/configs/phi-2/finetuning/olive-config-azure-template.json @@ -30,7 +30,7 @@ "model_name": "microsoft/phi-2", "task": "text-generation", "from_pretrained_args": { - "trust_remote_code": true + "trust_remote_code": true } } } @@ -43,8 +43,7 @@ #data_configs_data_files_extension_start #data_configs_data_files_extension_end - "": - { + "data_files": { "type": "azureml_datastore", "config": { "azureml_client": diff --git a/configs/phi-2/setup/project-settings.json b/configs/phi-2/setup/project-settings.json index 7bf3b99..214777a 100644 --- a/configs/phi-2/setup/project-settings.json +++ b/configs/phi-2/setup/project-settings.json @@ -29,9 +29,9 @@ "info": "Dataset to train the model from a local file.", "replaceToken": "", "optionValues": [ - "dataset/dataset-classification.json" + "dataset/dataset-classification.jsonl" ], - "defaultValue": "dataset/dataset-classification.json" + "defaultValue": "dataset/dataset-classification.jsonl" }, { "type": "String", From 74f027ba218b7d3437c5684706027f00853ff72e Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 01:02:09 -0800 Subject: [PATCH 04/19] Fixing train_data_config path --- configs/phi-2/finetuning/olive-config-azure-template.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/phi-2/finetuning/olive-config-azure-template.json b/configs/phi-2/finetuning/olive-config-azure-template.json index 74f57c6..51cdcfa 100644 --- a/configs/phi-2/finetuning/olive-config-azure-template.json +++ b/configs/phi-2/finetuning/olive-config-azure-template.json @@ -79,7 +79,7 @@ "lora_r": , "lora_alpha": , "lora_dropout": , - "train_data_config": "dataset-default_train", + "train_data_config": "dataset_default_train", "eval_dataset_size": , "training_args": { "seed": , From fad56717858522c9ca22f024c6291dc450869a5a Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 01:05:57 -0800 Subject: [PATCH 05/19] Changing dataset-default to dataset_default --- configs/llama-v2-7b/finetuning/olive-config.json | 6 +++--- configs/mistral-7b/finetuning/olive-config.json | 6 +++--- configs/phi-1_5/finetuning/olive-config.json | 6 +++--- configs/phi-2/finetuning/olive-config.json | 6 +++--- configs/zephyr-7b-beta/finetuning/olive-config.json | 6 +++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/configs/llama-v2-7b/finetuning/olive-config.json b/configs/llama-v2-7b/finetuning/olive-config.json index 6e9cfcd..9461fec 100644 --- a/configs/llama-v2-7b/finetuning/olive-config.json +++ b/configs/llama-v2-7b/finetuning/olive-config.json @@ -9,8 +9,8 @@ } }, "data_configs": { - "dataset-default_train": { - "name": "dataset-default", + "dataset_default_train": { + "name": "dataset_default", "type": "HuggingfaceContainer", "params_config": { #data_configs_data_files_extension_start @@ -42,7 +42,7 @@ "lora_r": , "lora_alpha": , "lora_dropout": , - "train_data_config": "dataset-default_train", + "train_data_config": "dataset_default_train", "eval_dataset_size": , "training_args": { "seed": , diff --git a/configs/mistral-7b/finetuning/olive-config.json b/configs/mistral-7b/finetuning/olive-config.json index 36ca04d..b40b829 100644 --- a/configs/mistral-7b/finetuning/olive-config.json +++ b/configs/mistral-7b/finetuning/olive-config.json @@ -9,8 +9,8 @@ } }, "data_configs": { - "dataset-default_train": { - "name": "dataset-default", + "dataset_default_train": { + "name": "dataset_default", "type": "HuggingfaceContainer", "params_config": { #data_configs_data_files_extension_start @@ -42,7 +42,7 @@ "lora_r": , "lora_alpha": , "lora_dropout": , - "train_data_config": "dataset-default_train", + "train_data_config": "dataset_default_train", "eval_dataset_size": , "training_args": { "seed": , diff --git a/configs/phi-1_5/finetuning/olive-config.json b/configs/phi-1_5/finetuning/olive-config.json index e55ab88..1f6cc81 100644 --- a/configs/phi-1_5/finetuning/olive-config.json +++ b/configs/phi-1_5/finetuning/olive-config.json @@ -12,8 +12,8 @@ } }, "data_configs": { - "dataset-default_train": { - "name": "dataset-default", + "dataset_default_train": { + "name": "dataset_default", "type": "HuggingfaceContainer", "params_config": { #data_configs_data_files_extension_start @@ -45,7 +45,7 @@ "lora_r": , "lora_alpha": , "lora_dropout": , - "train_data_config": "dataset-default_train", + "train_data_config": "dataset_default_train", "eval_dataset_size": , "training_args": { "seed": , diff --git a/configs/phi-2/finetuning/olive-config.json b/configs/phi-2/finetuning/olive-config.json index 1775036..911ef58 100644 --- a/configs/phi-2/finetuning/olive-config.json +++ b/configs/phi-2/finetuning/olive-config.json @@ -12,8 +12,8 @@ } }, "data_configs": { - "dataset-default_train": { - "name": "dataset-default", + "dataset_default_train": { + "name": "dataset_default", "type": "HuggingfaceContainer", "params_config": { #data_configs_data_files_extension_start @@ -45,7 +45,7 @@ "lora_r": , "lora_alpha": , "lora_dropout": , - "train_data_config": "dataset-default_train", + "train_data_config": "dataset_default_train", "eval_dataset_size": , "training_args": { "seed": , diff --git a/configs/zephyr-7b-beta/finetuning/olive-config.json b/configs/zephyr-7b-beta/finetuning/olive-config.json index eac516d..ded390f 100644 --- a/configs/zephyr-7b-beta/finetuning/olive-config.json +++ b/configs/zephyr-7b-beta/finetuning/olive-config.json @@ -9,8 +9,8 @@ } }, "data_configs": { - "dataset-default_train": { - "name": "dataset-default", + "dataset_default_train": { + "name": "dataset_default", "type": "HuggingfaceContainer", "params_config": { #data_configs_data_files_extension_start @@ -42,7 +42,7 @@ "lora_r": , "lora_alpha": , "lora_dropout": , - "train_data_config": "dataset-default_train", + "train_data_config": "dataset_default_train", "eval_dataset_size": , "training_args": { "seed": , From af1b71ee37d44276f84d7ef823a360ccc765c39f Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 01:42:55 -0800 Subject: [PATCH 06/19] Fixing aml_config.json --- configs/phi-2/finetuning/aml_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/phi-2/finetuning/aml_config.json b/configs/phi-2/finetuning/aml_config.json index a81085a..9bc55de 100644 --- a/configs/phi-2/finetuning/aml_config.json +++ b/configs/phi-2/finetuning/aml_config.json @@ -2,5 +2,5 @@ "subscription_id": "", "resource_group": "", "workspace_name": "", - "aml_compute_name": "", + "aml_compute_name": "" } \ No newline at end of file From b6aa38a7531642a627cf4525373844397db089fd Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 02:02:09 -0800 Subject: [PATCH 07/19] Renaming template file --- ...fig-azure-template.json => olive-config-azureml-template.json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename configs/phi-2/finetuning/{olive-config-azure-template.json => olive-config-azureml-template.json} (100%) diff --git a/configs/phi-2/finetuning/olive-config-azure-template.json b/configs/phi-2/finetuning/olive-config-azureml-template.json similarity index 100% rename from configs/phi-2/finetuning/olive-config-azure-template.json rename to configs/phi-2/finetuning/olive-config-azureml-template.json From afe75911c9c160f05b3c9bdf31dfdcd159ea828f Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 17:55:43 -0800 Subject: [PATCH 08/19] phi2 bug fixes --- ...aset-classification.jsonl => dataset-classification.json} | 0 configs/phi-2/finetuning/invoke_olive.py | 5 ++++- configs/phi-2/finetuning/olive-config.json | 2 +- configs/phi-2/setup/project-settings.json | 4 ++-- 4 files changed, 7 insertions(+), 4 deletions(-) rename configs/phi-2/dataset/{dataset-classification.jsonl => dataset-classification.json} (100%) diff --git a/configs/phi-2/dataset/dataset-classification.jsonl b/configs/phi-2/dataset/dataset-classification.json similarity index 100% rename from configs/phi-2/dataset/dataset-classification.jsonl rename to configs/phi-2/dataset/dataset-classification.json diff --git a/configs/phi-2/finetuning/invoke_olive.py b/configs/phi-2/finetuning/invoke_olive.py index 40b629c..c573a71 100644 --- a/configs/phi-2/finetuning/invoke_olive.py +++ b/configs/phi-2/finetuning/invoke_olive.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import argparse import sys import json @@ -42,7 +45,7 @@ def main(): file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') olive_run(file_path) else: - dataset_local_path = "dataset/dataset-classification.jsonl" + dataset_local_path = "dataset/dataset-classification.json" dataset_name = "phi2_train_dataset" dataset_version = "1" docker_context_path = "finetuning/docker-contexts/wais_phi2_env" diff --git a/configs/phi-2/finetuning/olive-config.json b/configs/phi-2/finetuning/olive-config.json index 911ef58..66476c0 100644 --- a/configs/phi-2/finetuning/olive-config.json +++ b/configs/phi-2/finetuning/olive-config.json @@ -5,7 +5,7 @@ "hf_config": { "model_name": "model-cache/microsoft/phi-2", "task": "text-generation", - "model_loading_args": { + "from_pretrained_args": { "trust_remote_code": true } } diff --git a/configs/phi-2/setup/project-settings.json b/configs/phi-2/setup/project-settings.json index 214777a..7bf3b99 100644 --- a/configs/phi-2/setup/project-settings.json +++ b/configs/phi-2/setup/project-settings.json @@ -29,9 +29,9 @@ "info": "Dataset to train the model from a local file.", "replaceToken": "", "optionValues": [ - "dataset/dataset-classification.jsonl" + "dataset/dataset-classification.json" ], - "defaultValue": "dataset/dataset-classification.jsonl" + "defaultValue": "dataset/dataset-classification.json" }, { "type": "String", From a56f93146933688ff8730f661078f4e71a68e229 Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 18:01:22 -0800 Subject: [PATCH 09/19] Adding copyright notice --- configs/phi-2/finetuning/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configs/phi-2/finetuning/utils.py b/configs/phi-2/finetuning/utils.py index c36753e..86c6275 100644 --- a/configs/phi-2/finetuning/utils.py +++ b/configs/phi-2/finetuning/utils.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + # Handle to the workspace from azure.ai.ml import MLClient From 20e45af1220459e14f72574b53e2ae0e09b3b1e8 Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 18:01:59 -0800 Subject: [PATCH 10/19] phi1_5 changes for azure --- configs/phi-1_5/.vscode/launch.json | 2 +- configs/phi-1_5/finetuning/aml_config.json | 6 + .../docker-contexts/wais_phi15_env/Dockerfile | 31 +++++ .../wais_phi15_env/requirements.txt | 34 +++++ configs/phi-1_5/finetuning/invoke_olive.py | 101 ++++++++++++++- .../olive-config-azureml-template.json | 117 ++++++++++++++++++ configs/phi-1_5/finetuning/olive-config.json | 2 +- configs/phi-1_5/finetuning/utils.py | 71 +++++++++++ configs/phi-1_5/setup/requirements.txt | 23 +++- 9 files changed, 377 insertions(+), 10 deletions(-) create mode 100644 configs/phi-1_5/finetuning/aml_config.json create mode 100644 configs/phi-1_5/finetuning/docker-contexts/wais_phi15_env/Dockerfile create mode 100644 configs/phi-1_5/finetuning/docker-contexts/wais_phi15_env/requirements.txt create mode 100644 configs/phi-1_5/finetuning/olive-config-azureml-template.json create mode 100644 configs/phi-1_5/finetuning/utils.py diff --git a/configs/phi-1_5/.vscode/launch.json b/configs/phi-1_5/.vscode/launch.json index 2b2502c..74565fe 100644 --- a/configs/phi-1_5/.vscode/launch.json +++ b/configs/phi-1_5/.vscode/launch.json @@ -6,7 +6,7 @@ "configurations": [ { "name": "Python: Current File", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", diff --git a/configs/phi-1_5/finetuning/aml_config.json b/configs/phi-1_5/finetuning/aml_config.json new file mode 100644 index 0000000..9bc55de --- /dev/null +++ b/configs/phi-1_5/finetuning/aml_config.json @@ -0,0 +1,6 @@ +{ + "subscription_id": "", + "resource_group": "", + "workspace_name": "", + "aml_compute_name": "" +} \ No newline at end of file diff --git a/configs/phi-1_5/finetuning/docker-contexts/wais_phi15_env/Dockerfile b/configs/phi-1_5/finetuning/docker-contexts/wais_phi15_env/Dockerfile new file mode 100644 index 0000000..29c2bd3 --- /dev/null +++ b/configs/phi-1_5/finetuning/docker-contexts/wais_phi15_env/Dockerfile @@ -0,0 +1,31 @@ +FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:20240109.v1 + +ENV DEBIAN_FRONTEND=noninteractive +ENV ACCEPT_EULA=Y + +# Build python3.9 as the default python +RUN apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y --no-install-recommends \ + software-properties-common gnupg\ + && add-apt-repository -y ppa:deadsnakes \ + && apt-get install -y --no-install-recommends \ + python3.9-venv \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* && \ + python3.9 -m venv /venv + +ENV PATH=/venv/bin:$PATH + +# Install TensorRT +RUN v="8.4.1-1+cuda11.6" &&\ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ + apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ + libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ + python3-libnvinfer=${v} libnvinfer-samples=${v} + +RUN pip install --upgrade pip && pip install --upgrade setuptools +COPY requirements.txt . +RUN pip install -r requirements.txt --no-cache-dir diff --git a/configs/phi-1_5/finetuning/docker-contexts/wais_phi15_env/requirements.txt b/configs/phi-1_5/finetuning/docker-contexts/wais_phi15_env/requirements.txt new file mode 100644 index 0000000..4cd3939 --- /dev/null +++ b/configs/phi-1_5/finetuning/docker-contexts/wais_phi15_env/requirements.txt @@ -0,0 +1,34 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +torch +torchvision +torchaudio +packaging +datasets==2.14.5 +transformers==4.36.2 +accelerate==0.23.0 +bitsandbytes==0.41.1 +peft==0.5.0 +scikit-learn==1.3.1 +sentencepiece==0.1.99 +trl==0.7.2 +protobuf +ipykernel==6.25.2 +wandb==0.15.12 +einops +promptflow==0.1.0b8 +promptflow-tools==0.1.0b10 +gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf diff --git a/configs/phi-1_5/finetuning/invoke_olive.py b/configs/phi-1_5/finetuning/invoke_olive.py index 261b160..121a174 100644 --- a/configs/phi-1_5/finetuning/invoke_olive.py +++ b/configs/phi-1_5/finetuning/invoke_olive.py @@ -1,6 +1,101 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import argparse +import sys +import json +from utils import get_aml_client, create_dataset +from azure.ai.ml.entities import Environment, BuildContext +import re + from olive.workflows import run as olive_run import os +import argparse +import sys + +def parse_aml_config(aml_config): + """Parse the AML config to make sure the required fields are present""" + with open(aml_config, 'r') as file: + aml_config = json.load(file) + + try: + subscription_id = aml_config["subscription_id"] + resource_group = aml_config["resource_group"] + workspace_name = aml_config["workspace_name"] + aml_compute_name = aml_config["aml_compute_name"] + except KeyError as e: + print(f"KeyError: {e} not found in aml_config.json") + sys.exit(1) + + return aml_config + + +def main(): + """Main function of the script.""" + + # input and output arguments + parser = argparse.ArgumentParser() + parser.add_argument("--azure", required=False, action="store_true", help="runs the training on azure when this option is enabled") + parser.add_argument("--aml_config", type=str, required='--azure' in sys.argv, help="aml config (update and use aml_config.json) for azure subscription/workspace details") + + args = parser.parse_args() + + # Run olive from file locally + if not args.azure: + file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') + olive_run(file_path) + else: + dataset_local_path = "dataset/dataset-classification.json" + dataset_name = "phi15_train_dataset" + dataset_version = "1" + docker_context_path = "finetuning/docker-contexts/wais_phi15_env" + azure_olive_config_template_path = "finetuning/olive-config-azureml-template.json" + azure_olive_config_path = "finetuning/olive-config-azureml.json" + azure_environment_name = "wais_phi15_env" + + aml_config = parse_aml_config(args.aml_config) + # Get the AML client + ml_client = get_aml_client(aml_config["subscription_id"], aml_config["resource_group"], aml_config["workspace_name"]) + + # Create the environment + print("Creating the environment...") + env_docker_context = Environment( + build=BuildContext(path=docker_context_path), # Path to the Docker context + name=azure_environment_name, + description="Environment created from a Docker context for training phi15 model using Olive.", + ) + aml_env = ml_client.environments.create_or_update(env_docker_context) + print("The environment {} was created successfully.".format(aml_env.name)) + + # Create the dataset + print("Creating the dataset...") + + description = "Train dataset for tone classification model." + dataset = create_dataset(ml_client, local_path=dataset_local_path, name=dataset_name, version=dataset_version, description=description) + print("The dataset {} was created successfully.".format(dataset.name)) + dataset_relative_path = re.split("/datastores/.*/paths/", dataset.path)[-1] + + # Update the olive-config-azureml.json + with open(azure_olive_config_template_path, 'r') as file: + olive_config = json.load(file) + try: + olive_config["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["relative_path"] = dataset_relative_path + olive_config["systems"]["aml_system"]["config"]["aml_compute"] = aml_config["aml_compute_name"] + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["name"] = aml_env.name + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["version"] = aml_env.version + except KeyError as e: + print(f"KeyError: {e} not found in olive-config-azureml.json") + sys.exit(1) + + with open(azure_olive_config_path, 'w') as file: + json.dump(olive_config, file, indent=4) + + # Run olive from file for debug. + file_path = os.path.join(os.getcwd(), azure_olive_config_path) + olive_run(file_path) + -# Run olive from file for debug. -file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') -olive_run(file_path) \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/configs/phi-1_5/finetuning/olive-config-azureml-template.json b/configs/phi-1_5/finetuning/olive-config-azureml-template.json new file mode 100644 index 0000000..35fb54e --- /dev/null +++ b/configs/phi-1_5/finetuning/olive-config-azureml-template.json @@ -0,0 +1,117 @@ +{ + "azureml_client": { + "aml_config_path": "", + "read_timeout": 4000, + "max_operation_retries": 4, + "operation_retry_interval": 5 + }, + "systems": { + "local_system": { + "type": "LocalSystem" + }, + "aml_system": { + "type": "AzureML", + "config": { + "aml_compute": "", + "accelerators": [ + "gpu" + ], + "aml_environment_config": { + "name": "", + "version": "" + } + } + } + }, + "input_model": { + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "microsoft/phi-1_5", + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } + } + } + }, + "data_configs": { + "dataset_default_train": { + "name": "dataset_default", + "type": "HuggingfaceContainer", + "params_config": { +#data_configs_data_files_extension_start + +#data_configs_data_files_extension_end + "data_files": { + "type": "azureml_datastore", + "config": { + "azureml_client": + { + "aml_config_path": "" + }, + "datastore_name": "workspaceblobstore", + "relative_path": "" + } + }, + "split": "", + "component_kwargs": { + "pre_process_data": { + "dataset_type": "", + "text_cols": , + "text_template": "", + "corpus_strategy": "", + "source_max_len": , + "pad_to_max_len": , + "use_attention_mask": false + } + } + } + } + }, + "passes": { + "qlora": { + "type": "QLoRA", + "config": { + "compute_dtype": "", + "quant_type": "", + "double_quant": , + "lora_r": , + "lora_alpha": , + "lora_dropout": , + "train_data_config": "dataset_default_train", + "eval_dataset_size": , + "training_args": { + "seed": , + "data_seed": , + "per_device_train_batch_size": , + "per_device_eval_batch_size": , + "gradient_accumulation_steps": , + "gradient_checkpointing": , + "learning_rate": , + "num_train_epochs":, + "max_steps": , + "logging_steps": 10, + "evaluation_strategy": "steps", + "eval_steps": 187, + "group_by_length": true, + "adam_beta2": 0.999, + "max_grad_norm": 0.3, + "output_dir": "outputs/" + } + } + } + }, + "engine": { + "log_severity_level": 0, + "search_strategy": false, + "evaluate_input_model": false, + "execution_providers": [ + "CUDAExecutionProvider" + ], + "host": "aml_system", + "cache_dir": "model_cache", + "output_dir": "outputs/models/qlora", + "target": "aml_system" + } +} diff --git a/configs/phi-1_5/finetuning/olive-config.json b/configs/phi-1_5/finetuning/olive-config.json index 1f6cc81..3861a25 100644 --- a/configs/phi-1_5/finetuning/olive-config.json +++ b/configs/phi-1_5/finetuning/olive-config.json @@ -5,7 +5,7 @@ "hf_config": { "model_name": "model-cache/microsoft/phi-1_5", "task": "text-generation", - "model_loading_args": { + "from_pretrained_args": { "trust_remote_code": true } } diff --git a/configs/phi-1_5/finetuning/utils.py b/configs/phi-1_5/finetuning/utils.py new file mode 100644 index 0000000..86c6275 --- /dev/null +++ b/configs/phi-1_5/finetuning/utils.py @@ -0,0 +1,71 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +# Handle to the workspace +from azure.ai.ml import MLClient + +# Authentication package +from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential +from azure.ai.ml.entities import Data +from azure.ai.ml.constants import AssetTypes + +from azure.ai.ml import command +from azure.ai.ml import Input, Output + + +def get_aml_client(subscription_id, resource_group_name, workspace_name): + """ + Get an Azure Machine Learning client instance. + + Args: + subscription_id (str): The Azure subscription ID. + resource_group_name (str): The name of the resource group. + workspace_name (str): The name of the Azure Machine Learning workspace. + + Returns: + MLClient: An instance of the Azure Machine Learning client. + """ + credential = DefaultAzureCredential(exclude_interactive_browser_credential=False) + + # Create and return MLClient instance + return MLClient(credential, + subscription_id=subscription_id, + resource_group_name=resource_group_name, + workspace_name=workspace_name + ) + + +def create_dataset(ml_client, local_path, name, version, description=""): + """ + Creates a data asset using the specified ML client, local path, name, version, and optional description. + + Args: + ml_client (MLClient): The ML client used to interact with the ML service. + local_path (str): The local path of the data asset. + name (str): The name of the data asset. + version (str): The version of the data asset. + description (str, optional): The description of the data asset. Defaults to "". + + Returns: + Dataset (Dataset): Registered dataset with the given name and version. + """ + + my_data = Data( + name=name, + version=version, + description=description, + path=local_path, + type=AssetTypes.URI_FILE, + ) + + ## create data asset if it doesn't already exist: + try: + dataset = ml_client.data.get(name=name, version=version) + print( + f"Data asset already exists. Name: {dataset.name}, version: {dataset.version}" + ) + except: + dataset = ml_client.data.create_or_update(my_data) + print(f"Data asset created. Name: {dataset.name}, version: {dataset.version}") + + return dataset \ No newline at end of file diff --git a/configs/phi-1_5/setup/requirements.txt b/configs/phi-1_5/setup/requirements.txt index a55d10c..4cd3939 100644 --- a/configs/phi-1_5/setup/requirements.txt +++ b/configs/phi-1_5/setup/requirements.txt @@ -2,20 +2,33 @@ torch torchvision torchaudio -transformers==4.34.1 +packaging +datasets==2.14.5 +transformers==4.36.2 accelerate==0.23.0 bitsandbytes==0.41.1 -datasets==2.14.5 peft==0.5.0 scikit-learn==1.3.1 sentencepiece==0.1.99 trl==0.7.2 -protobuf==3.20.3 +protobuf ipykernel==6.25.2 wandb==0.15.12 -onnxruntime-gpu==1.16.1 einops -olive-ai==0.3.3 promptflow==0.1.0b8 promptflow-tools==0.1.0b10 gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf From f8372414cf18b5efc88123fbc13b0a0a57e0fcca Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 20:00:09 -0800 Subject: [PATCH 11/19] Mistral-7b azure changes --- configs/mistral-7b/finetuning/aml_config.json | 6 + .../wais_mistral_7b_env/Dockerfile | 31 +++++ .../wais_mistral_7b_env/requirements.txt | 34 +++++ configs/mistral-7b/finetuning/invoke_olive.py | 98 ++++++++++++++- .../olive-config-azureml-template.json | 117 ++++++++++++++++++ .../mistral-7b/finetuning/olive-config.json | 5 +- configs/mistral-7b/finetuning/utils.py | 68 ++++++++++ .../mistral-7b/setup/project-settings.json | 75 +++++++---- configs/mistral-7b/setup/requirements.txt | 23 +++- 9 files changed, 426 insertions(+), 31 deletions(-) create mode 100644 configs/mistral-7b/finetuning/aml_config.json create mode 100644 configs/mistral-7b/finetuning/docker-contexts/wais_mistral_7b_env/Dockerfile create mode 100644 configs/mistral-7b/finetuning/docker-contexts/wais_mistral_7b_env/requirements.txt create mode 100644 configs/mistral-7b/finetuning/olive-config-azureml-template.json create mode 100644 configs/mistral-7b/finetuning/utils.py diff --git a/configs/mistral-7b/finetuning/aml_config.json b/configs/mistral-7b/finetuning/aml_config.json new file mode 100644 index 0000000..9bc55de --- /dev/null +++ b/configs/mistral-7b/finetuning/aml_config.json @@ -0,0 +1,6 @@ +{ + "subscription_id": "", + "resource_group": "", + "workspace_name": "", + "aml_compute_name": "" +} \ No newline at end of file diff --git a/configs/mistral-7b/finetuning/docker-contexts/wais_mistral_7b_env/Dockerfile b/configs/mistral-7b/finetuning/docker-contexts/wais_mistral_7b_env/Dockerfile new file mode 100644 index 0000000..29c2bd3 --- /dev/null +++ b/configs/mistral-7b/finetuning/docker-contexts/wais_mistral_7b_env/Dockerfile @@ -0,0 +1,31 @@ +FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:20240109.v1 + +ENV DEBIAN_FRONTEND=noninteractive +ENV ACCEPT_EULA=Y + +# Build python3.9 as the default python +RUN apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y --no-install-recommends \ + software-properties-common gnupg\ + && add-apt-repository -y ppa:deadsnakes \ + && apt-get install -y --no-install-recommends \ + python3.9-venv \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* && \ + python3.9 -m venv /venv + +ENV PATH=/venv/bin:$PATH + +# Install TensorRT +RUN v="8.4.1-1+cuda11.6" &&\ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ + apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ + libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ + python3-libnvinfer=${v} libnvinfer-samples=${v} + +RUN pip install --upgrade pip && pip install --upgrade setuptools +COPY requirements.txt . +RUN pip install -r requirements.txt --no-cache-dir diff --git a/configs/mistral-7b/finetuning/docker-contexts/wais_mistral_7b_env/requirements.txt b/configs/mistral-7b/finetuning/docker-contexts/wais_mistral_7b_env/requirements.txt new file mode 100644 index 0000000..4cd3939 --- /dev/null +++ b/configs/mistral-7b/finetuning/docker-contexts/wais_mistral_7b_env/requirements.txt @@ -0,0 +1,34 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +torch +torchvision +torchaudio +packaging +datasets==2.14.5 +transformers==4.36.2 +accelerate==0.23.0 +bitsandbytes==0.41.1 +peft==0.5.0 +scikit-learn==1.3.1 +sentencepiece==0.1.99 +trl==0.7.2 +protobuf +ipykernel==6.25.2 +wandb==0.15.12 +einops +promptflow==0.1.0b8 +promptflow-tools==0.1.0b10 +gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf diff --git a/configs/mistral-7b/finetuning/invoke_olive.py b/configs/mistral-7b/finetuning/invoke_olive.py index 261b160..68c3805 100644 --- a/configs/mistral-7b/finetuning/invoke_olive.py +++ b/configs/mistral-7b/finetuning/invoke_olive.py @@ -1,6 +1,98 @@ +import argparse +import sys +import json +from utils import get_aml_client, create_dataset +from azure.ai.ml.entities import Environment, BuildContext +import re + from olive.workflows import run as olive_run import os +import argparse +import sys + +def parse_aml_config(aml_config): + """Parse the AML config to make sure the required fields are present""" + with open(aml_config, 'r') as file: + aml_config = json.load(file) + + try: + subscription_id = aml_config["subscription_id"] + resource_group = aml_config["resource_group"] + workspace_name = aml_config["workspace_name"] + aml_compute_name = aml_config["aml_compute_name"] + except KeyError as e: + print(f"KeyError: {e} not found in aml_config.json") + sys.exit(1) + + return aml_config + + +def main(): + """Main function of the script.""" + + # input and output arguments + parser = argparse.ArgumentParser() + parser.add_argument("--azure", required=False, action="store_true", help="runs the training on azure when this option is enabled") + parser.add_argument("--aml_config", type=str, required='--azure' in sys.argv, help="aml config (update and use aml_config.json) for azure subscription/workspace details") + + args = parser.parse_args() + + # Run olive from file locally + if not args.azure: + file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') + olive_run(file_path) + else: + dataset_local_path = "dataset/dataset-classification.json" + dataset_name = "mistral_7b_train_dataset" + dataset_version = "1" + docker_context_path = "finetuning/docker-contexts/wais_mistral_7b_env" + azure_olive_config_template_path = "finetuning/olive-config-azureml_template.json" + azure_olive_config_path = "finetuning/olive-config-azureml.json" + azure_environment_name = "wais_mistral_7b_env" + + aml_config = parse_aml_config(args.aml_config) + # Get the AML client + ml_client = get_aml_client(aml_config["subscription_id"], aml_config["resource_group"], aml_config["workspace_name"]) + + # Create the environment + print("Creating the environment...") + env_docker_context = Environment( + build=BuildContext(path=docker_context_path), # Path to the Docker context + name=azure_environment_name, + description="Environment created from a Docker context for training Mistral7b model using Olive.", + ) + aml_env = ml_client.environments.create_or_update(env_docker_context) + print("The environment {} was created successfully.".format(aml_env.name)) + + # Create the dataset + print("Creating the dataset...") + + description = "Train dataset for tone classification model." + dataset = create_dataset(ml_client, local_path=dataset_local_path, name=dataset_name, version=dataset_version, description=description) + print("The dataset {} was created successfully.".format(dataset.name)) + dataset_relative_path = re.split("/datastores/.*/paths/", dataset.path)[-1] + + # Update the olive-config-azureml.json + with open(azure_olive_config_template_path, 'r') as file: + olive_config = json.load(file) + try: + olive_config["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["relative_path"] = dataset_relative_path + olive_config["systems"]["aml_system"]["config"]["aml_compute"] = aml_config["aml_compute_name"] + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["name"] = aml_env.name + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["version"] = aml_env.version + except KeyError as e: + print(f"KeyError: {e} not found in olive-config-azureml.json") + sys.exit(1) + + with open(azure_olive_config_path, 'w') as file: + json.dump(olive_config, file, indent=4) + + # Run olive from file for debug. + file_path = os.path.join(os.getcwd(), azure_olive_config_path) + olive_run(file_path) + -# Run olive from file for debug. -file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') -olive_run(file_path) \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/configs/mistral-7b/finetuning/olive-config-azureml-template.json b/configs/mistral-7b/finetuning/olive-config-azureml-template.json new file mode 100644 index 0000000..fb98b33 --- /dev/null +++ b/configs/mistral-7b/finetuning/olive-config-azureml-template.json @@ -0,0 +1,117 @@ +{ + "azureml_client": { + "aml_config_path": "", + "read_timeout": 4000, + "max_operation_retries": 4, + "operation_retry_interval": 5 + }, + "systems": { + "local_system": { + "type": "LocalSystem" + }, + "aml_system": { + "type": "AzureML", + "config": { + "aml_compute": "", + "accelerators": [ + "gpu" + ], + "aml_environment_config": { + "name": "", + "version": "" + } + } + } + }, + "input_model": { + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "mistralai/Mistral-7B-v0.1", + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } + } + } + }, + "data_configs": { + "dataset_default_train": { + "name": "dataset_default", + "type": "HuggingfaceContainer", + "params_config": { +#data_configs_data_files_extension_start + +#data_configs_data_files_extension_end + "data_files": { + "type": "azureml_datastore", + "config": { + "azureml_client": + { + "aml_config_path": "" + }, + "datastore_name": "workspaceblobstore", + "relative_path": "" + } + }, + "split": "", + "component_kwargs": { + "pre_process_data": { + "dataset_type": "", + "text_cols": , + "text_template": "", + "corpus_strategy": "", + "source_max_len": , + "pad_to_max_len": , + "use_attention_mask": false + } + } + } + } + }, + "passes": { + "qlora": { + "type": "QLoRA", + "config": { + "compute_dtype": "", + "quant_type": "", + "double_quant": , + "lora_r": , + "lora_alpha": , + "lora_dropout": , + "train_data_config": "dataset_default_train", + "eval_dataset_size": , + "training_args": { + "seed": , + "data_seed": , + "per_device_train_batch_size": , + "per_device_eval_batch_size": , + "gradient_accumulation_steps": , + "gradient_checkpointing": , + "learning_rate": , + "num_train_epochs":, + "max_steps": , + "logging_steps": 10, + "evaluation_strategy": "steps", + "eval_steps": 40, + "group_by_length": true, + "adam_beta2": 0.999, + "max_grad_norm": 0.3, + "output_dir": "outputs/" + } + } + } + }, + "engine": { + "log_severity_level": 0, + "search_strategy": false, + "evaluate_input_model": false, + "execution_providers": [ + "CUDAExecutionProvider" + ], + "host": "aml_system", + "cache_dir": "model_cache", + "output_dir": "outputs/models/qlora", + "target": "aml_system" + } +} diff --git a/configs/mistral-7b/finetuning/olive-config.json b/configs/mistral-7b/finetuning/olive-config.json index b40b829..7002b94 100644 --- a/configs/mistral-7b/finetuning/olive-config.json +++ b/configs/mistral-7b/finetuning/olive-config.json @@ -4,7 +4,10 @@ "config": { "hf_config": { "model_name": "model-cache/mistralai/Mistral-7B", - "task": "text-generation" + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } } } }, diff --git a/configs/mistral-7b/finetuning/utils.py b/configs/mistral-7b/finetuning/utils.py new file mode 100644 index 0000000..c36753e --- /dev/null +++ b/configs/mistral-7b/finetuning/utils.py @@ -0,0 +1,68 @@ +# Handle to the workspace +from azure.ai.ml import MLClient + +# Authentication package +from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential +from azure.ai.ml.entities import Data +from azure.ai.ml.constants import AssetTypes + +from azure.ai.ml import command +from azure.ai.ml import Input, Output + + +def get_aml_client(subscription_id, resource_group_name, workspace_name): + """ + Get an Azure Machine Learning client instance. + + Args: + subscription_id (str): The Azure subscription ID. + resource_group_name (str): The name of the resource group. + workspace_name (str): The name of the Azure Machine Learning workspace. + + Returns: + MLClient: An instance of the Azure Machine Learning client. + """ + credential = DefaultAzureCredential(exclude_interactive_browser_credential=False) + + # Create and return MLClient instance + return MLClient(credential, + subscription_id=subscription_id, + resource_group_name=resource_group_name, + workspace_name=workspace_name + ) + + +def create_dataset(ml_client, local_path, name, version, description=""): + """ + Creates a data asset using the specified ML client, local path, name, version, and optional description. + + Args: + ml_client (MLClient): The ML client used to interact with the ML service. + local_path (str): The local path of the data asset. + name (str): The name of the data asset. + version (str): The version of the data asset. + description (str, optional): The description of the data asset. Defaults to "". + + Returns: + Dataset (Dataset): Registered dataset with the given name and version. + """ + + my_data = Data( + name=name, + version=version, + description=description, + path=local_path, + type=AssetTypes.URI_FILE, + ) + + ## create data asset if it doesn't already exist: + try: + dataset = ml_client.data.get(name=name, version=version) + print( + f"Data asset already exists. Name: {dataset.name}, version: {dataset.version}" + ) + except: + dataset = ml_client.data.create_or_update(my_data) + print(f"Data asset created. Name: {dataset.name}, version: {dataset.version}") + + return dataset \ No newline at end of file diff --git a/configs/mistral-7b/setup/project-settings.json b/configs/mistral-7b/setup/project-settings.json index 953ba19..1f7701f 100644 --- a/configs/mistral-7b/setup/project-settings.json +++ b/configs/mistral-7b/setup/project-settings.json @@ -26,9 +26,11 @@ { "type": "String", "label": "Dataset name:", - "info":"Dataset to train the model from a local file.", + "info": "Dataset to train the model from a local file.", "replaceToken": "", - "optionValues": ["dataset/dataset-classification.json"], + "optionValues": [ + "dataset/dataset-classification.json" + ], "defaultValue": "dataset/dataset-classification.json" }, { @@ -50,7 +52,10 @@ "label": "Text columns:", "info": "Columns that match your dataset to populate the training prompt.", "replaceToken": "", - "defaultValue": [ "phrase", "tone" ] + "defaultValue": [ + "phrase", + "tone" + ] }, { "type": "String", @@ -65,7 +70,10 @@ "info": "Do you want to join the samples or process them one by one.", "replaceToken": "", "defaultValue": "join", - "optionValues": ["line-by-line", "join"] + "optionValues": [ + "line-by-line", + "join" + ] }, { "type": "Integer", @@ -91,8 +99,12 @@ "type": "String", "label": "Compute dtype:", "info": "Data type for model weights and adapter weights.", + "learnMore": "hello world", "replaceToken": "", - "optionValues": ["bfloat16", "float16"], + "optionValues": [ + "bfloat16", + "float16" + ], "defaultValue": "bfloat16" }, { @@ -100,113 +112,132 @@ "label": "Quant type:", "info": "Quantization data type to use. Should be one of fp4 or nf4.", "replaceToken": "", - "optionValues": ["nf4", "fp4"], - "defaultValue": "nf4" + "optionValues": [ + "nf4", + "fp4" + ], + "defaultValue": "nf4", + "learnMore": "Can you tell me more about the Hugging Face trainer parameter quant_type?" }, { "type": "Boolean", "label": "Double quant:", "info": "Whether to use nested quantization where the quantization constants from the first quantization are quantized again.", "replaceToken": "", - "defaultValue": true + "defaultValue": true, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter double_quant?" }, { "type": "Integer", "label": "Lora r:", "info": "Lora attention dimension.", "replaceToken": "", - "defaultValue": 32 + "defaultValue": 32, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter lora_r?" }, { "type": "Integer", "label": "Lora alpha:", "info": "The alpha parameter for Lora scaling", "replaceToken": "", - "defaultValue": 64 + "defaultValue": 64, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter lora_alpha?" }, { "type": "Number", "label": "Lora dropout:", "info": "The dropout probability for Lora layers", "replaceToken": "", - "defaultValue": 0.1 + "defaultValue": 0.1, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter lora_dropout?" }, { "type": "Integer", "label": "Eval dataset size:", "info": "Size of the validation dataset, a number or 0-1 percentage.", "replaceToken": "", - "defaultValue": 0.3 + "defaultValue": 0.3, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter eval_dataset_size?" }, { "type": "Integer", "label": "Seed:", "info": "Random seed for initialization.", "replaceToken": "", - "defaultValue": 0 + "defaultValue": 0, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter training_args_seed?" }, { "type": "Integer", "label": "Data seed:", "info": "Random seed to be used with data samplers.", "replaceToken": "", - "defaultValue": 42 + "defaultValue": 42, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter training_args_data_seed?" }, { "type": "Integer", "label": "Per device train batch size:", "info": "The batch size per GPU for training.", "replaceToken": "", - "defaultValue": 8 + "defaultValue": 8, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter per_device_train_batch_size?" }, { "type": "Integer", "label": "Per device eval batch size:", "info": "The batch size per GPU for evaluation.", "replaceToken": "", - "defaultValue": 8 + "defaultValue": 8, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter per_device_eval_batch_size?" }, { "type": "Integer", "label": "Gradient accumulation steps:", "info": "Number of updates steps to accumulate the gradients for, before performing a backward/update pass", "replaceToken": "", - "defaultValue": 4 + "defaultValue": 4, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter gradient_accumulation_steps?" }, { "type": "Boolean", "label": "Enable gradient checkpointing:", "info": "Use gradient checkpointing. Recommended to save the memory.", "replaceToken": "", - "defaultValue": true + "defaultValue": true, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter gradient_checkpointing?" }, { "type": "Number", "label": "Learning rate:", "info": "The initial learning rate for AdamW", "replaceToken": "", - "defaultValue": 0.0002 + "defaultValue": 0.0002, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter learning_rate?" }, { "type": "Integer", "label": "Number of epochs:", "info": "How many complete passes the model will make over the entire training dataset.", "replaceToken": "", - "defaultValue": 3 + "defaultValue": 3, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter num_train_epochs?" }, { "type": "Integer", "label": "Max steps:", "info":"Training will be stopped when this number of steps is reached, regardless of the number of epochs.", "replaceToken": "", - "defaultValue": 80 + "defaultValue": 80, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter max_steps?" }, { "type": "String", "label": "Checkpoint output dir", "info": "Directory to save the checkpoints.", "replaceToken": "", - "defaultValue": "models/checkpoints" + "defaultValue": "models/checkpoints", + "learnMore": "Can you tell me more about the Hugging Face trainer parameter output_dir?" } ] } diff --git a/configs/mistral-7b/setup/requirements.txt b/configs/mistral-7b/setup/requirements.txt index a55d10c..44fe7db 100644 --- a/configs/mistral-7b/setup/requirements.txt +++ b/configs/mistral-7b/setup/requirements.txt @@ -2,20 +2,33 @@ torch torchvision torchaudio -transformers==4.34.1 +packaging +datasets==2.14.5 +transformers==4.36.2 accelerate==0.23.0 bitsandbytes==0.41.1 -datasets==2.14.5 peft==0.5.0 scikit-learn==1.3.1 sentencepiece==0.1.99 trl==0.7.2 -protobuf==3.20.3 +protobuf ipykernel==6.25.2 wandb==0.15.12 -onnxruntime-gpu==1.16.1 einops -olive-ai==0.3.3 promptflow==0.1.0b8 promptflow-tools==0.1.0b10 gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf \ No newline at end of file From 65f20a19dd16fc53f0bf1ec8c4a59f5bd002922e Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 21:30:53 -0800 Subject: [PATCH 12/19] Fixing Mistral bug --- configs/mistral-7b/finetuning/invoke_olive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mistral-7b/finetuning/invoke_olive.py b/configs/mistral-7b/finetuning/invoke_olive.py index 68c3805..2771dda 100644 --- a/configs/mistral-7b/finetuning/invoke_olive.py +++ b/configs/mistral-7b/finetuning/invoke_olive.py @@ -46,7 +46,7 @@ def main(): dataset_name = "mistral_7b_train_dataset" dataset_version = "1" docker_context_path = "finetuning/docker-contexts/wais_mistral_7b_env" - azure_olive_config_template_path = "finetuning/olive-config-azureml_template.json" + azure_olive_config_template_path = "finetuning/olive-config-azureml-template.json" azure_olive_config_path = "finetuning/olive-config-azureml.json" azure_environment_name = "wais_mistral_7b_env" From 99e67d9d7e6e12d4c563d468636d97fd2b7e0a48 Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 21:59:09 -0800 Subject: [PATCH 13/19] Fixing cache_dir path for azure in phi2 --- configs/phi-2/finetuning/olive-config-azureml-template.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/phi-2/finetuning/olive-config-azureml-template.json b/configs/phi-2/finetuning/olive-config-azureml-template.json index 51cdcfa..d58235d 100644 --- a/configs/phi-2/finetuning/olive-config-azureml-template.json +++ b/configs/phi-2/finetuning/olive-config-azureml-template.json @@ -110,7 +110,7 @@ "CUDAExecutionProvider" ], "host": "aml_system", - "cache_dir": "cache", + "cache_dir": "model_cache", "output_dir": "outputs/models/qlora", "target": "aml_system" } From b800786f9026d4d64d604e513396c924c53628a9 Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Mon, 12 Feb 2024 22:14:59 -0800 Subject: [PATCH 14/19] Zephyr-7b azure related changes --- configs/zephyr-7b-beta/.vscode/launch.json | 2 +- .../zephyr-7b-beta/finetuning/aml_config.json | 6 + .../wais_zephyr_7b_env/Dockerfile | 31 +++++ .../wais_zephyr_7b_env/requirements.txt | 34 +++++ .../zephyr-7b-beta/finetuning/invoke_olive.py | 101 ++++++++++++++- .../olive-config-azureml-template.json | 117 ++++++++++++++++++ .../finetuning/olive-config.json | 3 + configs/zephyr-7b-beta/finetuning/utils.py | 71 +++++++++++ .../setup/project-settings.json | 78 ++++++++---- configs/zephyr-7b-beta/setup/requirements.txt | 23 +++- 10 files changed, 433 insertions(+), 33 deletions(-) create mode 100644 configs/zephyr-7b-beta/finetuning/aml_config.json create mode 100644 configs/zephyr-7b-beta/finetuning/docker-contexts/wais_zephyr_7b_env/Dockerfile create mode 100644 configs/zephyr-7b-beta/finetuning/docker-contexts/wais_zephyr_7b_env/requirements.txt create mode 100644 configs/zephyr-7b-beta/finetuning/olive-config-azureml-template.json create mode 100644 configs/zephyr-7b-beta/finetuning/utils.py diff --git a/configs/zephyr-7b-beta/.vscode/launch.json b/configs/zephyr-7b-beta/.vscode/launch.json index 2b2502c..74565fe 100644 --- a/configs/zephyr-7b-beta/.vscode/launch.json +++ b/configs/zephyr-7b-beta/.vscode/launch.json @@ -6,7 +6,7 @@ "configurations": [ { "name": "Python: Current File", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", diff --git a/configs/zephyr-7b-beta/finetuning/aml_config.json b/configs/zephyr-7b-beta/finetuning/aml_config.json new file mode 100644 index 0000000..9bc55de --- /dev/null +++ b/configs/zephyr-7b-beta/finetuning/aml_config.json @@ -0,0 +1,6 @@ +{ + "subscription_id": "", + "resource_group": "", + "workspace_name": "", + "aml_compute_name": "" +} \ No newline at end of file diff --git a/configs/zephyr-7b-beta/finetuning/docker-contexts/wais_zephyr_7b_env/Dockerfile b/configs/zephyr-7b-beta/finetuning/docker-contexts/wais_zephyr_7b_env/Dockerfile new file mode 100644 index 0000000..29c2bd3 --- /dev/null +++ b/configs/zephyr-7b-beta/finetuning/docker-contexts/wais_zephyr_7b_env/Dockerfile @@ -0,0 +1,31 @@ +FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:20240109.v1 + +ENV DEBIAN_FRONTEND=noninteractive +ENV ACCEPT_EULA=Y + +# Build python3.9 as the default python +RUN apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y --no-install-recommends \ + software-properties-common gnupg\ + && add-apt-repository -y ppa:deadsnakes \ + && apt-get install -y --no-install-recommends \ + python3.9-venv \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* && \ + python3.9 -m venv /venv + +ENV PATH=/venv/bin:$PATH + +# Install TensorRT +RUN v="8.4.1-1+cuda11.6" &&\ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ + apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ + libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ + python3-libnvinfer=${v} libnvinfer-samples=${v} + +RUN pip install --upgrade pip && pip install --upgrade setuptools +COPY requirements.txt . +RUN pip install -r requirements.txt --no-cache-dir diff --git a/configs/zephyr-7b-beta/finetuning/docker-contexts/wais_zephyr_7b_env/requirements.txt b/configs/zephyr-7b-beta/finetuning/docker-contexts/wais_zephyr_7b_env/requirements.txt new file mode 100644 index 0000000..4cd3939 --- /dev/null +++ b/configs/zephyr-7b-beta/finetuning/docker-contexts/wais_zephyr_7b_env/requirements.txt @@ -0,0 +1,34 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +torch +torchvision +torchaudio +packaging +datasets==2.14.5 +transformers==4.36.2 +accelerate==0.23.0 +bitsandbytes==0.41.1 +peft==0.5.0 +scikit-learn==1.3.1 +sentencepiece==0.1.99 +trl==0.7.2 +protobuf +ipykernel==6.25.2 +wandb==0.15.12 +einops +promptflow==0.1.0b8 +promptflow-tools==0.1.0b10 +gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf diff --git a/configs/zephyr-7b-beta/finetuning/invoke_olive.py b/configs/zephyr-7b-beta/finetuning/invoke_olive.py index 261b160..e84a7e5 100644 --- a/configs/zephyr-7b-beta/finetuning/invoke_olive.py +++ b/configs/zephyr-7b-beta/finetuning/invoke_olive.py @@ -1,6 +1,101 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import argparse +import sys +import json +from utils import get_aml_client, create_dataset +from azure.ai.ml.entities import Environment, BuildContext +import re + from olive.workflows import run as olive_run import os +import argparse +import sys + +def parse_aml_config(aml_config): + """Parse the AML config to make sure the required fields are present""" + with open(aml_config, 'r') as file: + aml_config = json.load(file) + + try: + subscription_id = aml_config["subscription_id"] + resource_group = aml_config["resource_group"] + workspace_name = aml_config["workspace_name"] + aml_compute_name = aml_config["aml_compute_name"] + except KeyError as e: + print(f"KeyError: {e} not found in aml_config.json") + sys.exit(1) + + return aml_config + + +def main(): + """Main function of the script.""" + + # input and output arguments + parser = argparse.ArgumentParser() + parser.add_argument("--azure", required=False, action="store_true", help="runs the training on azure when this option is enabled") + parser.add_argument("--aml_config", type=str, required='--azure' in sys.argv, help="aml config (update and use aml_config.json) for azure subscription/workspace details") + + args = parser.parse_args() + + # Run olive from file locally + if not args.azure: + file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') + olive_run(file_path) + else: + dataset_local_path = "dataset/dataset-classification.json" + dataset_name = "zephyr_7b_train_dataset" + dataset_version = "1" + docker_context_path = "finetuning/docker-contexts/wais_zephyr_7b_env" + azure_olive_config_template_path = "finetuning/olive-config-azureml-template.json" + azure_olive_config_path = "finetuning/olive-config-azureml.json" + azure_environment_name = "wais_zephyr_7b_env" + + aml_config = parse_aml_config(args.aml_config) + # Get the AML client + ml_client = get_aml_client(aml_config["subscription_id"], aml_config["resource_group"], aml_config["workspace_name"]) + + # Create the environment + print("Creating the environment...") + env_docker_context = Environment( + build=BuildContext(path=docker_context_path), # Path to the Docker context + name=azure_environment_name, + description="Environment created from a Docker context for training zephyr_7b model using Olive.", + ) + aml_env = ml_client.environments.create_or_update(env_docker_context) + print("The environment {} was created successfully.".format(aml_env.name)) + + # Create the dataset + print("Creating the dataset...") + + description = "Train dataset for tone classification model." + dataset = create_dataset(ml_client, local_path=dataset_local_path, name=dataset_name, version=dataset_version, description=description) + print("The dataset {} was created successfully.".format(dataset.name)) + dataset_relative_path = re.split("/datastores/.*/paths/", dataset.path)[-1] + + # Update the olive-config-azureml.json + with open(azure_olive_config_template_path, 'r') as file: + olive_config = json.load(file) + try: + olive_config["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["relative_path"] = dataset_relative_path + olive_config["systems"]["aml_system"]["config"]["aml_compute"] = aml_config["aml_compute_name"] + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["name"] = aml_env.name + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["version"] = aml_env.version + except KeyError as e: + print(f"KeyError: {e} not found in olive-config-azureml.json") + sys.exit(1) + + with open(azure_olive_config_path, 'w') as file: + json.dump(olive_config, file, indent=4) + + # Run olive from file for debug. + file_path = os.path.join(os.getcwd(), azure_olive_config_path) + olive_run(file_path) + -# Run olive from file for debug. -file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') -olive_run(file_path) \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/configs/zephyr-7b-beta/finetuning/olive-config-azureml-template.json b/configs/zephyr-7b-beta/finetuning/olive-config-azureml-template.json new file mode 100644 index 0000000..ad2b7f5 --- /dev/null +++ b/configs/zephyr-7b-beta/finetuning/olive-config-azureml-template.json @@ -0,0 +1,117 @@ +{ + "azureml_client": { + "aml_config_path": "", + "read_timeout": 4000, + "max_operation_retries": 4, + "operation_retry_interval": 5 + }, + "systems": { + "local_system": { + "type": "LocalSystem" + }, + "aml_system": { + "type": "AzureML", + "config": { + "aml_compute": "", + "accelerators": [ + "gpu" + ], + "aml_environment_config": { + "name": "", + "version": "" + } + } + } + }, + "input_model": { + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "HuggingFaceH4/zephyr-7b-beta", + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } + } + } + }, + "data_configs": { + "dataset_default_train": { + "name": "dataset_default", + "type": "HuggingfaceContainer", + "params_config": { +#data_configs_data_files_extension_start + +#data_configs_data_files_extension_end + "data_files": { + "type": "azureml_datastore", + "config": { + "azureml_client": + { + "aml_config_path": "" + }, + "datastore_name": "workspaceblobstore", + "relative_path": "" + } + }, + "split": "", + "component_kwargs": { + "pre_process_data": { + "dataset_type": "", + "text_cols": , + "text_template": "", + "corpus_strategy": "", + "source_max_len": , + "pad_to_max_len": , + "use_attention_mask": false + } + } + } + } + }, + "passes": { + "qlora": { + "type": "QLoRA", + "config": { + "compute_dtype": "", + "quant_type": "", + "double_quant": , + "lora_r": , + "lora_alpha": , + "lora_dropout": , + "train_data_config": "dataset_default_train", + "eval_dataset_size": , + "training_args": { + "seed": , + "data_seed": , + "per_device_train_batch_size": , + "per_device_eval_batch_size": , + "gradient_accumulation_steps": , + "gradient_checkpointing": , + "learning_rate": , + "num_train_epochs":, + "max_steps": , + "logging_steps": 10, + "evaluation_strategy": "steps", + "eval_steps": 50, + "group_by_length": true, + "adam_beta2": 0.999, + "max_grad_norm": 0.3, + "output_dir": "outputs/" + } + } + } + }, + "engine": { + "log_severity_level": 0, + "search_strategy": false, + "evaluate_input_model": false, + "execution_providers": [ + "CUDAExecutionProvider" + ], + "host": "aml_system", + "cache_dir": "model_cache", + "output_dir": "outputs/models/qlora", + "target": "aml_system" + } +} diff --git a/configs/zephyr-7b-beta/finetuning/olive-config.json b/configs/zephyr-7b-beta/finetuning/olive-config.json index ded390f..c7452ce 100644 --- a/configs/zephyr-7b-beta/finetuning/olive-config.json +++ b/configs/zephyr-7b-beta/finetuning/olive-config.json @@ -5,6 +5,9 @@ "hf_config": { "model_name": "model-cache/HuggingFaceH4/zephyr-7b-beta", "task": "text-generation" + "from_pretrained_args": { + "trust_remote_code": true + } } } }, diff --git a/configs/zephyr-7b-beta/finetuning/utils.py b/configs/zephyr-7b-beta/finetuning/utils.py new file mode 100644 index 0000000..86c6275 --- /dev/null +++ b/configs/zephyr-7b-beta/finetuning/utils.py @@ -0,0 +1,71 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +# Handle to the workspace +from azure.ai.ml import MLClient + +# Authentication package +from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential +from azure.ai.ml.entities import Data +from azure.ai.ml.constants import AssetTypes + +from azure.ai.ml import command +from azure.ai.ml import Input, Output + + +def get_aml_client(subscription_id, resource_group_name, workspace_name): + """ + Get an Azure Machine Learning client instance. + + Args: + subscription_id (str): The Azure subscription ID. + resource_group_name (str): The name of the resource group. + workspace_name (str): The name of the Azure Machine Learning workspace. + + Returns: + MLClient: An instance of the Azure Machine Learning client. + """ + credential = DefaultAzureCredential(exclude_interactive_browser_credential=False) + + # Create and return MLClient instance + return MLClient(credential, + subscription_id=subscription_id, + resource_group_name=resource_group_name, + workspace_name=workspace_name + ) + + +def create_dataset(ml_client, local_path, name, version, description=""): + """ + Creates a data asset using the specified ML client, local path, name, version, and optional description. + + Args: + ml_client (MLClient): The ML client used to interact with the ML service. + local_path (str): The local path of the data asset. + name (str): The name of the data asset. + version (str): The version of the data asset. + description (str, optional): The description of the data asset. Defaults to "". + + Returns: + Dataset (Dataset): Registered dataset with the given name and version. + """ + + my_data = Data( + name=name, + version=version, + description=description, + path=local_path, + type=AssetTypes.URI_FILE, + ) + + ## create data asset if it doesn't already exist: + try: + dataset = ml_client.data.get(name=name, version=version) + print( + f"Data asset already exists. Name: {dataset.name}, version: {dataset.version}" + ) + except: + dataset = ml_client.data.create_or_update(my_data) + print(f"Data asset created. Name: {dataset.name}, version: {dataset.version}") + + return dataset \ No newline at end of file diff --git a/configs/zephyr-7b-beta/setup/project-settings.json b/configs/zephyr-7b-beta/setup/project-settings.json index 607e863..ee4e66a 100644 --- a/configs/zephyr-7b-beta/setup/project-settings.json +++ b/configs/zephyr-7b-beta/setup/project-settings.json @@ -26,9 +26,11 @@ { "type": "String", "label": "Dataset name:", - "info":"Dataset to train the model from a local file.", + "info": "Dataset to train the model from a local file.", "replaceToken": "", - "optionValues": ["dataset/dataset-classification.json"], + "optionValues": [ + "dataset/dataset-classification.json" + ], "defaultValue": "dataset/dataset-classification.json" }, { @@ -50,7 +52,10 @@ "label": "Text columns:", "info": "Columns that match your dataset to populate the training prompt.", "replaceToken": "", - "defaultValue": [ "phrase", "tone" ] + "defaultValue": [ + "phrase", + "tone" + ] }, { "type": "String", @@ -65,7 +70,10 @@ "info": "Do you want to join the samples or process them one by one.", "replaceToken": "", "defaultValue": "join", - "optionValues": ["line-by-line", "join"] + "optionValues": [ + "line-by-line", + "join" + ] }, { "type": "Integer", @@ -73,7 +81,7 @@ "info": "Max numbers of tokens per traning sample.", "replaceToken": "", "defaultValue": 1024 - }, + }, { "type": "Boolean", "label": "Pad to max length:", @@ -92,7 +100,10 @@ "label": "Compute dtype:", "info": "Data type for model weights and adapter weights.", "replaceToken": "", - "optionValues": ["bfloat16", "float16"], + "optionValues": [ + "bfloat16", + "float16" + ], "defaultValue": "bfloat16" }, { @@ -100,113 +111,132 @@ "label": "Quant type:", "info": "Quantization data type to use. Should be one of fp4 or nf4.", "replaceToken": "", - "optionValues": ["nf4", "fp4"], - "defaultValue": "nf4" + "optionValues": [ + "nf4", + "fp4" + ], + "defaultValue": "nf4", + "learnMore": "Can you tell me more about the Hugging Face trainer parameter quant_type?" }, { "type": "Boolean", "label": "Double quant:", "info": "Whether to use nested quantization where the quantization constants from the first quantization are quantized again.", "replaceToken": "", - "defaultValue": true + "defaultValue": true, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter double_quant?" }, { "type": "Integer", "label": "Lora r:", "info": "Lora attention dimension.", "replaceToken": "", - "defaultValue": 32 + "defaultValue": 32, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter lora_r?" }, { "type": "Integer", "label": "Lora alpha:", "info": "The alpha parameter for Lora scaling", "replaceToken": "", - "defaultValue": 64 + "defaultValue": 64, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter lora_alpha?" }, { "type": "Number", "label": "Lora dropout:", "info": "The dropout probability for Lora layers", "replaceToken": "", - "defaultValue": 0.1 + "defaultValue": 0.1, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter lora_dropout?" }, { "type": "Integer", "label": "Eval dataset size:", "info": "Size of the validation dataset, a number or 0-1 percentage.", "replaceToken": "", - "defaultValue": 0.3 - }, + "defaultValue": 0.3, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter eval_dataset_size?" + }, { "type": "Integer", "label": "Seed:", "info": "Random seed for initialization.", "replaceToken": "", - "defaultValue": 0 + "defaultValue": 0, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter training_args_seed?" }, { "type": "Integer", "label": "Data seed:", "info": "Random seed to be used with data samplers.", "replaceToken": "", - "defaultValue": 42 + "defaultValue": 42, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter training_args_data_seed?" }, { "type": "Integer", "label": "Per device train batch size:", "info": "The batch size per GPU for training.", "replaceToken": "", - "defaultValue": 8 + "defaultValue": 8, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter per_device_train_batch_size?" }, { "type": "Integer", "label": "Per device eval batch size:", "info": "The batch size per GPU for evaluation.", "replaceToken": "", - "defaultValue": 8 + "defaultValue": 8, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter per_device_eval_batch_size?" }, { "type": "Integer", "label": "Gradient accumulation steps:", "info": "Number of updates steps to accumulate the gradients for, before performing a backward/update pass", "replaceToken": "", - "defaultValue": 4 + "defaultValue": 4, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter gradient_accumulation_steps?" }, { "type": "Boolean", "label": "Enable gradient checkpointing:", "info": "Use gradient checkpointing. Recommended to save the memory.", "replaceToken": "", - "defaultValue": true + "defaultValue": true, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter gradient_checkpointing?" }, { "type": "Number", "label": "Learning rate:", "info": "The initial learning rate for AdamW", "replaceToken": "", - "defaultValue": 0.0002 + "defaultValue": 0.0002, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter learning_rate?" }, { "type": "Integer", "label": "Number of epochs:", "info": "How many complete passes the model will make over the entire training dataset.", "replaceToken": "", - "defaultValue": 3 + "defaultValue": 3, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter num_train_epochs?" }, { "type": "Integer", "label": "Max steps:", "info":"Training will be stopped when this number of steps is reached, regardless of the number of epochs.", "replaceToken": "", - "defaultValue": 100 + "defaultValue": 100, + "learnMore": "Can you tell me more about the Hugging Face trainer parameter max_steps?" }, { "type": "String", "label": "Checkpoint output dir", "info": "Directory to save the checkpoints.", "replaceToken": "", - "defaultValue": "models/checkpoints" + "defaultValue": "models/checkpoints", + "learnMore": "Can you tell me more about the Hugging Face trainer parameter output_dir?" } ] } diff --git a/configs/zephyr-7b-beta/setup/requirements.txt b/configs/zephyr-7b-beta/setup/requirements.txt index a55d10c..4cd3939 100644 --- a/configs/zephyr-7b-beta/setup/requirements.txt +++ b/configs/zephyr-7b-beta/setup/requirements.txt @@ -2,20 +2,33 @@ torch torchvision torchaudio -transformers==4.34.1 +packaging +datasets==2.14.5 +transformers==4.36.2 accelerate==0.23.0 bitsandbytes==0.41.1 -datasets==2.14.5 peft==0.5.0 scikit-learn==1.3.1 sentencepiece==0.1.99 trl==0.7.2 -protobuf==3.20.3 +protobuf ipykernel==6.25.2 wandb==0.15.12 -onnxruntime-gpu==1.16.1 einops -olive-ai==0.3.3 promptflow==0.1.0b8 promptflow-tools==0.1.0b10 gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf From 422cffdaeec5ca637d3c454a4f480cfd21b5d69d Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Tue, 13 Feb 2024 00:23:42 -0800 Subject: [PATCH 15/19] Adding llama2 azure support --- configs/llama-v2-7b/.vscode/launch.json | 2 +- configs/llama-v2-7b/README.md | 39 ++++++ .../llama-v2-7b/finetuning/aml_config.json | 8 ++ .../wais_llama_2_7b_env/Dockerfile | 31 +++++ .../wais_llama_2_7b_env/requirements.txt | 34 +++++ .../llama-v2-7b/finetuning/invoke_olive.py | 104 ++++++++++++++- .../olive-config-azureml-template.json | 119 +++++++++++++++++ .../olive-config-azureml_template.json | 120 ++++++++++++++++++ .../llama-v2-7b/finetuning/olive-config.json | 5 +- configs/llama-v2-7b/finetuning/utils.py | 71 +++++++++++ configs/llama-v2-7b/setup/requirements.txt | 23 +++- 11 files changed, 546 insertions(+), 10 deletions(-) create mode 100644 configs/llama-v2-7b/finetuning/aml_config.json create mode 100644 configs/llama-v2-7b/finetuning/docker-contexts/wais_llama_2_7b_env/Dockerfile create mode 100644 configs/llama-v2-7b/finetuning/docker-contexts/wais_llama_2_7b_env/requirements.txt create mode 100644 configs/llama-v2-7b/finetuning/olive-config-azureml-template.json create mode 100644 configs/llama-v2-7b/finetuning/olive-config-azureml_template.json create mode 100644 configs/llama-v2-7b/finetuning/utils.py diff --git a/configs/llama-v2-7b/.vscode/launch.json b/configs/llama-v2-7b/.vscode/launch.json index 2b2502c..74565fe 100644 --- a/configs/llama-v2-7b/.vscode/launch.json +++ b/configs/llama-v2-7b/.vscode/launch.json @@ -6,7 +6,7 @@ "configurations": [ { "name": "Python: Current File", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", diff --git a/configs/llama-v2-7b/README.md b/configs/llama-v2-7b/README.md index a06b31a..a90f4f3 100644 --- a/configs/llama-v2-7b/README.md +++ b/configs/llama-v2-7b/README.md @@ -23,6 +23,45 @@ cd inference python gradio_chat.py --baseonly ``` +### Model fine-tuning and inferencing + +Once the workspace is opened in a dev container, open a terminal (the default path is project root), then run the command below to fine tune a LLM on the selected dataset. + +```bash +python finetuning/invoke_olive.py +``` + +### Model fine-tuning on Azure + +Please make sure that you have created the Azure ML workspace according to the following directions: +https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources?view=azureml-api-2 + +Create Compute cluster using GPU nodes (https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu?view=azureml-api-2): + +https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?view=azureml-api-2&tabs=azure-studio + +Setup "hf-token" in the Keyvault associated with the Azure ML workspace. +Get HuggingFace access token from your HuggingFace account. +1. Get your Huggingface token string from Settings -> [Access Tokens](https://huggingface.co/settings/tokens). Please, make sure that your account has access to any gated model that you will access for finetuning. +2. Create or use an existing [Azure Key Vault](https://learn.microsoft.com/en-us/azure/key-vault/general/overview). Assume the key vault is named `keyvault_name`. Add a new secret named `hf-token`, and set the value as the token from the first step. It is important to note that `hf-token` secret name is reserved specifically for Huggingface login. Do not use this name in this keyvault for other purpose. +3. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +a. Create a managed identity for the compute cluster. +b. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +https://learn.microsoft.com/en-us/azure/key-vault/general/assign-access-policy?tabs=azure-portal + +Update finetuning/aml_config.json with correct values. + +Once the workspace is opened in a dev container, open a terminal (the default path is project root). + +```bash +az login +python finetuning/invoke_olive.py --azure --aml_config finetuning/aml_config.json +``` + +This will submit an AML job which can be monitored using the link printed by the program. After training finishes, the trained model can be downloaded from the outputs folder. + + + Checkpoints and final model will be saved in `models` folder. Next run inferencing with the fune-tuned model through chats in a `console`, `web browser` or `prompt flow`. diff --git a/configs/llama-v2-7b/finetuning/aml_config.json b/configs/llama-v2-7b/finetuning/aml_config.json new file mode 100644 index 0000000..79bf87f --- /dev/null +++ b/configs/llama-v2-7b/finetuning/aml_config.json @@ -0,0 +1,8 @@ +{ + "subscription_id": "", + "resource_group": "", + "workspace_name": "", + "aml_compute_name": "", + "hf_token": "", + "keyvault_name": "" +} \ No newline at end of file diff --git a/configs/llama-v2-7b/finetuning/docker-contexts/wais_llama_2_7b_env/Dockerfile b/configs/llama-v2-7b/finetuning/docker-contexts/wais_llama_2_7b_env/Dockerfile new file mode 100644 index 0000000..29c2bd3 --- /dev/null +++ b/configs/llama-v2-7b/finetuning/docker-contexts/wais_llama_2_7b_env/Dockerfile @@ -0,0 +1,31 @@ +FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:20240109.v1 + +ENV DEBIAN_FRONTEND=noninteractive +ENV ACCEPT_EULA=Y + +# Build python3.9 as the default python +RUN apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y --no-install-recommends \ + software-properties-common gnupg\ + && add-apt-repository -y ppa:deadsnakes \ + && apt-get install -y --no-install-recommends \ + python3.9-venv \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* && \ + python3.9 -m venv /venv + +ENV PATH=/venv/bin:$PATH + +# Install TensorRT +RUN v="8.4.1-1+cuda11.6" &&\ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ + apt-get -y update && \ + apt-get -y upgrade && \ + apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ + libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ + python3-libnvinfer=${v} libnvinfer-samples=${v} + +RUN pip install --upgrade pip && pip install --upgrade setuptools +COPY requirements.txt . +RUN pip install -r requirements.txt --no-cache-dir diff --git a/configs/llama-v2-7b/finetuning/docker-contexts/wais_llama_2_7b_env/requirements.txt b/configs/llama-v2-7b/finetuning/docker-contexts/wais_llama_2_7b_env/requirements.txt new file mode 100644 index 0000000..4cd3939 --- /dev/null +++ b/configs/llama-v2-7b/finetuning/docker-contexts/wais_llama_2_7b_env/requirements.txt @@ -0,0 +1,34 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +torch +torchvision +torchaudio +packaging +datasets==2.14.5 +transformers==4.36.2 +accelerate==0.23.0 +bitsandbytes==0.41.1 +peft==0.5.0 +scikit-learn==1.3.1 +sentencepiece==0.1.99 +trl==0.7.2 +protobuf +ipykernel==6.25.2 +wandb==0.15.12 +einops +promptflow==0.1.0b8 +promptflow-tools==0.1.0b10 +gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf diff --git a/configs/llama-v2-7b/finetuning/invoke_olive.py b/configs/llama-v2-7b/finetuning/invoke_olive.py index 261b160..58c7259 100644 --- a/configs/llama-v2-7b/finetuning/invoke_olive.py +++ b/configs/llama-v2-7b/finetuning/invoke_olive.py @@ -1,6 +1,104 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import argparse +import sys +import json +from utils import get_aml_client, create_dataset +from azure.ai.ml.entities import Environment, BuildContext +import re + from olive.workflows import run as olive_run import os +import argparse +import sys + +def parse_aml_config(aml_config): + """Parse the AML config to make sure the required fields are present""" + with open(aml_config, 'r') as file: + aml_config = json.load(file) + + try: + subscription_id = aml_config["subscription_id"] + resource_group = aml_config["resource_group"] + workspace_name = aml_config["workspace_name"] + aml_compute_name = aml_config["aml_compute_name"] + hf_token = aml_config["hf_token"] + keyvault_name = aml_config["keyvault_name"] + except KeyError as e: + print(f"KeyError: {e} not found in aml_config.json") + sys.exit(1) + + return aml_config + + +def main(): + """Main function of the script.""" + + # input and output arguments + parser = argparse.ArgumentParser() + parser.add_argument("--azure", required=False, action="store_true", help="runs the training on azure when this option is enabled") + parser.add_argument("--aml_config", type=str, required='--azure' in sys.argv, help="aml config (update and use aml_config.json) for azure subscription/workspace details") + + args = parser.parse_args() + + # Run olive from file locally + if not args.azure: + file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') + olive_run(file_path) + else: + dataset_local_path = "dataset/dataset-classification.json" + dataset_name = "llama_2_7b_train_dataset" + dataset_version = "1" + docker_context_path = "finetuning/docker-contexts/wais_llama_2_7b_env" + azure_olive_config_template_path = "finetuning/olive-config-azureml-template.json" + azure_olive_config_path = "finetuning/olive-config-azureml.json" + azure_environment_name = "wais_llama_2_7b_env" + + aml_config = parse_aml_config(args.aml_config) + # Get the AML client + ml_client = get_aml_client(aml_config["subscription_id"], aml_config["resource_group"], aml_config["workspace_name"]) + + # Create the environment + print("Creating the environment...") + env_docker_context = Environment( + build=BuildContext(path=docker_context_path), # Path to the Docker context + name=azure_environment_name, + description="Environment created from a Docker context for training llama_2_7b model using Olive.", + ) + aml_env = ml_client.environments.create_or_update(env_docker_context) + print("The environment {} was created successfully.".format(aml_env.name)) + + # Create the dataset + print("Creating the dataset...") + + description = "Train dataset for tone classification model." + dataset = create_dataset(ml_client, local_path=dataset_local_path, name=dataset_name, version=dataset_version, description=description) + print("The dataset {} was created successfully.".format(dataset.name)) + dataset_relative_path = re.split("/datastores/.*/paths/", dataset.path)[-1] + + # Update the olive-config-azureml.json + with open(azure_olive_config_template_path, 'r') as file: + olive_config = json.load(file) + try: + olive_config["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["azureml_client"]["aml_config_path"] = args.aml_config + olive_config["data_configs"]["dataset_default_train"]["params_config"]["data_files"]["config"]["relative_path"] = dataset_relative_path + olive_config["systems"]["aml_system"]["config"]["aml_compute"] = aml_config["aml_compute_name"] + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["name"] = aml_env.name + olive_config["systems"]["aml_system"]["config"]["aml_environment_config"]["version"] = aml_env.version + olive_config["azureml_client"]["keyvault_name"] = aml_config["keyvault_name"] + except KeyError as e: + print(f"KeyError: {e} not found in olive-config-azureml.json") + sys.exit(1) + + with open(azure_olive_config_path, 'w') as file: + json.dump(olive_config, file, indent=4) + + # Run olive from file for debug. + file_path = os.path.join(os.getcwd(), azure_olive_config_path) + olive_run(file_path) + -# Run olive from file for debug. -file_path = os.path.join(os.getcwd(), 'finetuning/olive-config.json') -olive_run(file_path) \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/configs/llama-v2-7b/finetuning/olive-config-azureml-template.json b/configs/llama-v2-7b/finetuning/olive-config-azureml-template.json new file mode 100644 index 0000000..48e1945 --- /dev/null +++ b/configs/llama-v2-7b/finetuning/olive-config-azureml-template.json @@ -0,0 +1,119 @@ +{ + "azureml_client": { + "aml_config_path": "", + "read_timeout": 4000, + "max_operation_retries": 4, + "operation_retry_interval": 5, + "keyvault_name" : "" + }, + "systems": { + "local_system": { + "type": "LocalSystem" + }, + "aml_system": { + "type": "AzureML", + "config": { + "aml_compute": "", + "accelerators": [ + "gpu" + ], + "hf_token": true, + "aml_environment_config": { + "name": "", + "version": "" + } + } + } + }, + "input_model": { + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "meta-llama/Llama-2-7b-hf", + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } + } + } + }, + "data_configs": { + "dataset_default_train": { + "name": "dataset_default", + "type": "HuggingfaceContainer", + "params_config": { +#data_configs_data_files_extension_start + +#data_configs_data_files_extension_end + "data_files": { + "type": "azureml_datastore", + "config": { + "azureml_client": + { + "aml_config_path": "" + }, + "datastore_name": "workspaceblobstore", + "relative_path": "" + } + }, + "split": "", + "component_kwargs": { + "pre_process_data": { + "dataset_type": "", + "text_cols": , + "text_template": "", + "corpus_strategy": "", + "source_max_len": , + "pad_to_max_len": , + "use_attention_mask": false + } + } + } + } + }, + "passes": { + "qlora": { + "type": "QLoRA", + "config": { + "compute_dtype": "", + "quant_type": "", + "double_quant": , + "lora_r": , + "lora_alpha": , + "lora_dropout": , + "train_data_config": "dataset_default_train", + "eval_dataset_size": , + "training_args": { + "seed": , + "data_seed": , + "per_device_train_batch_size": , + "per_device_eval_batch_size": , + "gradient_accumulation_steps": , + "gradient_checkpointing": , + "learning_rate": , + "num_train_epochs":, + "max_steps": , + "logging_steps": 10, + "evaluation_strategy": "steps", + "eval_steps": 187, + "group_by_length": true, + "adam_beta2": 0.999, + "max_grad_norm": 0.3, + "output_dir": "outputs/" + } + } + } + }, + "engine": { + "log_severity_level": 0, + "search_strategy": false, + "evaluate_input_model": false, + "execution_providers": [ + "CUDAExecutionProvider" + ], + "host": "aml_system", + "cache_dir": "model_cache", + "output_dir": "outputs/models/qlora", + "target": "aml_system" + } +} diff --git a/configs/llama-v2-7b/finetuning/olive-config-azureml_template.json b/configs/llama-v2-7b/finetuning/olive-config-azureml_template.json new file mode 100644 index 0000000..666a8ad --- /dev/null +++ b/configs/llama-v2-7b/finetuning/olive-config-azureml_template.json @@ -0,0 +1,120 @@ +{ + "azureml_client": { + "aml_config_path": "", + "read_timeout": 4000, + "max_operation_retries": 4, + "operation_retry_interval": 5, + "keyvault_name" : "" + }, + "systems": { + "local_system": { + "type": "LocalSystem" + }, + "aml_system": { + "type": "AzureML", + "config": { + "aml_compute": "", + "accelerators": [ + "gpu" + ], + "hf_token": true, + "aml_environment_config": { + "name": "", + "version": "" + } + } + } + }, + "input_model": { + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "meta-llama/Llama-2-7b-hf", + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } + } + } + }, + "data_configs": { + "dataset_default_train": { + "name": "dataset_default", + "type": "HuggingfaceContainer", + "params_config": { + "data_name": "json", + "data_files": { + "type": "azureml_datastore", + "config": { + "azureml_client": + { + "aml_config_path": "" + }, + "datastore_name": "workspaceblobstore", + "relative_path": "" + } + }, + "split": "train", + "component_kwargs": { + "pre_process_data": { + "dataset_type": "corpus", + "text_cols": [ + "phrase", + "tone" + ], + "text_template": "### Text: {phrase}\n### The tone is:\n{tone}", + "corpus_strategy": "join", + "source_max_len": 1024, + "pad_to_max_len": false, + "use_attention_mask": false + } + } + } + } + }, + "passes": { + "qlora": { + "type": "QLoRA", + "config": { + "compute_dtype": "bfloat16", + "quant_type": "nf4", + "double_quant": true, + "lora_r": 64, + "lora_alpha": 64, + "lora_dropout": 0.1, + "train_data_config": "dataset_default_train", + "eval_dataset_size": 0.3, + "training_args": { + "seed": 0, + "data_seed": 42, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "gradient_accumulation_steps": 4, + "gradient_checkpointing": true, + "learning_rate": 0.0001, + "num_train_epochs": 3, + "max_steps": 1200, + "logging_steps": 10, + "evaluation_strategy": "steps", + "eval_steps": 187, + "group_by_length": true, + "adam_beta2": 0.999, + "max_grad_norm": 0.3, + "output_dir": "outputs/models/checkpoints" + } + } + } + }, + "engine": { + "log_severity_level": 0, + "search_strategy": false, + "evaluate_input_model": false, + "execution_providers": [ + "CUDAExecutionProvider" + ], + "host": "aml_system", + "cache_dir": "model_cache", + "output_dir": "outputs/models/qlora", + "target": "aml_system" + } +} \ No newline at end of file diff --git a/configs/llama-v2-7b/finetuning/olive-config.json b/configs/llama-v2-7b/finetuning/olive-config.json index 9461fec..f06cd93 100644 --- a/configs/llama-v2-7b/finetuning/olive-config.json +++ b/configs/llama-v2-7b/finetuning/olive-config.json @@ -4,7 +4,10 @@ "config": { "hf_config": { "model_name": "model-cache/meta-llama/Llama-2-7b", - "task": "text-generation" + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } } } }, diff --git a/configs/llama-v2-7b/finetuning/utils.py b/configs/llama-v2-7b/finetuning/utils.py new file mode 100644 index 0000000..86c6275 --- /dev/null +++ b/configs/llama-v2-7b/finetuning/utils.py @@ -0,0 +1,71 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +# Handle to the workspace +from azure.ai.ml import MLClient + +# Authentication package +from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential +from azure.ai.ml.entities import Data +from azure.ai.ml.constants import AssetTypes + +from azure.ai.ml import command +from azure.ai.ml import Input, Output + + +def get_aml_client(subscription_id, resource_group_name, workspace_name): + """ + Get an Azure Machine Learning client instance. + + Args: + subscription_id (str): The Azure subscription ID. + resource_group_name (str): The name of the resource group. + workspace_name (str): The name of the Azure Machine Learning workspace. + + Returns: + MLClient: An instance of the Azure Machine Learning client. + """ + credential = DefaultAzureCredential(exclude_interactive_browser_credential=False) + + # Create and return MLClient instance + return MLClient(credential, + subscription_id=subscription_id, + resource_group_name=resource_group_name, + workspace_name=workspace_name + ) + + +def create_dataset(ml_client, local_path, name, version, description=""): + """ + Creates a data asset using the specified ML client, local path, name, version, and optional description. + + Args: + ml_client (MLClient): The ML client used to interact with the ML service. + local_path (str): The local path of the data asset. + name (str): The name of the data asset. + version (str): The version of the data asset. + description (str, optional): The description of the data asset. Defaults to "". + + Returns: + Dataset (Dataset): Registered dataset with the given name and version. + """ + + my_data = Data( + name=name, + version=version, + description=description, + path=local_path, + type=AssetTypes.URI_FILE, + ) + + ## create data asset if it doesn't already exist: + try: + dataset = ml_client.data.get(name=name, version=version) + print( + f"Data asset already exists. Name: {dataset.name}, version: {dataset.version}" + ) + except: + dataset = ml_client.data.create_or_update(my_data) + print(f"Data asset created. Name: {dataset.name}, version: {dataset.version}") + + return dataset \ No newline at end of file diff --git a/configs/llama-v2-7b/setup/requirements.txt b/configs/llama-v2-7b/setup/requirements.txt index a55d10c..44fe7db 100644 --- a/configs/llama-v2-7b/setup/requirements.txt +++ b/configs/llama-v2-7b/setup/requirements.txt @@ -2,20 +2,33 @@ torch torchvision torchaudio -transformers==4.34.1 +packaging +datasets==2.14.5 +transformers==4.36.2 accelerate==0.23.0 bitsandbytes==0.41.1 -datasets==2.14.5 peft==0.5.0 scikit-learn==1.3.1 sentencepiece==0.1.99 trl==0.7.2 -protobuf==3.20.3 +protobuf ipykernel==6.25.2 wandb==0.15.12 -onnxruntime-gpu==1.16.1 einops -olive-ai==0.3.3 promptflow==0.1.0b8 promptflow-tools==0.1.0b10 gradio +azureml-core==1.54.0 +azure-ai-ml==1.12.1 +azureml-fsspec +azureml-mlflow +azure-identity +azure-keyvault +azure-keyvault-secrets +mlflow +docker +onnxruntime-gpu +scipy +onnxruntime-training +torch-ort +git+https://github.com/microsoft/Olive.git@3c25717980ad743e802c16a8e4c831b6196f68bf \ No newline at end of file From b01fae5c727b03f550fa474d6ac5cd875f955cac Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Tue, 13 Feb 2024 00:35:29 -0800 Subject: [PATCH 16/19] Updating ReadMe.md for all --- configs/llama-v2-7b/README.md | 5 +++-- configs/mistral-7b/README.md | 32 +++++++++++++++++++++++++++++++ configs/phi-1_5/README.md | 33 ++++++++++++++++++++++++++++++++ configs/phi-2/README.md | 32 +++++++++++++++++++++++++++++++ configs/zephyr-7b-beta/README.md | 32 +++++++++++++++++++++++++++++++ 5 files changed, 132 insertions(+), 2 deletions(-) diff --git a/configs/llama-v2-7b/README.md b/configs/llama-v2-7b/README.md index a90f4f3..bf73f10 100644 --- a/configs/llama-v2-7b/README.md +++ b/configs/llama-v2-7b/README.md @@ -31,6 +31,8 @@ Once the workspace is opened in a dev container, open a terminal (the default pa python finetuning/invoke_olive.py ``` +Checkpoints and final model will be saved in `models` folder. + ### Model fine-tuning on Azure Please make sure that you have created the Azure ML workspace according to the following directions: @@ -61,8 +63,7 @@ python finetuning/invoke_olive.py --azure --aml_config finetuning/aml_config.jso This will submit an AML job which can be monitored using the link printed by the program. After training finishes, the trained model can be downloaded from the outputs folder. - -Checkpoints and final model will be saved in `models` folder. +### Inference Next run inferencing with the fune-tuned model through chats in a `console`, `web browser` or `prompt flow`. diff --git a/configs/mistral-7b/README.md b/configs/mistral-7b/README.md index f4e36ce..bf73f10 100644 --- a/configs/mistral-7b/README.md +++ b/configs/mistral-7b/README.md @@ -33,6 +33,38 @@ python finetuning/invoke_olive.py Checkpoints and final model will be saved in `models` folder. +### Model fine-tuning on Azure + +Please make sure that you have created the Azure ML workspace according to the following directions: +https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources?view=azureml-api-2 + +Create Compute cluster using GPU nodes (https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu?view=azureml-api-2): + +https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?view=azureml-api-2&tabs=azure-studio + +Setup "hf-token" in the Keyvault associated with the Azure ML workspace. +Get HuggingFace access token from your HuggingFace account. +1. Get your Huggingface token string from Settings -> [Access Tokens](https://huggingface.co/settings/tokens). Please, make sure that your account has access to any gated model that you will access for finetuning. +2. Create or use an existing [Azure Key Vault](https://learn.microsoft.com/en-us/azure/key-vault/general/overview). Assume the key vault is named `keyvault_name`. Add a new secret named `hf-token`, and set the value as the token from the first step. It is important to note that `hf-token` secret name is reserved specifically for Huggingface login. Do not use this name in this keyvault for other purpose. +3. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +a. Create a managed identity for the compute cluster. +b. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +https://learn.microsoft.com/en-us/azure/key-vault/general/assign-access-policy?tabs=azure-portal + +Update finetuning/aml_config.json with correct values. + +Once the workspace is opened in a dev container, open a terminal (the default path is project root). + +```bash +az login +python finetuning/invoke_olive.py --azure --aml_config finetuning/aml_config.json +``` + +This will submit an AML job which can be monitored using the link printed by the program. After training finishes, the trained model can be downloaded from the outputs folder. + + +### Inference + Next run inferencing with the fune-tuned model through chats in a `console`, `web browser` or `prompt flow`. ```bash diff --git a/configs/phi-1_5/README.md b/configs/phi-1_5/README.md index f4e36ce..916b3d1 100644 --- a/configs/phi-1_5/README.md +++ b/configs/phi-1_5/README.md @@ -33,6 +33,39 @@ python finetuning/invoke_olive.py Checkpoints and final model will be saved in `models` folder. + +### Model fine-tuning on Azure + +Please make sure that you have created the Azure ML workspace according to the following directions: +https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources?view=azureml-api-2 + +Create Compute cluster using GPU nodes (https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu?view=azureml-api-2): + +https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?view=azureml-api-2&tabs=azure-studio + +Setup "hf-token" in the Keyvault associated with the Azure ML workspace. +Get HuggingFace access token from your HuggingFace account. +1. Get your Huggingface token string from Settings -> [Access Tokens](https://huggingface.co/settings/tokens). Please, make sure that your account has access to any gated model that you will access for finetuning. +2. Create or use an existing [Azure Key Vault](https://learn.microsoft.com/en-us/azure/key-vault/general/overview). Assume the key vault is named `keyvault_name`. Add a new secret named `hf-token`, and set the value as the token from the first step. It is important to note that `hf-token` secret name is reserved specifically for Huggingface login. Do not use this name in this keyvault for other purpose. +3. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +a. Create a managed identity for the compute cluster. +b. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +https://learn.microsoft.com/en-us/azure/key-vault/general/assign-access-policy?tabs=azure-portal + +Update finetuning/aml_config.json with correct values. + +Once the workspace is opened in a dev container, open a terminal (the default path is project root). + +```bash +az login +python finetuning/invoke_olive.py --azure --aml_config finetuning/aml_config.json +``` + +This will submit an AML job which can be monitored using the link printed by the program. After training finishes, the trained model can be downloaded from the outputs folder. + + +### Inference + Next run inferencing with the fune-tuned model through chats in a `console`, `web browser` or `prompt flow`. ```bash diff --git a/configs/phi-2/README.md b/configs/phi-2/README.md index f4e36ce..bf73f10 100644 --- a/configs/phi-2/README.md +++ b/configs/phi-2/README.md @@ -33,6 +33,38 @@ python finetuning/invoke_olive.py Checkpoints and final model will be saved in `models` folder. +### Model fine-tuning on Azure + +Please make sure that you have created the Azure ML workspace according to the following directions: +https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources?view=azureml-api-2 + +Create Compute cluster using GPU nodes (https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu?view=azureml-api-2): + +https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?view=azureml-api-2&tabs=azure-studio + +Setup "hf-token" in the Keyvault associated with the Azure ML workspace. +Get HuggingFace access token from your HuggingFace account. +1. Get your Huggingface token string from Settings -> [Access Tokens](https://huggingface.co/settings/tokens). Please, make sure that your account has access to any gated model that you will access for finetuning. +2. Create or use an existing [Azure Key Vault](https://learn.microsoft.com/en-us/azure/key-vault/general/overview). Assume the key vault is named `keyvault_name`. Add a new secret named `hf-token`, and set the value as the token from the first step. It is important to note that `hf-token` secret name is reserved specifically for Huggingface login. Do not use this name in this keyvault for other purpose. +3. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +a. Create a managed identity for the compute cluster. +b. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +https://learn.microsoft.com/en-us/azure/key-vault/general/assign-access-policy?tabs=azure-portal + +Update finetuning/aml_config.json with correct values. + +Once the workspace is opened in a dev container, open a terminal (the default path is project root). + +```bash +az login +python finetuning/invoke_olive.py --azure --aml_config finetuning/aml_config.json +``` + +This will submit an AML job which can be monitored using the link printed by the program. After training finishes, the trained model can be downloaded from the outputs folder. + + +### Inference + Next run inferencing with the fune-tuned model through chats in a `console`, `web browser` or `prompt flow`. ```bash diff --git a/configs/zephyr-7b-beta/README.md b/configs/zephyr-7b-beta/README.md index f4e36ce..bf73f10 100644 --- a/configs/zephyr-7b-beta/README.md +++ b/configs/zephyr-7b-beta/README.md @@ -33,6 +33,38 @@ python finetuning/invoke_olive.py Checkpoints and final model will be saved in `models` folder. +### Model fine-tuning on Azure + +Please make sure that you have created the Azure ML workspace according to the following directions: +https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources?view=azureml-api-2 + +Create Compute cluster using GPU nodes (https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu?view=azureml-api-2): + +https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?view=azureml-api-2&tabs=azure-studio + +Setup "hf-token" in the Keyvault associated with the Azure ML workspace. +Get HuggingFace access token from your HuggingFace account. +1. Get your Huggingface token string from Settings -> [Access Tokens](https://huggingface.co/settings/tokens). Please, make sure that your account has access to any gated model that you will access for finetuning. +2. Create or use an existing [Azure Key Vault](https://learn.microsoft.com/en-us/azure/key-vault/general/overview). Assume the key vault is named `keyvault_name`. Add a new secret named `hf-token`, and set the value as the token from the first step. It is important to note that `hf-token` secret name is reserved specifically for Huggingface login. Do not use this name in this keyvault for other purpose. +3. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +a. Create a managed identity for the compute cluster. +b. Give the compute cluster access to the keyvault `keyvault_name` using the following instructions: +https://learn.microsoft.com/en-us/azure/key-vault/general/assign-access-policy?tabs=azure-portal + +Update finetuning/aml_config.json with correct values. + +Once the workspace is opened in a dev container, open a terminal (the default path is project root). + +```bash +az login +python finetuning/invoke_olive.py --azure --aml_config finetuning/aml_config.json +``` + +This will submit an AML job which can be monitored using the link printed by the program. After training finishes, the trained model can be downloaded from the outputs folder. + + +### Inference + Next run inferencing with the fune-tuned model through chats in a `console`, `web browser` or `prompt flow`. ```bash From b26ed4a80b0b42a3b8041cbfdd5e707e3b962b1d Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Tue, 13 Feb 2024 12:40:49 -0800 Subject: [PATCH 17/19] Updating launch.json --- configs/llama-v2-7b/.vscode/launch.json | 23 ++++++++++++++++++++ configs/mistral-7b/.vscode/launch.json | 25 +++++++++++++++++++++- configs/phi-1_5/.vscode/launch.json | 23 ++++++++++++++++++++ configs/phi-2/.vscode/launch.json | 23 ++++++++++++++++++++ configs/zephyr-7b-beta/.vscode/launch.json | 23 ++++++++++++++++++++ 5 files changed, 116 insertions(+), 1 deletion(-) diff --git a/configs/llama-v2-7b/.vscode/launch.json b/configs/llama-v2-7b/.vscode/launch.json index 74565fe..15196a1 100644 --- a/configs/llama-v2-7b/.vscode/launch.json +++ b/configs/llama-v2-7b/.vscode/launch.json @@ -11,6 +11,29 @@ "program": "${file}", "console": "integratedTerminal", "justMyCode": false + }, + { + "name": "Python: invoke_finetune_azure", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + "--azure", + "--aml_config", + "finetuning/aml_config.json", + ] + }, + { + "name": "Python: invoke_finetune_local", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + ] } ] } \ No newline at end of file diff --git a/configs/mistral-7b/.vscode/launch.json b/configs/mistral-7b/.vscode/launch.json index 2b2502c..15196a1 100644 --- a/configs/mistral-7b/.vscode/launch.json +++ b/configs/mistral-7b/.vscode/launch.json @@ -6,11 +6,34 @@ "configurations": [ { "name": "Python: Current File", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", "justMyCode": false + }, + { + "name": "Python: invoke_finetune_azure", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + "--azure", + "--aml_config", + "finetuning/aml_config.json", + ] + }, + { + "name": "Python: invoke_finetune_local", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + ] } ] } \ No newline at end of file diff --git a/configs/phi-1_5/.vscode/launch.json b/configs/phi-1_5/.vscode/launch.json index 74565fe..15196a1 100644 --- a/configs/phi-1_5/.vscode/launch.json +++ b/configs/phi-1_5/.vscode/launch.json @@ -11,6 +11,29 @@ "program": "${file}", "console": "integratedTerminal", "justMyCode": false + }, + { + "name": "Python: invoke_finetune_azure", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + "--azure", + "--aml_config", + "finetuning/aml_config.json", + ] + }, + { + "name": "Python: invoke_finetune_local", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + ] } ] } \ No newline at end of file diff --git a/configs/phi-2/.vscode/launch.json b/configs/phi-2/.vscode/launch.json index 74565fe..15196a1 100644 --- a/configs/phi-2/.vscode/launch.json +++ b/configs/phi-2/.vscode/launch.json @@ -11,6 +11,29 @@ "program": "${file}", "console": "integratedTerminal", "justMyCode": false + }, + { + "name": "Python: invoke_finetune_azure", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + "--azure", + "--aml_config", + "finetuning/aml_config.json", + ] + }, + { + "name": "Python: invoke_finetune_local", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + ] } ] } \ No newline at end of file diff --git a/configs/zephyr-7b-beta/.vscode/launch.json b/configs/zephyr-7b-beta/.vscode/launch.json index 74565fe..15196a1 100644 --- a/configs/zephyr-7b-beta/.vscode/launch.json +++ b/configs/zephyr-7b-beta/.vscode/launch.json @@ -11,6 +11,29 @@ "program": "${file}", "console": "integratedTerminal", "justMyCode": false + }, + { + "name": "Python: invoke_finetune_azure", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + "--azure", + "--aml_config", + "finetuning/aml_config.json", + ] + }, + { + "name": "Python: invoke_finetune_local", + "type": "debugpy", + "request": "launch", + "program": "finetuning/invoke_olive.py", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + ] } ] } \ No newline at end of file From 75ac4df3afb5f7ddd4b57ca90d81f64c9cfe7505 Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Tue, 13 Feb 2024 18:57:54 -0800 Subject: [PATCH 18/19] Removing unnecessary file and unnecessary hf_token entry from aml_config of llama2 --- .../llama-v2-7b/finetuning/aml_config.json | 3 +- .../llama-v2-7b/finetuning/invoke_olive.py | 3 +- .../olive-config-azureml_template.json | 120 ------------------ 3 files changed, 2 insertions(+), 124 deletions(-) delete mode 100644 configs/llama-v2-7b/finetuning/olive-config-azureml_template.json diff --git a/configs/llama-v2-7b/finetuning/aml_config.json b/configs/llama-v2-7b/finetuning/aml_config.json index 79bf87f..ed10e90 100644 --- a/configs/llama-v2-7b/finetuning/aml_config.json +++ b/configs/llama-v2-7b/finetuning/aml_config.json @@ -2,7 +2,6 @@ "subscription_id": "", "resource_group": "", "workspace_name": "", - "aml_compute_name": "", - "hf_token": "", + "aml_compute_name": "", "keyvault_name": "" } \ No newline at end of file diff --git a/configs/llama-v2-7b/finetuning/invoke_olive.py b/configs/llama-v2-7b/finetuning/invoke_olive.py index 58c7259..da92458 100644 --- a/configs/llama-v2-7b/finetuning/invoke_olive.py +++ b/configs/llama-v2-7b/finetuning/invoke_olive.py @@ -22,8 +22,7 @@ def parse_aml_config(aml_config): subscription_id = aml_config["subscription_id"] resource_group = aml_config["resource_group"] workspace_name = aml_config["workspace_name"] - aml_compute_name = aml_config["aml_compute_name"] - hf_token = aml_config["hf_token"] + aml_compute_name = aml_config["aml_compute_name"] keyvault_name = aml_config["keyvault_name"] except KeyError as e: print(f"KeyError: {e} not found in aml_config.json") diff --git a/configs/llama-v2-7b/finetuning/olive-config-azureml_template.json b/configs/llama-v2-7b/finetuning/olive-config-azureml_template.json deleted file mode 100644 index 666a8ad..0000000 --- a/configs/llama-v2-7b/finetuning/olive-config-azureml_template.json +++ /dev/null @@ -1,120 +0,0 @@ -{ - "azureml_client": { - "aml_config_path": "", - "read_timeout": 4000, - "max_operation_retries": 4, - "operation_retry_interval": 5, - "keyvault_name" : "" - }, - "systems": { - "local_system": { - "type": "LocalSystem" - }, - "aml_system": { - "type": "AzureML", - "config": { - "aml_compute": "", - "accelerators": [ - "gpu" - ], - "hf_token": true, - "aml_environment_config": { - "name": "", - "version": "" - } - } - } - }, - "input_model": { - "type": "PyTorchModel", - "config": { - "hf_config": { - "model_name": "meta-llama/Llama-2-7b-hf", - "task": "text-generation", - "from_pretrained_args": { - "trust_remote_code": true - } - } - } - }, - "data_configs": { - "dataset_default_train": { - "name": "dataset_default", - "type": "HuggingfaceContainer", - "params_config": { - "data_name": "json", - "data_files": { - "type": "azureml_datastore", - "config": { - "azureml_client": - { - "aml_config_path": "" - }, - "datastore_name": "workspaceblobstore", - "relative_path": "" - } - }, - "split": "train", - "component_kwargs": { - "pre_process_data": { - "dataset_type": "corpus", - "text_cols": [ - "phrase", - "tone" - ], - "text_template": "### Text: {phrase}\n### The tone is:\n{tone}", - "corpus_strategy": "join", - "source_max_len": 1024, - "pad_to_max_len": false, - "use_attention_mask": false - } - } - } - } - }, - "passes": { - "qlora": { - "type": "QLoRA", - "config": { - "compute_dtype": "bfloat16", - "quant_type": "nf4", - "double_quant": true, - "lora_r": 64, - "lora_alpha": 64, - "lora_dropout": 0.1, - "train_data_config": "dataset_default_train", - "eval_dataset_size": 0.3, - "training_args": { - "seed": 0, - "data_seed": 42, - "per_device_train_batch_size": 1, - "per_device_eval_batch_size": 1, - "gradient_accumulation_steps": 4, - "gradient_checkpointing": true, - "learning_rate": 0.0001, - "num_train_epochs": 3, - "max_steps": 1200, - "logging_steps": 10, - "evaluation_strategy": "steps", - "eval_steps": 187, - "group_by_length": true, - "adam_beta2": 0.999, - "max_grad_norm": 0.3, - "output_dir": "outputs/models/checkpoints" - } - } - } - }, - "engine": { - "log_severity_level": 0, - "search_strategy": false, - "evaluate_input_model": false, - "execution_providers": [ - "CUDAExecutionProvider" - ], - "host": "aml_system", - "cache_dir": "model_cache", - "output_dir": "outputs/models/qlora", - "target": "aml_system" - } -} \ No newline at end of file From 463dd89a76a826f066ad8584007adbd1dc82182e Mon Sep 17 00:00:00 2001 From: Amol Ambardekar Date: Wed, 14 Feb 2024 10:06:04 -0800 Subject: [PATCH 19/19] Fixing bug in Zephyr olive-config.json --- configs/zephyr-7b-beta/finetuning/olive-config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/zephyr-7b-beta/finetuning/olive-config.json b/configs/zephyr-7b-beta/finetuning/olive-config.json index c7452ce..305e869 100644 --- a/configs/zephyr-7b-beta/finetuning/olive-config.json +++ b/configs/zephyr-7b-beta/finetuning/olive-config.json @@ -4,7 +4,7 @@ "config": { "hf_config": { "model_name": "model-cache/HuggingFaceH4/zephyr-7b-beta", - "task": "text-generation" + "task": "text-generation", "from_pretrained_args": { "trust_remote_code": true }