Merge branch 'main' into zhzhang/mamba_for_main
skaulintel committed Feb 6, 2025
2 parents 1f481ab + 27d1495 commit f66cda0
Showing 131 changed files with 7,361 additions and 3,797 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_pr_documentation.yml
@@ -51,7 +51,7 @@ jobs:
echo ${{ env.COMMIT_SHA }} > ./commit_sha
echo ${{ env.PR_NUMBER }} > ./pr_number
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@v4
with:
name: doc-build-artifact
path: optimum-habana/habana-doc-build/
2 changes: 1 addition & 1 deletion .github/workflows/slow_tests_gaudi2.yml
@@ -60,7 +60,7 @@ jobs:
--net=host \
--ipc=host \
vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
- /bin/bash tests/ci/slow_tests_diffusers.sh
+ /bin/bash tests/ci/slow_tests_diffusers.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
deepspeed:
name: Test DeepSpeed models
if: ${{ !cancelled() && (success() || failure()) }}
8 changes: 7 additions & 1 deletion Makefile
@@ -98,7 +98,7 @@ slow_tests_deepspeed: test_installs

slow_tests_diffusers: test_installs
python -m pip install -r examples/stable-diffusion/requirements.txt
python -m pytest tests/test_diffusers.py -v -s -k "test_textual_inversion"
python -m pytest tests/test_diffusers.py -v -s -k "textual_inversion"
python -m pip install peft==0.7.0
python -m pytest tests/test_diffusers.py -v -s -k "test_train_text_to_image_"
python -m pytest tests/test_diffusers.py -v -s -k "test_train_controlnet"
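Note that pytest's `-k` option matches substrings of test IDs, so dropping the `test_` prefix broadens the selection to tests whose names carry extra prefixes. A quick sketch (the broader test name shown is hypothetical, not from the repository):

```bash
# pytest -k does substring matching on test IDs, so the looser expression
# also picks up names like test_sdxl_textual_inversion (a hypothetical
# example) that the stricter "test_textual_inversion" would miss.
python -m pytest tests/test_diffusers.py -v -s -k "textual_inversion"
```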
@@ -107,6 +107,7 @@ slow_tests_diffusers: test_installs

# Run text-generation non-regression tests
slow_tests_text_generation_example: test_installs
+ python -m pip install triton==3.1.0 autoawq
BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git
python -m pip install git+https://github.com/HabanaAI/[email protected]
python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN)
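These targets forward `$(TOKEN)` to pytest's `--token` option, so the token can be supplied as a make variable. A minimal usage sketch (`<hf_token>` is a placeholder, not a real credential):

```bash
# Sketch: supply a Hugging Face Hub token when invoking the target,
# e.g. for gated models; <hf_token> is a placeholder.
make slow_tests_text_generation_example TOKEN=<hf_token>
```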
@@ -120,6 +121,11 @@ slow_tests_openclip_vqa_example: test_installs
python -m pip install -r examples/visual-question-answering/openclip_requirements.txt
python -m pytest tests/test_openclip_vqa.py

+ # Run video comprehension tests
+ slow_tests_video_llava_example: test_installs
+ python -m pip install -r examples/video-comprehension/requirements.txt
+ python -m pytest tests/test_video_llava.py

slow_tests_fsdp: test_installs
python -m pytest tests/test_fsdp_examples.py -v -s --token $(TOKEN)
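The new `slow_tests_video_llava_example` target added above follows the same pattern as the other example-test targets; a local invocation sketch, assuming a Gaudi machine with the repository checked out:

```bash
# Sketch: exercise the new video comprehension tests directly.
make slow_tests_video_llava_example
# Or run the underlying steps the target defines:
python -m pip install -r examples/video-comprehension/requirements.txt
python -m pytest tests/test_video_llava.py
```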

241 changes: 131 additions & 110 deletions README.md

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions docs/source/index.mdx
@@ -105,11 +105,13 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
| TableTransformer | | <div style="text-align:left"><li>Single card</li></div> | <li>[table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)</li> |
| DETR | | <div style="text-align:left"><li>Single card</li></div> | <li>[object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)</li> |
| Mllama | <div style="text-align:left"><li>LoRA</li></div> | ✅ | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
+ | Video-LLaVA | | <div style="text-align:left"><li>Single card</li></div> | <li>[video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)</li> |
| MiniCPM3 | | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Baichuan2 | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
- | DeepSeek-V2 | | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| DeepSeek-V2 | | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| ChatGLM | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
+ | Mamba | | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
+ | Qwen2-VL | | <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |

- Diffusers

@@ -120,7 +122,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
| Stable Diffusion Depth2img | | <li>Single card</li> | <li>[depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| LDM3D | | <div style="text-align:left"><li>Single card</li></div> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| FLUX.1 | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#dreambooth-lora-fine-tuning-with-flux1-dev)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
- | Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li> |
+ | Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#text-to-video-generation)</li> |

- PyTorch Image Models/TIMM:

46 changes: 0 additions & 46 deletions examples/audio-classification/README.md
@@ -107,52 +107,6 @@ On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.4
> If you get an error reporting unused parameters in the model, you can specify `--ddp_find_unused_parameters True`. Using this parameter might affect the training speed.

## DeepSpeed

> You need to install DeepSpeed with:
> ```bash
> pip install git+https://github.com/HabanaAI/[email protected]
> ```
DeepSpeed can be used with almost the same command as for a multi-card run:
- `use_mpi` should be replaced by `use_deepspeed`,
- an additional `--deepspeed path_to_my_deepspeed_config` argument should be provided, for instance `--deepspeed ../../tests/configs/deepspeed_zero_2.json`.
For example:
```bash
PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
--world_size 8 --use_deepspeed run_audio_classification.py \
--model_name_or_path facebook/wav2vec2-base \
--dataset_name common_language \
--audio_column_name audio \
--label_column_name language \
--output_dir /tmp/wav2vec2-base-lang-id \
--overwrite_output_dir \
--remove_unused_columns False \
--do_train \
--do_eval \
--learning_rate 3e-4 \
--max_length_seconds 8 \
--attention_mask False \
--warmup_ratio 0.1 \
--num_train_epochs 10 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 32 \
--seed 0 \
--use_habana \
--use_lazy_mode False \
--gaudi_config_name Habana/wav2vec2 \
--throughput_warmup_steps 3 \
--deepspeed ../../tests/configs/deepspeed_zero_2.json \
--trust_remote_code True
```
[The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana.

> If your model classification head dimensions do not match the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.

## Inference

To run only inference, start from the commands above and remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc.
1 change: 1 addition & 0 deletions examples/audio-classification/requirements.txt
@@ -1,3 +1,4 @@
datasets>=1.14.0
evaluate
+ numba==0.60.0
librosa
26 changes: 26 additions & 0 deletions examples/audio-classification/run_audio_classification.py
@@ -177,6 +177,31 @@ class ModelArguments:
default=False,
metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
)
+ use_flash_attention: bool = field(
+     default=False, metadata={"help": "Whether to use Habana flash attention for fine-tuning"}
+ )
+ flash_attention_recompute: bool = field(
+     default=False,
+     metadata={
+         "help": "Whether to enable recompute in Habana flash attention for fine-tuning."
+         " It is applicable only when use_flash_attention is True."
+     },
+ )
+ flash_attention_fast_softmax: bool = field(
+     default=False,
+     metadata={
+         "help": "Whether to use fast softmax for Habana flash attention."
+         " It is applicable only when use_flash_attention is True."
+     },
+ )
+
+ def __post_init__(self):
+     if self.flash_attention_recompute:
+         assert self.use_flash_attention, "flash_attention_recompute is set, but use_flash_attention is not"
+         os.environ["FLASH_ATTENTION_RECOMPUTE"] = "1"
+     if self.flash_attention_fast_softmax:
+         assert self.use_flash_attention, "flash_attention_fast_softmax is set, but use_flash_attention is not"
+         os.environ["FLASH_ATTENTION_FAST_SOFTMAX"] = "1"


def main():
@@ -364,6 +389,7 @@ def compute_metrics(eval_pred):
revision=model_args.model_revision,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
+ attn_implementation="sdpa" if model_args.use_flash_attention else "eager",
)
model = AutoModelForAudioClassification.from_pretrained(
model_args.model_name_or_path,
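Taken together, the new arguments gate the `sdpa` attention implementation and export the two environment variables that the Habana flash attention path presumably reads. A hedged invocation sketch (model and dataset are illustrative, borrowed from this example's README; only the last three flags come from this diff):

```bash
# Sketch: fine-tuning with the new flash attention options.
# facebook/wav2vec2-base and common_language mirror the README example;
# the three flash-attention flags are the ones introduced above.
python run_audio_classification.py \
    --model_name_or_path facebook/wav2vec2-base \
    --dataset_name common_language \
    --output_dir /tmp/wav2vec2-base-lang-id \
    --do_train \
    --do_eval \
    --use_habana \
    --gaudi_config_name Habana/wav2vec2 \
    --use_flash_attention True \
    --flash_attention_recompute True \
    --flash_attention_fast_softmax True
```

Per the `__post_init__` hook, both auxiliary flags assert that `use_flash_attention` is enabled before exporting `FLASH_ATTENTION_RECOMPUTE` and `FLASH_ATTENTION_FAST_SOFTMAX`.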
57 changes: 2 additions & 55 deletions examples/contrastive-image-text/README.md
@@ -163,61 +163,8 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_clip.py \

### DeepSpeed

- Run the following command for training with DeepSpeed:
-
- ```bash
- PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 \
- python3 ../gaudi_spawn.py --world_size 8 --use_deepspeed run_clip.py \
- --output_dir=/tmp/clip_roberta \
- --model_name_or_path=./clip-roberta \
- --data_dir $PWD/data \
- --dataset_name ydshieh/coco_dataset_script \
- --dataset_config_name 2017 \
- --image_column image_path \
- --caption_column caption \
- --remove_unused_columns=False \
- --do_train --do_eval \
- --mediapipe_dataloader \
- --per_device_train_batch_size="64" \
- --per_device_eval_batch_size="64" \
- --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \
- --overwrite_output_dir \
- --use_habana \
- --use_lazy_mode=False \
- --gaudi_config_name="Habana/clip" \
- --throughput_warmup_steps=30 \
- --save_strategy="no" \
- --dataloader_num_workers=2 \
- --use_hpu_graphs \
- --max_steps=100 \
- --torch_compile_backend=hpu_backend \
- --torch_compile \
- --logging_nan_inf_filter \
- --trust_remote_code \
- --deepspeed <path_to_my_deepspeed_config>
-
- ```
-
- You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.
- Here is a DeepSpeed configuration you can use to train your models on Gaudi:
- ```json
- {
-     "steps_per_print": 64,
-     "train_batch_size": "auto",
-     "train_micro_batch_size_per_gpu": "auto",
-     "gradient_accumulation_steps": "auto",
-     "bf16": {
-         "enabled": true
-     },
-     "gradient_clipping": 1.0,
-     "zero_optimization": {
-         "stage": 2,
-         "overlap_comm": false,
-         "reduce_scatter": false,
-         "contiguous_gradients": false
-     }
- }
- ```
+ You can check the [DeepSpeed](https://github.com/huggingface/optimum-habana/tree/main/examples#deepspeed) section in Optimum Habana examples for how to run DeepSpeed.
+ You can also look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.


## BridgeTower