Merge branch 'main' into synapse_1_20
regisss committed Feb 5, 2025
2 parents 6a520ff + 1b0461c commit 58de6b6
Showing 25 changed files with 1,362 additions and 18 deletions.
5 changes: 5 additions & 0 deletions Makefile
@@ -121,6 +121,11 @@ slow_tests_openclip_vqa_example: test_installs
	python -m pip install -r examples/visual-question-answering/openclip_requirements.txt
	python -m pytest tests/test_openclip_vqa.py

# Run video comprehension tests
slow_tests_video_llava_example: test_installs
	python -m pip install -r examples/video-comprehension/requirements.txt
	python -m pytest tests/test_video_llava.py

slow_tests_fsdp: test_installs
	python -m pytest tests/test_fsdp_examples.py -v -s --token $(TOKEN)

2 changes: 2 additions & 0 deletions README.md
@@ -282,6 +282,8 @@ The following model architectures, tasks and device distributions have been validated
| DeepSeek-V2 | :heavy_check_mark: | :heavy_check_mark: | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| ChatGLM | <li>DeepSpeed</li> | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Qwen2-VL | | <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
| VideoLLaVA | | <div style="text-align:left"><li>Single card</li></div> | <li>[Video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)</li> |

</div>


1 change: 1 addition & 0 deletions docs/source/index.mdx
@@ -105,6 +105,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all been validated
| TableTransformer | | <div style="text-align:left"><li>Single card</li></div> | <li>[table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)</li> |
| DETR | | <div style="text-align:left"><li>Single card</li></div> | <li>[object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)</li> |
| Mllama | <div style="text-align:left"><li>LoRA</li></div> || <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
| Video-LLaVA | | <div style="text-align:left"><li>Single card</li></div> | <li>[video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)</li> |
| MiniCPM3 | | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Baichuan2 | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| DeepSeek-V2 | ✅ | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
@@ -3,10 +3,7 @@
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "maxabs_hw",
    "allowlist": {"types": [], "names": ["gate","w1","w3","w2"]},
    "blocklist": {"types": [], "names": [
            "model.layers.1.block_sparse_moe.experts.(3|4).w2",
            "model.layers.[29-31].block_sparse_moe.experts.[0-7].w2"
        ]},
    "allowlist": {"types": [], "names": []},
    "blocklist": {"types": [], "names": ["self_attn"]},
    "dump_stats_path": "./hqt_output/measure"
}
33 changes: 33 additions & 0 deletions examples/video-comprehension/README.md
@@ -0,0 +1,33 @@
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Examples

This directory contains example scripts that demonstrate how to perform video comprehension on Gaudi using HPU graph mode.

## Single-HPU inference

### Video-LLaVA Model

```bash
python3 run_example.py \
    --model_name_or_path "LanguageBind/Video-LLaVA-7B-hf" \
    --warmup 3 \
    --n_iterations 5 \
    --batch_size 1 \
    --use_hpu_graphs \
    --bf16 \
    --output_dir ./
```
Models that have been validated:
- [LanguageBind/Video-LLaVA-7B-hf](https://huggingface.co/LanguageBind/Video-LLaVA-7B-hf)
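
For reference, here is a condensed, hedged sketch of the inference flow that `run_example.py` implements; it assumes an HPU machine with `optimum-habana` and this example's `requirements.txt` installed, and `sample_video.mp4` is a hypothetical local clip path:

```python
import av
import numpy as np
import torch
from transformers import VideoLlavaProcessor

from optimum.habana.transformers.modeling_utils import (
    GaudiVideoLlavaForConditionalGeneration,
    adapt_transformers_to_gaudi,
)

adapt_transformers_to_gaudi()  # patch transformers with the Gaudi-optimized implementations

model_id = "LanguageBind/Video-LLaVA-7B-hf"
model = GaudiVideoLlavaForConditionalGeneration.from_pretrained(model_id).to(torch.bfloat16).to("hpu")
processor = VideoLlavaProcessor.from_pretrained(model_id)

# Build a (num_frames, H, W, 3) RGB array from ~8 evenly spaced frames of the clip
container = av.open("sample_video.mp4")  # hypothetical local path
total = container.streams.video[0].frames
indices = np.arange(0, total, total / 8).astype(int)
clip = np.stack(
    [f.to_ndarray(format="rgb24") for i, f in enumerate(container.decode(video=0)) if i in indices]
)

prompt = "USER: <video>Why is this video funny? ASSISTANT:"
inputs = processor(text=[prompt], videos=[clip], return_tensors="pt").to("hpu")
output_ids = model.generate(**inputs, max_new_tokens=100, lazy_mode=True)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```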
2 changes: 2 additions & 0 deletions examples/video-comprehension/requirements.txt
@@ -0,0 +1,2 @@
av == 12.1.0
sentencepiece == 0.2.0
235 changes: 235 additions & 0 deletions examples/video-comprehension/run_example.py
@@ -0,0 +1,235 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import logging
import os
import time
from pathlib import Path

import av
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import VideoLlavaProcessor

from optimum.habana.transformers.modeling_utils import (
    GaudiVideoLlavaForConditionalGeneration,
    adapt_transformers_to_gaudi,
)


logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def read_video_pyav(container, indices):
    # Decode only the requested frame indices and stack them into a (num_frames, height, width, 3) RGB array
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--video_path",
        default=None,
        type=str,
        nargs="*",
        help='Path to video as input. Can be a single string (eg: --video_path "URL1"), or a list of space-separated strings (eg: --video_path "URL1" "URL2")',
    )
    parser.add_argument(
        "--prompt",
        default=None,
        type=str,
        help='Optional argument to give a prompt of your choice as input. Should be a single string (eg: --prompt "Hello world")',
    )
    parser.add_argument(
        "--use_hpu_graphs",
        action="store_true",
        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
    )
    parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.")
    parser.add_argument(
        "--bf16",
        action="store_true",
        help="Whether to perform generation in bf16 precision.",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help="Output directory to store results in.",
    )
    parser.add_argument(
        "--token",
        default=None,
        type=str,
        help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
        "generated when running `huggingface-cli login` (stored in `~/.huggingface`).",
    )
    parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.")
    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
    parser.add_argument(
        "--ignore_eos",
        action="store_true",
        help="Whether to disable stopping with eos token when calling `generate`.",
    )
    parser.add_argument(
        "--use_flash_attention",
        action="store_true",
        help="Whether to enable Habana Flash Attention, provided that the model supports it.",
    )
    parser.add_argument(
        "--flash_attention_recompute",
        action="store_true",
        help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.",
    )

    args = parser.parse_args()

    os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")

    if args.video_path is None:
        args.video_path = [
            hf_hub_download(
                repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
            )
        ]

    if args.prompt is None:
        args.prompt = ["USER: <video>Why is this video funny? ASSISTANT:"]
    video_paths = args.video_path
    video_paths_len = len(video_paths)

    prompts = args.prompt
    if args.batch_size > video_paths_len:
        # Dynamically extends to support larger batch sizes
        num_path_to_add = args.batch_size - video_paths_len
        for i in range(num_path_to_add):
            video_paths.append(video_paths[i % video_paths_len])
            prompts.append(prompts[i % video_paths_len])
    elif args.batch_size < video_paths_len:
        video_paths = video_paths[: args.batch_size]

    video_clips = []

    for video_path in video_paths:
        container = av.open(video_path)
        num_frames = container.streams.video[0].frames
        indices = np.arange(0, num_frames, num_frames / 8).astype(int)
        clip = read_video_pyav(container, indices)
        video_clips.append(clip)

    if args.bf16:
        model_dtype = torch.bfloat16
    else:
        model_dtype = torch.float32

    adapt_transformers_to_gaudi()
    model = GaudiVideoLlavaForConditionalGeneration.from_pretrained(args.model_name_or_path)
    model = model.to(model_dtype)
    device = torch.device("hpu")
    model = model.to(device)
    if args.use_hpu_graphs:
        from habana_frameworks.torch.hpu import wrap_in_hpu_graph

        model = wrap_in_hpu_graph(model)

    processor = VideoLlavaProcessor.from_pretrained(args.model_name_or_path)
    processor.tokenizer.padding_side = "left"
    inputs = processor(text=prompts, videos=video_clips, return_tensors="pt")
    inputs = inputs.to(device)

    # warm up
    for i in range(args.warmup):
        generate_ids = model.generate(
            **inputs,
            lazy_mode=True,
            hpu_graphs=args.use_hpu_graphs,
            max_new_tokens=args.max_new_tokens,
            ignore_eos=args.ignore_eos,
            use_flash_attention=args.use_flash_attention,
            flash_attention_recompute=args.flash_attention_recompute,
        )
    torch.hpu.synchronize()

    start = time.perf_counter()
    for i in range(args.n_iterations):
        generate_ids = model.generate(
            **inputs,
            lazy_mode=True,
            hpu_graphs=args.use_hpu_graphs,
            max_new_tokens=args.max_new_tokens,
            ignore_eos=args.ignore_eos,
            use_flash_attention=args.use_flash_attention,
            flash_attention_recompute=args.flash_attention_recompute,
        )
        generate_texts = processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
    end = time.perf_counter()
    duration = end - start

    # Let's calculate the number of generated tokens
    n_input_tokens = inputs["input_ids"].shape[1]
    n_output_tokens = 0
    for i in range(generate_ids.shape[0]):
        n_input_tokens = torch.sum(inputs["attention_mask"][i, :]).item()
        # We have to subtract the number of input tokens as they are part of the returned sequence
        n_output_tokens += len(generate_ids[i]) - n_input_tokens

    total_new_tokens_generated = args.n_iterations * n_output_tokens
    throughput = total_new_tokens_generated / duration
    logger.info(f"result = {generate_texts}")
    logger.info(
        f"time = {(end - start) * 1000 / args.n_iterations}ms, Throughput (including tokenization) = {throughput} tokens/second"
    )

    # Store results if necessary
    if args.output_dir is not None:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        results = {
            "throughput": throughput,
            "output": generate_texts,
        }
        with (output_dir / "results.json").open("w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
52 changes: 52 additions & 0 deletions optimum/habana/accelerate/utils/modeling.py
@@ -0,0 +1,52 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Device similarity check compatible with hpu
"""

import torch


def gaudi_check_device_same(first_device, second_device):
    """
    Copied from https://github.com/huggingface/accelerate/blob/6b2d968897c91bc3f96274b4679d84e9950ad908/src/accelerate/utils/modeling.py#L50
    difference is addition of HPU device checks

    Args:
        first_device (`torch.device`):
            First device to check
        second_device (`torch.device`):
            Second device to check
    """
    if first_device.type != second_device.type:
        return False

    if first_device.type == "cuda" and first_device.index is None:
        # In case the first_device is a cuda device and have
        # the index attribute set to `None`, default it to `0`
        first_device = torch.device("cuda", index=0)

    elif first_device.type == "hpu" and first_device.index is None:
        first_device = torch.device("hpu", index=0)

    if second_device.type == "cuda" and second_device.index is None:
        # In case the second_device is a cuda device and have
        # the index attribute set to `None`, default it to `0`
        second_device = torch.device("cuda", index=0)

    elif second_device.type == "hpu" and second_device.index is None:
        second_device = torch.device("hpu", index=0)

    return first_device == second_device
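
For context, a hedged usage sketch of the new helper (the only assumption is that `optimum-habana` is installed; the module path follows the file location above):

```python
import torch

from optimum.habana.accelerate.utils.modeling import gaudi_check_device_same

# An index-less "hpu" device is normalized to "hpu:0", mirroring the CUDA handling above
print(gaudi_check_device_same(torch.device("hpu"), torch.device("hpu", 0)))  # True
print(gaudi_check_device_same(torch.device("hpu"), torch.device("cpu")))     # False: device types differ
```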
@@ -657,8 +657,6 @@ def __call__(
        t1 = t0

        self._num_timesteps = len(timesteps)
        if hasattr(self.scheduler, "set_begin_index"):
            self.scheduler.set_begin_index()

        hb_profiler = HabanaProfile(
            warmup=profiling_warmup_steps,
@@ -540,8 +540,6 @@ def denoising_value_valid(dnv):
        ).to(device=device, dtype=latents.dtype)

        self._num_timesteps = len(timesteps)
        if hasattr(self.scheduler, "set_begin_index"):
            self.scheduler.set_begin_index()

        # 8.3 Denoising loop
        throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3)
@@ -748,8 +748,6 @@ def denoising_value_valid(dnv):
        ).to(device=device, dtype=latents.dtype)

        self._num_timesteps = len(timesteps)
        if hasattr(self.scheduler, "set_begin_index"):
            self.scheduler.set_begin_index()

        outputs = {
            "images": [],
