Commit
Add clear hpu cache flag for stable perf (#59) (#1634)
jaygala223 authored Jan 31, 2025
1 parent 4a3a9a8 commit e142942
Showing 4 changed files with 19 additions and 1 deletion.
5 changes: 5 additions & 0 deletions examples/text-generation/run_generation.py
@@ -226,6 +226,11 @@ def setup_parser(parser):
action="store_true",
help="Skip HPU Graph usage for first token to save memory",
)
parser.add_argument(
"--clear_hpu_graphs_cache",
action="store_true",
help="Clear HPU graphs cache",
)
parser.add_argument(
"--show_graphs_count",
action="store_true",
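For context, the new option is an ordinary argparse store_true switch, so it stays off unless explicitly passed when launching examples/text-generation/run_generation.py. A minimal stand-alone sketch of that behavior, outside the real parser:

    import argparse

    # Minimal sketch, not the full run_generation.py parser: a store_true flag
    # defaults to False and flips to True only when supplied.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--clear_hpu_graphs_cache",
        action="store_true",
        help="Clear HPU graphs cache",
    )

    print(parser.parse_args([]).clear_hpu_graphs_cache)                            # False
    print(parser.parse_args(["--clear_hpu_graphs_cache"]).clear_hpu_graphs_cache)  # True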
1 change: 1 addition & 0 deletions examples/text-generation/utils.py
@@ -661,6 +661,7 @@ def setup_generation_config(args, model, assistant_model, tokenizer):
    generation_config.trim_logits = args.trim_logits
    generation_config.attn_softmax_bf16 = args.attn_softmax_bf16
    generation_config.limit_hpu_graphs = args.limit_hpu_graphs
    generation_config.clear_hpu_graphs_cache = args.clear_hpu_graphs_cache
    generation_config.reuse_cache = args.reuse_cache
    generation_config.reduce_recompile = args.reduce_recompile
    if generation_config.reduce_recompile:
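setup_generation_config then copies the parsed flag onto the generation config alongside the existing HPU-graph options. A minimal sketch of that hand-off with stand-in objects (SimpleNamespace is only illustrative here, not the real args/config types):

    from types import SimpleNamespace

    # Stand-ins for the parsed CLI args and the generation config object.
    args = SimpleNamespace(limit_hpu_graphs=False, clear_hpu_graphs_cache=True, reuse_cache=False)
    generation_config = SimpleNamespace()

    # Same one-attribute-per-flag copy as in setup_generation_config.
    generation_config.limit_hpu_graphs = args.limit_hpu_graphs
    generation_config.clear_hpu_graphs_cache = args.clear_hpu_graphs_cache
    generation_config.reuse_cache = args.reuse_cache

    print(generation_config.clear_hpu_graphs_cache)  # True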
3 changes: 3 additions & 0 deletions optimum/habana/transformers/generation/configuration_utils.py
@@ -21,6 +21,8 @@ class GaudiGenerationConfig(GenerationConfig):
            is also running in lower precision.
        limit_hpu_graphs (`bool`, *optional*):
            Skip HPU Graph usage for first token to save memory
        clear_hpu_graphs_cache (`bool`, *optional*):
            Clear HPU Graph cache
        reuse_cache (`bool`, *optional*):
            Whether to reuse key/value cache for decoding. It should save memory.
        bucket_size (`int`, *optional*):
@@ -46,6 +48,7 @@ def __init__(self, **kwargs):
        self.ignore_eos = kwargs.get("ignore_eos", None)
        self.attn_softmax_bf16 = kwargs.get("attn_softmax_bf16", None)
        self.limit_hpu_graphs = kwargs.get("limit_hpu_graphs", None)
        self.clear_hpu_graphs_cache = kwargs.get("clear_hpu_graphs_cache", None)
        self.reuse_cache = kwargs.get("reuse_cache", None)
        self.bucket_size = kwargs.get("bucket_size", -1)
        self.bucket_internal = kwargs.get("bucket_internal", None)
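Because __init__ reads the option with kwargs.get, it can also be set directly when constructing the config and defaults to None when omitted. A small usage sketch, assuming optimum-habana is installed (import path taken from the file above):

    from optimum.habana.transformers.generation.configuration_utils import GaudiGenerationConfig

    # The new kwarg is picked up via kwargs.get("clear_hpu_graphs_cache", None).
    config = GaudiGenerationConfig(clear_hpu_graphs_cache=True)
    print(config.clear_hpu_graphs_cache)  # True

    default_config = GaudiGenerationConfig()
    print(default_config.clear_hpu_graphs_cache)  # None when not set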
11 changes: 10 additions & 1 deletion optimum/habana/transformers/generation/utils.py
@@ -1263,6 +1263,9 @@ def generate(
model_kwargs["use_hpu_graphs"] = hpu_graphs
model_kwargs["limit_hpu_graphs"] = generation_config.limit_hpu_graphs

# determine whether to clear hpu graphs cache
model_kwargs["clear_hpu_graphs_cache"] = generation_config.clear_hpu_graphs_cache

# prepare for allocate kv cache
model_kwargs["reuse_cache"] = generation_config.reuse_cache

@@ -2616,8 +2619,14 @@ def _sample(
            and not model_kwargs.get("reuse_cache", False)
            and bucket_internal
        ):
            # Clear HPU graphs cache
            if model_kwargs.get("clear_hpu_graphs_cache", False):
                self.clear_cache()

            # Clear HPU graphs input tensors of the decode phase after the full generation while loop
            self.clear_inputs()
        else:
            self.clear_inputs()

        # Delete past key value tensors
        self._remove_past_key_values(model_kwargs)

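Taken together, generate() forwards generation_config.clear_hpu_graphs_cache into model_kwargs, and _sample() consults it during end-of-generation cleanup. A stand-alone sketch of that gating with the HPU-graph helpers stubbed out (FakeGraphRunner and end_of_generation_cleanup are illustrative names, and the real condition in _sample includes further checks not shown in this hunk):

    class FakeGraphRunner:
        # Stub standing in for the HPU-graph-wrapped model; only the cleanup hooks matter here.
        def clear_cache(self):
            print("cleared cached HPU graphs")

        def clear_inputs(self):
            print("cleared decode-phase input tensors")

    def end_of_generation_cleanup(runner, model_kwargs, bucket_internal):
        # Mirrors the new gating: with internal bucketing and no reused KV cache,
        # optionally drop the cached graphs before clearing the decode inputs.
        if not model_kwargs.get("reuse_cache", False) and bucket_internal:
            if model_kwargs.get("clear_hpu_graphs_cache", False):
                runner.clear_cache()
            runner.clear_inputs()
        else:
            runner.clear_inputs()

    end_of_generation_cleanup(
        FakeGraphRunner(),
        {"reuse_cache": False, "clear_hpu_graphs_cache": True},
        bucket_internal=True,
    )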
