Skip to content

Commit

Permalink
add buckets setting for lm_eval (#2044)
Browse files Browse the repository at this point in the history
* add buckets setting for lm_eval

Signed-off-by: xinhe3 <[email protected]>

* clear graph cache to avoid OOM

Signed-off-by: xinhe3 <[email protected]>

---------

Signed-off-by: xinhe3 <[email protected]>
Co-authored-by: xinhe3 <[email protected]>
  • Loading branch information
xin3he and xinhe3 authored Oct 29, 2024
1 parent f176798 commit c186708
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 2 deletions.
1 change: 1 addition & 0 deletions neural_compressor/evaluation/lm_eval/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ def cli_evaluate(args) -> None:
},
)
lm.pad_to_buckets = args.pad_to_buckets
lm.buckets = args.buckets

results = evaluator.simple_evaluate(
model=lm,
Expand Down
18 changes: 16 additions & 2 deletions neural_compressor/evaluation/lm_eval/models/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,14 @@ def __init__(
peft: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
pad_to_buckets: Optional[Union[bool]] = False,
buckets: Optional[list] = [32, 64, 128, 256, 512, 1024, 2048, 4096],
model_format: Optional[str] = "torch",
**kwargs,
) -> None:
super().__init__()
self.pad_to_buckets = pad_to_buckets
self.buckets = buckets
self.last_bucket = -1
self.model_format = model_format
# optionally: take in an already-initialized transformers.PreTrainedModel
if not isinstance(pretrained, str):
Expand Down Expand Up @@ -874,6 +877,18 @@ def tok_decode(self, tokens, skip_special_tokens=True):
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)

def find_bucket(self, length):
    """Return the smallest configured bucket size that can hold an input of ``length`` tokens.

    HPU graphs are compiled per input shape, so inputs are padded up to a fixed set of
    bucket lengths (``self.buckets``) to bound the number of compiled graphs. When the
    selected bucket changes from the previous call, the model's graph cache is cleared
    to avoid accumulating graphs for stale shapes (OOM protection).

    Args:
        length (int): sequence length of the current input batch.

    Returns:
        int: the chosen bucket length (>= ``length``).

    Exits the process with a non-zero status if ``length`` exceeds every configured
    bucket, after logging guidance for the user.
    """
    # First bucket that fits; None if the input is longer than every bucket.
    # self.buckets is assumed sorted ascending (default config is) — TODO confirm.
    bucket = next((b for b in self.buckets if b >= length), None)
    if bucket is None:
        eval_logger.error(f"The input_length={length} exceeds the maximum value in buckets={self.buckets}")
        eval_logger.error("Please add a higher value into the buckets list for this case.")
        # Bug fix: was exit(0), which signaled success to the shell/CI on a fatal
        # configuration error. Exit non-zero so callers can detect the failure.
        exit(1)
    if self.last_bucket != bucket:
        self.model.clear_cache()  # clear graph cache to avoid OOM
        self.last_bucket = bucket
    return self.last_bucket

def _model_call(self, inps, attn_mask=None, labels=None):
"""
:param inps: torch.Tensor
Expand Down Expand Up @@ -943,8 +958,7 @@ def _model_call(self, inps, attn_mask=None, labels=None):
if self.pad_to_buckets: # use buckets to pad inputs
bs, seq_length = inps.shape
padding_length = 0
buckets = [64, 128, 256, 512, 1024, 2048, 4096, 8192]
bucket_length = [b for b in buckets if b >= seq_length][0]
bucket_length = self.find_bucket(seq_length)
padding_length = bucket_length - seq_length
inps = F.pad(inps, (0, padding_length), value=self.model.config.pad_token_id)
output = self.model(inps)
Expand Down
2 changes: 2 additions & 0 deletions neural_compressor/evaluation/lm_eval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(
seed=[0, 1234, 1234],
trust_remote_code=False,
pad_to_buckets=None, # used by HPU to align input length for performance.
buckets=[32, 64, 128, 256, 512, 1024, 2048, 4096], # used by HPU to limit input length range.
):
self.model = model
self.tasks = tasks
Expand Down Expand Up @@ -81,3 +82,4 @@ def __init__(
self.pad_to_buckets = False
else:
self.pad_to_buckets = pad_to_buckets
self.buckets = buckets

0 comments on commit c186708

Please sign in to comment.