Commit

Tweak memory extract to use a batch increment instead of an absolute batch size
pierre.delaunay committed Feb 6, 2025
1 parent 6cdc40e commit 99c01a8
Showing 5 changed files with 83 additions and 49 deletions.
4 changes: 2 additions & 2 deletions config/base.yaml
@@ -216,7 +216,7 @@ resnet50-noio:
   inherits: _torchvision
   voir:
     options:
-      stop: 500
+      stop: 60
       interval: "1s"

   tags:
@@ -229,7 +229,7 @@ resnet50-noio:

   argv:
     --model: resnet50
-    --batch-size: batch_resize(256)
+    --batch-size: auto_batch(256)
     --loader: synthetic_fixed
     --optim: channel_last

6 changes: 3 additions & 3 deletions config/examples/system.yaml
@@ -41,14 +41,14 @@ system:
 multirun:
   runs:
     # Force batch size to populate the sizing model
-    - name: "bs{sizer.batch_size}"
+    - name: "bs{sizer.batch_size}.{time}"
       matrix:
         sizer.auto: 1
         sizer.batch_size: [1, 2, 4, 8, 16, 32, 64, 128]
         sizer.save: ["scaling.yaml"]

     # Matrix run
-    - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}"
+    - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}.{time}"
       matrix:
         cpu.auto: 1
         cpu.n_workers: [2, 4, 8, 16, 32]
@@ -58,7 +58,7 @@ multirun:
         sizer.save: ["scaling.yaml"]

     # Auto run
-    - name: "auto"
+    - name: "auto.{time}"
       matrix:
         cpu.auto: 1
         sizer.auto: 1
13 changes: 2 additions & 11 deletions config/scaling/L40S.yaml
@@ -1,13 +1,4 @@
-resnet152-ddp-gpus:
-  observations:
-  - {batch_size: 0, cpu: 8, memory: 25995 MiB, perf: 1995.03}
 resnet50:
   observations:
-  - {batch_size: 1, cpu: 8, memory: 46041 MiB, perf: 78.92}
-  - {batch_size: 2, cpu: 8, memory: 1237 MiB, perf: 159.82}
-  - {batch_size: 2, cpu: 8, memory: 34611 MiB, perf: 161.62}
-  - {batch_size: 4, cpu: 8, memory: 1331 MiB, perf: 315.69}
-  - {batch_size: 4, cpu: 8, memory: 46041 MiB, perf: 308.07}
-  - {batch_size: 8, cpu: 8, memory: 1489 MiB, perf: 603.96}
-  - {batch_size: 8, cpu: 8, memory: 46041 MiB, perf: 602.72}
-version: 2.0
+  - {batch_size: 248, cpu: 8, memory: 12798 MiB, perf: 654.31}
+version: 2
106 changes: 73 additions & 33 deletions milabench/sizer.py
@@ -9,6 +9,7 @@
 from voir.instruments.gpu import get_gpu_info
 from cantilever.core.statstream import StatStream

+from .syslog import syslog
 from .system import CPUOptions, SizerOptions, system_global, option
 from .validation.validation import ValidationLayer

@@ -270,10 +271,12 @@ class MemoryUsageExtractor(ValidationLayer):
"""Extract max memory usage per benchmark to populate the memory model"""

def __init__(self):
syslog("new extrator")

self.filepath = option("sizer.save", str, None)
sizer = Sizer()

if os.path.exists(self.filepath):
if self.filepath and os.path.exists(self.filepath):
with open(self.filepath, "r") as fp:
self.memory = yaml.safe_load(fp) or {}
else:
@@ -286,7 +289,9 @@ def __init__(self):
         self.scaling = None
         self.stats = defaultdict(lambda: StatStream(drop_first_obs=0))
         self.benchname = None
-        self.batch_size = 0
+        self.batch_size = None
+        self.active_count = defaultdict(int)
+        self.rc = defaultdict(int)
         self.max_usage = float("-inf")  # Usage from the gpu monitor
         self.peak_usage = float("-inf")  # Usage provided by the bench itself (for jax)
         self.early_stopped = False
@@ -302,6 +307,7 @@ def on_cpu_count_set(self, pack, _, value):
self.stats["cpu"] += value

def on_batch_size_set(self, pack, _, value):
self.batch_size = value
self.stats["batch_size"] += value

def convert(self):
@@ -322,11 +328,13 @@ def convert(self):
     def on_start(self, entry):
         if self.filepath is None:
             return

         self.benchname = entry.pack.config["name"]
         self.max_usage = float("-inf")
         self.peak_usage = float("-inf")

+        self.active_count[self.benchname] += 1
+
     def on_data(self, entry):
         if self.filepath is None:
             return
@@ -359,46 +367,75 @@ def on_end(self, entry):
         if self.filepath is None:
             return

-        if (
-            self.benchname is None
-            or self.batch_size is None
-            or self.max_memory_usage() == float("-inf")
-        ):
+        if self.benchname is None:
+            syslog("Skipping missing benchmark {}", entry)
             return

+        if self.batch_size is None:
+            syslog("Skipping missing batch_size {}", entry)
+            return
+
+        if self.max_memory_usage() == float("-inf"):
+            syslog("Missing memory info {}", entry)
+            return
+
         # Only update if successful
         rc = entry.data["return_code"]

-        if rc == 0 or self.early_stopped:
-            config = self.memory.setdefault(self.benchname, dict())
-            observations = config.setdefault("observations", [])
-
-            obs = {
-                "cpu": int(self.stats["cpu"].avg),
-                "batch_size": int(self.stats["batch_size"].avg),
-                "memory": f"{int(self.stats['memory'].max)} MiB",
-                "perf": float(f"{self.stats['perf'].avg:.2f}"),
-            }
-
-            if memorypeak := self.stats.pop("memorypeak", None):
-                if memorypeak.current_count != 0:
-                    obs["memory"] = f"{int(memorypeak.max)} MiB",
-
-            observations.append(obs)
-
-            config["observations"] = list(sorted(observations, key=lambda x: x["batch_size"]))
-
-        self.benchname = None
-        self.batch_size = None
-        self.stats = defaultdict(lambda: StatStream(drop_first_obs=0))
-        self.max_usage = float("-inf")
-        self.peak_usage = float("-inf")
+        if self.early_stopped:
+            rc = 0
+
+        self.rc[self.benchname] += rc
+        self.active_count[self.benchname] -= 1
+
+        if self.active_count[self.benchname] <= 0:
+
+            if self.rc[self.benchname] == 0:
+                syslog("adding new obs")
+                self.push_observation()
+            else:
+                syslog("Could not add scaling data because of a failure")
+
+            self.benchname = None
+            self.batch_size = None
+            self.stats = defaultdict(lambda: StatStream(drop_first_obs=0))
+            self.max_usage = float("-inf")
+            self.peak_usage = float("-inf")
+        # avoid losing results
+        try:
+            self.save()
+        except Exception as err:
+            print(f"Could not save scaling file because of {err}")
+
+    def push_observation(self):
+        config = self.memory.setdefault(self.benchname, dict())
+        observations = config.setdefault("observations", [])
+
+        obs = {
+            "cpu": int(self.stats["cpu"].avg),
+            "batch_size": int(self.stats["batch_size"].avg),
+            "memory": f"{int(self.stats['memory'].max)} MiB",
+            "perf": float(f"{self.stats['perf'].avg:.2f}"),
+        }
+
+        if memorypeak := self.stats.pop("memorypeak", None):
+            if memorypeak.current_count != 0:
+                obs["memory"] = f"{int(memorypeak.max)} MiB"
+
+        observations.append(obs)
+        config["observations"] = list(sorted(observations, key=lambda x: x["batch_size"]))
+
+    def save(self):
+        syslog("Saving scaling file")
+
-    def report(self, *args):
         if self.filepath is not None:
             with open(self.filepath, "w") as file:
                 yaml.dump(self.memory, file, Dumper=compact_dump())

+    def report(self, *args):
+        syslog("end")
+        self.save()


 def arch_to_device(arch):
     device_types = [
@@ -462,7 +499,10 @@ def batch_resize(default):
     newvalue = default

     if gpu_opt.enabled:
-        newvalue = suggested_batch_size(pack)
+        if gpu_opt.add is not None and isinstance(gpu_opt.add, int):
+            newvalue = max(1, default + gpu_opt.add)
+        else:
+            newvalue = suggested_batch_size(pack)

     broadcast(on_batch_size_set, pack, default, newvalue)
     return newvalue
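
The effect of the new branch is easiest to see in isolation. Below is a minimal, self-contained sketch of the resolution order that batch_resize now follows; resolve_batch_size and the suggest callback (standing in for suggested_batch_size(pack)) are illustrative names, not milabench API:

    def resolve_batch_size(default, add=None, enabled=True, suggest=lambda d: d):
        """Sketch of batch_resize's new resolution order.

        - resizing disabled              -> keep the config default
        - fixed increment configured     -> default + add, clamped to >= 1
          (sizer.batch_size_add)
        - otherwise                      -> defer to the sizing model
        """
        if not enabled:
            return default
        if isinstance(add, int):
            return max(1, default + add)
        return suggest(default)

    assert resolve_batch_size(256, enabled=False) == 256  # untouched default
    assert resolve_batch_size(256, add=-8) == 248         # increment applied
    assert resolve_batch_size(4, add=-16) == 1            # clamped to >= 1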
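The on_end rework is the subtler half of the commit: when several processes report for the same benchmark, an observation should be pushed once, after the last of them ends, and only if none failed. A condensed sketch of that accounting, assuming one on_start/on_end pair per process (ObservationGate is an illustrative name, not part of milabench):

    from collections import defaultdict

    class ObservationGate:
        """Per-benchmark refcount plus accumulated return codes."""

        def __init__(self, push):
            self.active_count = defaultdict(int)  # runs still in flight
            self.rc = defaultdict(int)            # sum of return codes seen
            self.push = push                      # called once per finished benchmark

        def on_start(self, bench):
            self.active_count[bench] += 1

        def on_end(self, bench, rc, early_stopped=False):
            if early_stopped:   # an early stop still counts as a success
                rc = 0
            self.rc[bench] += rc
            self.active_count[bench] -= 1
            # push only when the last run ends, and only if all runs succeeded
            if self.active_count[bench] <= 0 and self.rc[bench] == 0:
                self.push(bench)

    gate = ObservationGate(push=print)
    gate.on_start("resnet50")
    gate.on_start("resnet50")
    gate.on_end("resnet50", rc=0)
    gate.on_end("resnet50", rc=0)  # only this call prints "resnet50"
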
3 changes: 3 additions & 0 deletions milabench/system.py
@@ -184,6 +184,9 @@ class SizerOptions:
     # overrides the batch size to use for all benchmarks
     size: int = defaultfield("sizer.batch_size", int, None)

+    # Add a fixed number to the current batch size
+    add: int = defaultfield("sizer.batch_size_add", int, None)
+
     # Enables auto batch resize
     autoscale: bool = defaultfield("sizer.auto", int, 0)

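Since the field resolves through "sizer.batch_size_add", it should be drivable like any other sizer option. A hypothetical multirun entry in the style of config/examples/system.yaml, sweeping the increment rather than absolute sizes (the name pattern and value list are illustrative, and the interaction with sizer.auto is assumed, not confirmed by this commit):

    # Hypothetical: sweep a batch-size increment instead of absolute sizes
    - name: "bsadd{sizer.batch_size_add}.{time}"
      matrix:
        sizer.auto: 1
        sizer.batch_size_add: [-8, 0, 8, 16]
        sizer.save: ["scaling.yaml"]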
