Commit

Tweak memory extract to use a batch increment instead of an absolute batch size
pierre.delaunay committed Feb 6, 2025
1 parent 6cdc40e commit 99c01a8
Showing 5 changed files with 83 additions and 49 deletions.
4 changes: 2 additions & 2 deletions config/base.yaml
@@ -216,7 +216,7 @@ resnet50-noio:
   inherits: _torchvision
   voir:
     options:
-      stop: 500
+      stop: 60
       interval: "1s"

   tags:
@@ -229,7 +229,7 @@ resnet50-noio:

   argv:
     --model: resnet50
-    --batch-size: batch_resize(256)
+    --batch-size: auto_batch(256)
     --loader: synthetic_fixed
     --optim: channel_last

6 changes: 3 additions & 3 deletions config/examples/system.yaml
@@ -41,14 +41,14 @@ system:
 multirun:
   runs:
     # Force batch size to populate the sizing model
-    - name: "bs{sizer.batch_size}"
+    - name: "bs{sizer.batch_size}.{time}"
       matrix:
         sizer.auto: 1
         sizer.batch_size: [1, 2, 4, 8, 16, 32, 64, 128]
         sizer.save: ["scaling.yaml"]

     # Matrix run
-    - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}"
+    - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}.{time}"
       matrix:
         cpu.auto: 1
         cpu.n_workers: [2, 4, 8, 16, 32]
@@ -58,7 +58,7 @@ multirun:
         sizer.save: ["scaling.yaml"]

     # Auto run
-    - name: "auto"
+    - name: "auto.{time}"
       matrix:
         cpu.auto: 1
         sizer.auto: 1
13 changes: 2 additions & 11 deletions config/scaling/L40S.yaml
@@ -1,13 +1,4 @@
-resnet152-ddp-gpus:
-  observations:
-  - {batch_size: 0, cpu: 8, memory: 25995 MiB, perf: 1995.03}
 resnet50:
   observations:
-  - {batch_size: 1, cpu: 8, memory: 46041 MiB, perf: 78.92}
-  - {batch_size: 2, cpu: 8, memory: 1237 MiB, perf: 159.82}
-  - {batch_size: 2, cpu: 8, memory: 34611 MiB, perf: 161.62}
-  - {batch_size: 4, cpu: 8, memory: 1331 MiB, perf: 315.69}
-  - {batch_size: 4, cpu: 8, memory: 46041 MiB, perf: 308.07}
-  - {batch_size: 8, cpu: 8, memory: 1489 MiB, perf: 603.96}
-  - {batch_size: 8, cpu: 8, memory: 46041 MiB, perf: 602.72}
-version: 2.0
+  - {batch_size: 248, cpu: 8, memory: 12798 MiB, perf: 654.31}
+version: 2
106 changes: 73 additions & 33 deletions milabench/sizer.py
@@ -9,6 +9,7 @@
 from voir.instruments.gpu import get_gpu_info
 from cantilever.core.statstream import StatStream

+from .syslog import syslog
 from .system import CPUOptions, SizerOptions, system_global, option
 from .validation.validation import ValidationLayer

@@ -270,10 +271,12 @@ class MemoryUsageExtractor(ValidationLayer):
"""Extract max memory usage per benchmark to populate the memory model"""

def __init__(self):
syslog("new extrator")

self.filepath = option("sizer.save", str, None)
sizer = Sizer()

if os.path.exists(self.filepath):
if self.filepath and os.path.exists(self.filepath):
with open(self.filepath, "r") as fp:
self.memory = yaml.safe_load(fp) or {}
else:
@@ -286,7 +289,9 @@ def __init__(self):
         self.scaling = None
         self.stats = defaultdict(lambda: StatStream(drop_first_obs=0))
         self.benchname = None
-        self.batch_size = 0
+        self.batch_size = None
+        self.active_count = defaultdict(int)
+        self.rc = defaultdict(int)
         self.max_usage = float("-inf")  # Usage from the gpu monitor
         self.peak_usage = float("-inf")  # Usage provided by the bench itself (for jax)
         self.early_stopped = False
@@ -302,6 +307,7 @@ def on_cpu_count_set(self, pack, _, value):
self.stats["cpu"] += value

def on_batch_size_set(self, pack, _, value):
self.batch_size = value
self.stats["batch_size"] += value

def convert(self):
@@ -322,11 +328,13 @@ def convert(self):
     def on_start(self, entry):
         if self.filepath is None:
             return

         self.benchname = entry.pack.config["name"]
         self.max_usage = float("-inf")
         self.peak_usage = float("-inf")

+        self.active_count[self.benchname] += 1
+
     def on_data(self, entry):
         if self.filepath is None:
             return
@@ -359,46 +367,75 @@ def on_end(self, entry):
         if self.filepath is None:
             return

-        if (
-            self.benchname is None
-            or self.batch_size is None
-            or self.max_memory_usage() == float("-inf")
-        ):
+        if self.benchname is None:
+            syslog("Skipping missing benchmark {}", entry)
             return

+        if self.batch_size is None:
+            syslog("Skipping missing batch_size {}", entry)
+            return
+
+        if self.max_memory_usage() == float("-inf"):
+            syslog("Missing memory info {}", entry)
+            return
+
         # Only update if successful
         rc = entry.data["return_code"]

-        if rc == 0 or self.early_stopped:
-            config = self.memory.setdefault(self.benchname, dict())
-            observations = config.setdefault("observations", [])
-
-            obs = {
-                "cpu": int(self.stats["cpu"].avg),
-                "batch_size": int(self.stats["batch_size"].avg),
-                "memory": f"{int(self.stats['memory'].max)} MiB",
-                "perf": float(f"{self.stats['perf'].avg:.2f}"),
-            }
-
-            if memorypeak := self.stats.pop("memorypeak", None):
-                if memorypeak.current_count != 0:
-                    obs["memory"] = f"{int(memorypeak.max)} MiB",
-
-            observations.append(obs)
-
-            config["observations"] = list(sorted(observations, key=lambda x: x["batch_size"]))
-
-        self.benchname = None
-        self.batch_size = None
-        self.stats = defaultdict(lambda: StatStream(drop_first_obs=0))
-        self.max_usage = float("-inf")
-        self.peak_usage = float("-inf")
+        if self.early_stopped:
+            rc = 0
+
+        self.rc[self.benchname] += rc
+        self.active_count[self.benchname] -= 1
+
+        if self.active_count[self.benchname] <= 0:
+
+            if self.rc[self.benchname] == 0:
+                syslog("adding new obs")
+                self.push_observation()
+            else:
+                syslog("Could not add scaling data because of a failure")
+
+            self.benchname = None
+            self.batch_size = None
+            self.stats = defaultdict(lambda: StatStream(drop_first_obs=0))
+            self.max_usage = float("-inf")
+            self.peak_usage = float("-inf")
+        # avoid losing results
+        try:
+            self.save()
+        except Exception as err:
+            print(f"Could not save scaling file because of {err}")
+
+    def push_observation(self):
+        config = self.memory.setdefault(self.benchname, dict())
+        observations = config.setdefault("observations", [])
+
+        obs = {
+            "cpu": int(self.stats["cpu"].avg),
+            "batch_size": int(self.stats["batch_size"].avg),
+            "memory": f"{int(self.stats['memory'].max)} MiB",
+            "perf": float(f"{self.stats['perf'].avg:.2f}"),
+        }
+
+        if memorypeak := self.stats.pop("memorypeak", None):
+            if memorypeak.current_count != 0:
+                obs["memory"] = f"{int(memorypeak.max)} MiB"
+
+        observations.append(obs)
+        config["observations"] = list(sorted(observations, key=lambda x: x["batch_size"]))
+
+    def save(self):
+        syslog("Saving scaling file")
+
-    def report(self, *args):
         if self.filepath is not None:
             with open(self.filepath, "w") as file:
                 yaml.dump(self.memory, file, Dumper=compact_dump())

+    def report(self, *args):
+        syslog("end")
+        self.save()


 def arch_to_device(arch):
     device_types = [
@@ -462,7 +499,10 @@ def batch_resize(default):
     newvalue = default

     if gpu_opt.enabled:
-        newvalue = suggested_batch_size(pack)
+        if gpu_opt.add is not None and isinstance(gpu_opt.add, int):
+            newvalue = max(1, default + gpu_opt.add)
+        else:
+            newvalue = suggested_batch_size(pack)

     broadcast(on_batch_size_set, pack, default, newvalue)
     return newvalue
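
The effect of the new branch is easiest to see in isolation. Below is a minimal, self-contained sketch of the resolution order that batch_resize now follows; resolve_batch_size and the suggest callback (standing in for suggested_batch_size(pack)) are illustrative names, not milabench API:

    def resolve_batch_size(default, add=None, enabled=True, suggest=lambda d: d):
        """Sketch of batch_resize's new resolution order.

        - resizing disabled              -> keep the config default
        - fixed increment configured     -> default + add, clamped to >= 1
          (sizer.batch_size_add)
        - otherwise                      -> defer to the sizing model
        """
        if not enabled:
            return default
        if isinstance(add, int):
            return max(1, default + add)
        return suggest(default)

    assert resolve_batch_size(256, enabled=False) == 256  # untouched default
    assert resolve_batch_size(256, add=-8) == 248         # increment applied
    assert resolve_batch_size(4, add=-16) == 1            # clamped to >= 1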
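The on_end rework is the subtler half of the commit: when several processes report for the same benchmark, an observation should be pushed once, after the last of them ends, and only if none failed. A condensed sketch of that accounting, assuming one on_start/on_end pair per process (ObservationGate is an illustrative name, not part of milabench):

    from collections import defaultdict

    class ObservationGate:
        """Per-benchmark refcount plus accumulated return codes."""

        def __init__(self, push):
            self.active_count = defaultdict(int)  # runs still in flight
            self.rc = defaultdict(int)            # sum of return codes seen
            self.push = push                      # called once per finished benchmark

        def on_start(self, bench):
            self.active_count[bench] += 1

        def on_end(self, bench, rc, early_stopped=False):
            if early_stopped:   # an early stop still counts as a success
                rc = 0
            self.rc[bench] += rc
            self.active_count[bench] -= 1
            # push only when the last run ends, and only if all runs succeeded
            if self.active_count[bench] <= 0 and self.rc[bench] == 0:
                self.push(bench)

    gate = ObservationGate(push=print)
    gate.on_start("resnet50")
    gate.on_start("resnet50")
    gate.on_end("resnet50", rc=0)
    gate.on_end("resnet50", rc=0)  # only this call prints "resnet50"
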
3 changes: 3 additions & 0 deletions milabench/system.py
@@ -184,6 +184,9 @@ class SizerOptions:
     # overrides the batch size to use for all benchmarks
     size: int = defaultfield("sizer.batch_size", int, None)

+    # Add a fixed number to the current batch size
+    add: int = defaultfield("sizer.batch_size_add", int, None)
+
     # Enables auto batch resize
     autoscale: bool = defaultfield("sizer.auto", int, 0)

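Since the field resolves through "sizer.batch_size_add", it should be drivable like any other sizer option. A hypothetical multirun entry in the style of config/examples/system.yaml, sweeping the increment rather than absolute sizes (the name pattern and value list are illustrative, and the interaction with sizer.auto is assumed, not confirmed by this commit):

    # Hypothetical: sweep a batch-size increment instead of absolute sizes
    - name: "bsadd{sizer.batch_size_add}.{time}"
      matrix:
        sizer.auto: 1
        sizer.batch_size_add: [-8, 0, 8, 16]
        sizer.save: ["scaling.yaml"]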
