From 3d2718031897bcc5eac93e1304442ba528f5840c Mon Sep 17 00:00:00 2001 From: Pierre Delaunay <pierre@delaunay.io> Date: Thu, 21 Nov 2024 13:04:41 -0500 Subject: [PATCH] Revert bad commit --- benchmarks/diffusion/main.py | 5 -- benchmarks/flops/benchfile.py | 14 ++---- benchmarks/flops/dev.yaml | 56 --------------------- benchmarks/flops/dev/extra/flops/mark_torch | 0 benchmarks/flops/main.py | 3 +- benchmarks/flops/requirements.cpu.txt | 5 -- benchmarks/flops/simple.sh | 13 ----- benchmarks/geo_gnn/modelsize.py | 36 ------------- benchmarks/purejaxrl/dqn.py | 5 -- benchmarks/purejaxrl/ppo.py | 6 --- benchmarks/recursiongfn/main.py | 4 +- benchmarks/torchatari/main.py | 3 -- benchmate/benchmate/models.py | 36 ------------- benchmate/benchmate/monitor.py | 35 ++++--------- milabench/_version.py | 6 +-- milabench/commands/executors.py | 3 -- milabench/pack.py | 2 - milabench/report.py | 4 +- 18 files changed, 21 insertions(+), 215 deletions(-) delete mode 100644 benchmarks/flops/dev.yaml delete mode 100644 benchmarks/flops/dev/extra/flops/mark_torch delete mode 100644 benchmarks/flops/requirements.cpu.txt delete mode 100644 benchmarks/flops/simple.sh delete mode 100644 benchmarks/geo_gnn/modelsize.py delete mode 100644 benchmate/benchmate/models.py diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py index c5b7757ee..0bcb67d50 100755 --- a/benchmarks/diffusion/main.py +++ b/benchmarks/diffusion/main.py @@ -57,11 +57,6 @@ def models(accelerator, args: Arguments): unet = UNet2DConditionModel.from_pretrained( args.model, subfolder="unet", revision=args.revision, variant=args.variant ) - - from benchmate.models import model_size - print(model_size(unet)) - print(model_size(encoder)) - print(model_size(vae)) vae.requires_grad_(False) encoder.requires_grad_(False) diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py index 3090a3c75..59c5c4a7f 100644 --- a/benchmarks/flops/benchfile.py +++ b/benchmarks/flops/benchfile.py @@ -8,15 +8,11 @@ class FlopsBenchmarch(Package): def build_run_plan(self) -> "Command": import milabench.commands as cmd - main = self.dirs.code / self.main_script + pack = cmd.PackCommand(self, *self.argv, lazy=True) - - use_stdout = True - - if use_stdout: - return pack.use_stdout() - else: - pack = cmd.VoirCommand(pack, cwd=main.parent) - return pack + # pack = cmd.VoirCommand(pack, cwd=main.parent) + pack = cmd.ActivatorCommand(pack) + return pack.use_stdout() + __pack__ = FlopsBenchmarch diff --git a/benchmarks/flops/dev.yaml b/benchmarks/flops/dev.yaml deleted file mode 100644 index 5106bf407..000000000 --- a/benchmarks/flops/dev.yaml +++ /dev/null @@ -1,56 +0,0 @@ - - -_flops: - inherits: _defaults - definition: . - group: flops - install-variant: unpinned - install_group: torch - plan: - method: per_gpu - - tags: - - diagnostic - - flops - - monogpu - - nobatch - - argv: - --number: 30 - --repeat: 90 - - -fp16: - inherits: _flops - - argv: - --number: 30 - --repeat: 10 - --m: 8192 - --n: 8192 - --dtype: fp16 - -bf16: - inherits: _flops - - argv: - --m: 8192 - --n: 8192 - --dtype: bf16 - -tf32: - inherits: _flops - - argv: - --m: 8192 - --n: 8192 - --dtype: fp32 - --tf32: true - -fp32: - inherits: _flops - - argv: - --m: 256 - --n: 256 - --dtype: fp32 diff --git a/benchmarks/flops/dev/extra/flops/mark_torch b/benchmarks/flops/dev/extra/flops/mark_torch deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmarks/flops/main.py b/benchmarks/flops/main.py index ba03518a6..e4f05c178 100755 --- a/benchmarks/flops/main.py +++ b/benchmarks/flops/main.py @@ -109,12 +109,11 @@ def main(): log, monitor = setupvoir() - # FIXME - #with monitor: f(args.number, args.repeat, args.m, args.n, TERA, dtypes[args.dtype], log) monitor.stop() + if __name__ == "__main__": main() print("done") diff --git a/benchmarks/flops/requirements.cpu.txt b/benchmarks/flops/requirements.cpu.txt deleted file mode 100644 index 88f8b61e5..000000000 --- a/benchmarks/flops/requirements.cpu.txt +++ /dev/null @@ -1,5 +0,0 @@ -torch -torchvision -torchcompat -tqdm -voir diff --git a/benchmarks/flops/simple.sh b/benchmarks/flops/simple.sh deleted file mode 100644 index 3f54d4243..000000000 --- a/benchmarks/flops/simple.sh +++ /dev/null @@ -1,13 +0,0 @@ - - - - -export MILABENCH_BASE="$(pwd)/dev" -export MILABENCH_CONFIG="$(pwd)/dev.yaml" - - -milabench install --select fp32 - -milabench prepare --select fp32 - -milabench run --select fp32 diff --git a/benchmarks/geo_gnn/modelsize.py b/benchmarks/geo_gnn/modelsize.py deleted file mode 100644 index 0b65655a2..000000000 --- a/benchmarks/geo_gnn/modelsize.py +++ /dev/null @@ -1,36 +0,0 @@ -from torch_geometric.nn.models import PNA as _PNA, DimeNet as _DimeNet - -import torch - -from benchmate.models import model_size - - -print(model_size(_DimeNet( - hidden_channels=64, - out_channels=1, - num_blocks=6, - num_bilinear=8, - num_spherical=7, - num_radial=6, - cutoff=10.0, - envelope_exponent=5, - num_before_skip=1, - num_after_skip=2, - num_output_layers=3, -) - -)) - -print(model_size( -_PNA( - # Basic GCNN setup - in_channels=1, - out_channels=1, - hidden_channels=64, - num_layers=64, - # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.PNAConv.html - aggregators=['mean', 'min', 'max', 'std'], - scalers=['identity', 'amplification', 'attenuation'], - # Histogram of in-degrees of nodes in the training set, used by scalers to normalize - deg=torch.tensor(4), -))) \ No newline at end of file diff --git a/benchmarks/purejaxrl/dqn.py b/benchmarks/purejaxrl/dqn.py index 85e9e8b5d..fc0a97b8d 100644 --- a/benchmarks/purejaxrl/dqn.py +++ b/benchmarks/purejaxrl/dqn.py @@ -98,11 +98,6 @@ def train(rng): init_x = jnp.zeros(env.observation_space(env_params).shape) network_params = network.init(_rng, init_x) - - param_count = sum(x.size for x in jax.tree.leaves(network_params)) - print("PARAM COUNT", param_count) - - def linear_schedule(count): frac = 1.0 - (count / config["NUM_UPDATES"]) return config["LR"] * frac diff --git a/benchmarks/purejaxrl/ppo.py b/benchmarks/purejaxrl/ppo.py index a70e195ea..0cc8896cc 100644 --- a/benchmarks/purejaxrl/ppo.py +++ b/benchmarks/purejaxrl/ppo.py @@ -107,15 +107,9 @@ def train(rng): network = ActorCritic( env.action_space(env_params).shape[0], activation=config["ACTIVATION"] ) - - rng, _rng = jax.random.split(rng) init_x = jnp.zeros(env.observation_space(env_params).shape) network_params = network.init(_rng, init_x) - - param_count = sum(x.size for x in jax.tree.leaves(network_params)) - print("PARAM COUNT", param_count) - if config["ANNEAL_LR"]: tx = optax.chain( optax.clip_by_global_norm(config["MAX_GRAD_NORM"]), diff --git a/benchmarks/recursiongfn/main.py b/benchmarks/recursiongfn/main.py index 7099247dc..81d08e8aa 100644 --- a/benchmarks/recursiongfn/main.py +++ b/benchmarks/recursiongfn/main.py @@ -92,13 +92,11 @@ def __init__( self.num_cond_dim = self.temperature_conditional.encoding_size() def _load_task_models(self): - xdg_cache = os.environ.get("XDG_CACHE_HOME") + xdg_cache = os.environ["XDG_CACHE_HOME"] model = bengio2021flow.load_original_model( cache=True, location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")), ) - from benchmate.models import model_size - print(model_size(model)) model.to(get_worker_device()) model = self._wrap_model(model) return {"seh": model} diff --git a/benchmarks/torchatari/main.py b/benchmarks/torchatari/main.py index 898a8cabe..bf5b7ef65 100644 --- a/benchmarks/torchatari/main.py +++ b/benchmarks/torchatari/main.py @@ -201,10 +201,7 @@ def main(): envs = RecordEpisodeStatistics(envs) assert isinstance(envs.action_space, gym.spaces.Discrete), "only discrete action space is supported" - - from benchmate.models import model_size agent = Agent(envs).to(device) - print(model_size(agent)) optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) # ALGO Logic: Storage setup diff --git a/benchmate/benchmate/models.py b/benchmate/benchmate/models.py deleted file mode 100644 index efd13e2c6..000000000 --- a/benchmate/benchmate/models.py +++ /dev/null @@ -1,36 +0,0 @@ - - -def model_summary(model, input_shape): - try: - from torchsummary import summary - - summary(model, input_shape) - except: - print("Could not print summary") - - -def model_size(model): - param_size = 0 - param_count = 0 - for param in model.parameters(): - param_count += param.nelement() - param_size += param.nelement() * param.element_size() - - buffer_size = 0 - buffer_count = 0 - for buff in model.buffers(): - buffer_count += buff.nelement() - buffer_size += buff.nelement() * buff.element_size() - - return { - "param": { - "count": param_count, - "size": param_size / 1024**2, - "unit": "MB" - }, - "buffer": { - "count": buffer_count, - "size": buffer_size / 1024**2, - "unit": "MB" - } - } diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py index 294d0c88f..0ad34a3d3 100644 --- a/benchmate/benchmate/monitor.py +++ b/benchmate/benchmate/monitor.py @@ -13,30 +13,7 @@ from voir.instruments.io import io_monitor from voir.instruments.network import network_monitor from voir.instruments.monitor import monitor -from voir.helpers import current_overseer -from .metrics import sumggle_push, give_push, file_push - - -def auto_push(): - # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0)) - mb_managed = int(os.getenv("MILABENCH_MANAGED", 0)) - - # Milabench managed: we need to push metrics to it - if mb_managed == 1: - # Using voir, DATA_FD is defined as well - ov = current_overseer.get() - if ov is not None: - return ov.give - - # Not using Voir, using structured stdout - if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1: - return sumggle_push() - - raise RuntimeError("Could not find something to push to") - - # Not using milabench; using stdout - return file_push() @instrument_definition @@ -64,10 +41,16 @@ def monitor_node(ov, poll_interval=1, arch=None): def _smuggle_monitor(poll_interval=10, worker_init=None, **monitors): - log = auto_push() - + data_file = SmuggleWriter(sys.stdout) def mblog(data): - log(**data) + nonlocal data_file + + if data_file is not None: + try: + print(json.dumps(data), file=data_file) + except ValueError: + pass + # print("Is bench ending?, ignoring ValueError") def get(): t = time.time() diff --git a/milabench/_version.py b/milabench/_version.py index 5f55a16da..cdd2418dd 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-129-ga60a3aa" -__commit__ = "a60a3aae21e87e46bcce403620a3f56c12878554" -__date__ = "2024-11-06 22:52:12 -0500" +__tag__ = "v1.0.0_RC1-9-g6d1e1140" +__commit__ = "6d1e114000cc4200ea307330032234db6696e40d" +__date__ = "2024-09-30 14:39:43 -0400" diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py index 807a261e2..f0402d29b 100644 --- a/milabench/commands/executors.py +++ b/milabench/commands/executors.py @@ -32,9 +32,6 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals sized_args = scale_argv(pack, args) final_args = resolve_argv(pack, sized_args) - if use_stdout: - exec_env["MILABENCH_USE_STDOUT"] = "1" - return await run( final_args, **kwargs, diff --git a/milabench/pack.py b/milabench/pack.py index 20feca39d..1cdde0939 100644 --- a/milabench/pack.py +++ b/milabench/pack.py @@ -335,8 +335,6 @@ def make_env(self): f"MILABENCH_DIR_{name.upper()}": path for name, path in self.config["dirs"].items() } - - env["MILABENCH_MANAGED"] = "1" env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}") diff --git a/milabench/report.py b/milabench/report.py index bdc4999cc..c54ed8ddd 100644 --- a/milabench/report.py +++ b/milabench/report.py @@ -525,12 +525,12 @@ def pandas_to_string(df, formatters=_formatters): # Compute column size col_size = defaultdict(int) for index, row in df.iterrows(): - col_size["bench"] = max(col_size["bench"], len(index), len("bench")) + col_size["bench"] = max(col_size["bench"], len(index)) for col, val in zip(columns, row): fmt = formatters.get(col) if fmt is not None: val = fmt(val) - col_size[col] = max(col_size[col], len(val), len(col)) + col_size[col] = max(col_size[col], len(val)) # Generate report sep = " | "