diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index d4fb5277..86cd83e5 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -51,8 +51,7 @@
     ".venv": true,
     ".pytest_cache": true,
     ".benchmarks": true,
-    ".ruff_cache": true,
-    ".regression_files": true
+    ".ruff_cache": true
   },
   "python.testing.unittestEnabled": false,
   "python.testing.pytestEnabled": true,
@@ -85,7 +84,6 @@
   "containerEnv": {
     "SCRATCH": "/home/vscode/scratch",
     "SLURM_TMPDIR": "/tmp",
-    "NETWORK_DIR": "/network",
     "UV_LINK_MODE": "symlink",
     "UV_CACHE_DIR": "/home/vscode/.uv_cache"
   },
diff --git a/.github/actions-runner-job.sh b/.github/actions-runner-job.sh
index 432b9a84..4fa7d1e2 100755
--- a/.github/actions-runner-job.sh
+++ b/.github/actions-runner-job.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=1
-#SBATCH --mem=16G
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=32G
 #SBATCH --gpus=rtx8000:1
 #SBATCH --time=00:30:00
 #SBATCH --dependency=singleton
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 70e93c2a..35298b2b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -87,7 +87,7 @@ jobs:
   local_integration_tests:
     needs: [unit_tests, check_docs]
     runs-on: self-hosted
-    timeout-minutes: 20
+    timeout-minutes: 30
     strategy:
       max-parallel: 1
       matrix:
@@ -150,7 +150,7 @@
     name: Run integration tests on the ${{ matrix.cluster }} cluster in job ${{ needs.launch-slurm-actions-runner.outputs.job_id}}
     needs: [launch-slurm-actions-runner]
     runs-on: ${{ matrix.cluster }}
-    timeout-minutes: 20
+    timeout-minutes: 30
     strategy:
       max-parallel: 5
       matrix:
diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml
deleted file mode 100644
index 5dab27b0..00000000
--- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-input:
-  device: cpu
-  hash: -1373365636602041987
-  max: 2.1
-  mean: -0.0
-  min: -2.0
-  shape:
-  - 128
-  - 3
-  - 32
-  - 32
-  sum: -2429.8
-out:
-  device: cpu
-  hash: -5286755934104888446
-  max: 0.7
-  mean: 0.0
-  min: -0.8
-  shape:
-  - 128
-  - 10
-  sum: 20.2
diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml
deleted file mode 100644
index aaa55377..00000000
--- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-input:
-  device: cpu
-  hash: 9223185275738543696
-  max: 2.8
-  mean: 0.5
-  min: -0.4
-  shape:
-  - 128
-  - 1
-  - 28
-  - 28
-  sum: 48391.2
-out:
-  device: cpu
-  hash: 3229404000460739909
-  max: 1.2
-  mean: -0.0
-  min: -1.1
-  shape:
-  - 128
-  - 10
-  sum: -40.6
diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml
deleted file mode 100644
index 0d41f6d3..00000000
--- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-input: - device: cpu - hash: 8611995894311838429 - max: 2.8 - mean: 0.0 - min: -0.4 - shape: - - 128 - - 1 - - 28 - - 28 - sum: 1437.2 -out: - device: cpu - hash: -4763233483389115210 - max: 0.8 - mean: -0.0 - min: -0.9 - shape: - - 128 - - 10 - sum: -30.8 diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml deleted file mode 100644 index dea2f076..00000000 --- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -input: - device: cpu - hash: -1373365636602041987 - max: 2.1 - mean: -0.0 - min: -2.0 - shape: - - 128 - - 3 - - 32 - - 32 - sum: -2429.8 -out: - device: cpu - hash: -1856253906003733022 - max: 2.1 - mean: -0.2 - min: -3.0 - shape: - - 128 - - 10 - sum: -265.8 diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml deleted file mode 100644 index 78bbee98..00000000 --- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -input: - device: cpu - hash: -1373365636602041987 - max: 2.1 - mean: -0.0 - min: -2.0 - shape: - - 128 - - 3 - - 32 - - 32 - sum: -2429.8 -out: - device: cpu - hash: -9209917346416037156 - max: 6.0 - mean: 0.3 - min: -5.2 - shape: - - 128 - - 10 - sum: 322.7 diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_cifar10_example.yaml b/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_cifar10_example.yaml deleted file mode 100644 index 66b7eef8..00000000 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_cifar10_example.yaml +++ /dev/null @@ -1,51 +0,0 @@ -network.0.1.bias: - device: cpu - max: '1.770e-02' - mean: '-1.236e-04' - min: '-1.797e-02' - shape: - - 128 - sum: '-1.581e-02' -network.0.1.weight: - device: cpu - max: '1.804e-02' - mean: '-8.050e-06' - min: '-1.804e-02' - shape: - - 128 - - 3072 - sum: '-3.166e+00' -network.1.0.bias: - device: cpu - max: '8.806e-02' - mean: '-3.074e-03' - min: '-8.612e-02' - shape: - - 128 - sum: '-3.935e-01' -network.1.0.weight: - device: cpu - max: '8.836e-02' - mean: '5.354e-04' - min: '-8.837e-02' - shape: - - 128 - - 128 - sum: '8.773e+00' -network.2.0.bias: - device: cpu - max: '8.265e-02' - mean: '2.135e-02' - min: '-2.476e-02' - shape: - - 10 - sum: '2.135e-01' -network.2.0.weight: - device: cpu - max: '8.824e-02' - mean: '-6.046e-04' - min: '-8.823e-02' - shape: - - 10 - - 128 - sum: '-7.739e-01' diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml b/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml deleted file mode 100644 index 309c24b7..00000000 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml +++ /dev/null @@ -1,51 +0,0 @@ -network.0.1.bias: - device: cpu - max: '3.564e-02' - mean: '-5.232e-04' - min: '-3.566e-02' - shape: - - 128 - sum: '-6.697e-02' 
-network.0.1.weight: - device: cpu - max: '3.571e-02' - mean: '7.122e-05' - min: '-3.571e-02' - shape: - - 128 - - 784 - sum: '7.147e+00' -network.1.0.bias: - device: cpu - max: '8.382e-02' - mean: '-9.825e-03' - min: '-8.787e-02' - shape: - - 128 - sum: '-1.258e+00' -network.1.0.weight: - device: cpu - max: '8.838e-02' - mean: '1.486e-04' - min: '-8.838e-02' - shape: - - 128 - - 128 - sum: '2.434e+00' -network.2.0.bias: - device: cpu - max: '7.293e-02' - mean: '1.038e-02' - min: '-8.284e-02' - shape: - - 10 - sum: '1.038e-01' -network.2.0.weight: - device: cpu - max: '8.835e-02' - mean: '-1.525e-03' - min: '-8.816e-02' - shape: - - 10 - - 128 - sum: '-1.952e+00' diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_mnist_example.yaml b/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_mnist_example.yaml deleted file mode 100644 index 309c24b7..00000000 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_mnist_example.yaml +++ /dev/null @@ -1,51 +0,0 @@ -network.0.1.bias: - device: cpu - max: '3.564e-02' - mean: '-5.232e-04' - min: '-3.566e-02' - shape: - - 128 - sum: '-6.697e-02' -network.0.1.weight: - device: cpu - max: '3.571e-02' - mean: '7.122e-05' - min: '-3.571e-02' - shape: - - 128 - - 784 - sum: '7.147e+00' -network.1.0.bias: - device: cpu - max: '8.382e-02' - mean: '-9.825e-03' - min: '-8.787e-02' - shape: - - 128 - sum: '-1.258e+00' -network.1.0.weight: - device: cpu - max: '8.838e-02' - mean: '1.486e-04' - min: '-8.838e-02' - shape: - - 128 - - 128 - sum: '2.434e+00' -network.2.0.bias: - device: cpu - max: '7.293e-02' - mean: '1.038e-02' - min: '-8.284e-02' - shape: - - 10 - sum: '1.038e-01' -network.2.0.weight: - device: cpu - max: '8.835e-02' - mean: '-1.525e-03' - min: '-8.816e-02' - shape: - - 10 - - 128 - sum: '-1.952e+00' diff --git a/.regression_files/project/algorithms/hf_example_test/test_backward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml b/.regression_files/project/algorithms/hf_example_test/test_backward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml deleted file mode 100644 index f91a9de7..00000000 --- a/.regression_files/project/algorithms/hf_example_test/test_backward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml +++ /dev/null @@ -1,286 +0,0 @@ -batch.attention_mask: - device: cpu - max: 1 - mean: '8.374e-02' - min: 0 - shape: - - 32 - - 128 - sum: 343 -batch.input_ids: - device: cpu - max: 26101 - mean: '1.597e+02' - min: 0 - shape: - - 32 - - 128 - sum: 654306 -batch.labels: - device: cpu - max: 1 - mean: '7.188e-01' - min: 0 - shape: - - 32 - sum: 23 -batch.token_type_ids: - device: cpu - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 -grads.network.albert.embeddings.LayerNorm.bias: - device: cpu - max: '9.495e-03' - mean: '-1.080e-05' - min: '-1.796e-02' - shape: - - 128 - sum: '-1.383e-03' -grads.network.albert.embeddings.LayerNorm.weight: - device: cpu - max: '1.186e-02' - mean: '-2.625e-04' - min: '-1.228e-02' - shape: - - 128 - sum: '-3.360e-02' -grads.network.albert.embeddings.position_embeddings.weight: - device: cpu - max: '6.970e-01' - mean: '-3.638e-12' - min: '-1.086e+00' - shape: - - 512 - - 128 - sum: '-2.384e-07' -grads.network.albert.embeddings.token_type_embeddings.weight: - device: cpu - max: '6.053e-01' - mean: '-1.863e-09' - min: '-1.119e+00' - shape: - - 2 - - 128 - sum: '-4.768e-07' 
-grads.network.albert.embeddings.word_embeddings.weight: - device: cpu - max: '1.541e+00' - mean: '-2.008e-13' - min: '-6.233e-01' - shape: - - 30000 - - 128 - sum: '-7.711e-07' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.bias: - device: cpu - max: '6.357e-02' - mean: '-3.738e-04' - min: '-6.593e-02' - shape: - - 768 - sum: '-2.871e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.weight: - device: cpu - max: '8.125e-02' - mean: '1.121e-04' - min: '-5.811e-01' - shape: - - 768 - sum: '8.612e-02' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.bias: - device: cpu - max: '6.013e-02' - mean: '-1.940e-11' - min: '-5.395e-02' - shape: - - 768 - sum: '-1.490e-08' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight: - device: cpu - max: '1.061e-01' - mean: '4.042e-13' - min: '-1.112e-01' - shape: - - 768 - - 768 - sum: '2.384e-07' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias: - device: cpu - max: '1.275e-08' - mean: '-1.333e-11' - min: '-6.650e-09' - shape: - - 768 - sum: '-1.023e-08' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight: - device: cpu - max: '6.536e-01' - mean: '4.320e-06' - min: '-3.507e-01' - shape: - - 768 - - 768 - sum: '2.548e+00' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias: - device: cpu - max: '2.402e-02' - mean: '2.56e-05' - min: '-1.913e-02' - shape: - - 768 - sum: '1.966e-02' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight: - device: cpu - max: '1.087e-01' - mean: '7.314e-07' - min: '-1.164e-01' - shape: - - 768 - - 768 - sum: '4.314e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.bias: - device: cpu - max: '6.786e-02' - mean: '-3.315e-04' - min: '-8.925e-02' - shape: - - 768 - sum: '-2.546e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight: - device: cpu - max: '4.607e-01' - mean: '-6.091e-06' - min: '-3.011e-01' - shape: - - 768 - - 768 - sum: '-3.592e+00' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.bias: - device: cpu - max: '4.213e-02' - mean: '-3.888e-05' - min: '-6.737e-02' - shape: - - 3072 - sum: '-1.195e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.weight: - device: cpu - max: '2.953e-01' - mean: '-5.795e-07' - min: '-2.323e-01' - shape: - - 3072 - - 768 - sum: '-1.367e+00' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.bias: - device: cpu - max: '5.003e-02' - mean: '-5.821e-11' - min: '-5.843e-02' - shape: - - 768 - sum: '-4.470e-08' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.weight: - device: cpu - max: '6.105e-01' - mean: '-2.627e-12' - min: '-5.125e-01' - shape: - - 768 - - 3072 - sum: '-6.199e-06' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias: - device: cpu - max: '6.435e-02' - mean: '-1.912e-04' - min: '-6.824e-02' - shape: - - 768 - sum: '-1.468e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight: - device: cpu - max: '5.071e-02' - mean: '-6.398e-04' - min: '-4.395e-01' - shape: - - 768 - sum: '-4.914e-01' -grads.network.albert.encoder.embedding_hidden_mapping_in.bias: - device: cpu - max: '7.07e-03' - mean: '-8.878e-05' - 
min: '-7.231e-03' - shape: - - 768 - sum: '-6.818e-02' -grads.network.albert.encoder.embedding_hidden_mapping_in.weight: - device: cpu - max: '8.686e-02' - mean: '2.216e-06' - min: '-8.327e-02' - shape: - - 768 - - 128 - sum: '2.178e-01' -grads.network.albert.pooler.bias: - device: cpu - max: '1.253e-02' - mean: '5.213e-05' - min: '-8.348e-03' - shape: - - 768 - sum: '4.004e-02' -grads.network.albert.pooler.weight: - device: cpu - max: '9.280e-02' - mean: '-9.552e-07' - min: '-6.335e-02' - shape: - - 768 - - 768 - sum: '-5.634e-01' -grads.network.classifier.bias: - device: cpu - max: '2.129e-01' - mean: '7.451e-09' - min: '-2.129e-01' - shape: - - 2 - sum: '1.490e-08' -grads.network.classifier.weight: - device: cpu - max: '2.222e-01' - mean: '-3.444e-10' - min: '-2.222e-01' - shape: - - 2 - - 768 - sum: '-5.29e-07' -outputs.labels: - device: cpu - max: 1 - mean: '7.188e-01' - min: 0 - shape: - - 32 - sum: 23 -outputs.loss: - device: cpu - max: '7.185e-01' - mean: '7.185e-01' - min: '7.185e-01' - shape: [] - sum: '7.185e-01' -outputs.preds: - device: cpu - max: 1 - mean: '4.688e-01' - min: 0 - shape: - - 32 - sum: 15 diff --git a/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml b/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml deleted file mode 100644 index f8eb4d0d..00000000 --- a/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml +++ /dev/null @@ -1,57 +0,0 @@ -input.attention_mask: - device: cpu - hash: -5248677368460617222 - max: 1 - mean: 0.1 - min: 0 - shape: - - 32 - - 128 - sum: 343 -input.input_ids: - device: cpu - hash: -8391087330217722819 - max: 26101 - mean: 159.7 - min: 0 - shape: - - 32 - - 128 - sum: 654306 -input.labels: - device: cpu - hash: -3945588999998408889 - max: 1 - mean: 0.7 - min: 0 - shape: - - 32 - sum: 23 -input.token_type_ids: - device: cpu - hash: -8123354182314851848 - max: 0 - mean: 0.0 - min: 0 - shape: - - 32 - - 128 - sum: 0 -out.logits: - device: cpu - hash: -3045239871714879068 - max: 0.6 - mean: 0.4 - min: 0.1 - shape: - - 32 - - 2 - sum: 26.8 -out.loss: - device: cpu - hash: 1287410195914297480 - max: 0.7 - mean: 0.7 - min: 0.7 - shape: [] - sum: 0.7 diff --git a/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cuda/albert_base_v2_hf_text_hf_example.yaml b/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cuda/albert_base_v2_hf_text_hf_example.yaml deleted file mode 100644 index 8e622121..00000000 --- a/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cuda/albert_base_v2_hf_text_hf_example.yaml +++ /dev/null @@ -1,51 +0,0 @@ -input.attention_mask: - device: cuda:0 - max: 1 - mean: '8.374e-02' - min: 0 - shape: - - 32 - - 128 - sum: 343 -input.input_ids: - device: cuda:0 - max: 26101 - mean: '1.597e+02' - min: 0 - shape: - - 32 - - 128 - sum: 654306 -input.labels: - device: cuda:0 - max: 1 - mean: '7.188e-01' - min: 0 - shape: - - 32 - sum: 23 -input.token_type_ids: - device: cuda:0 - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 -out.logits: - device: cuda:0 - max: '4.019e-02' - mean: '-1.58e-01' - min: '-4.991e-01' - shape: - - 32 - - 2 - sum: '-1.011e+01' -out.loss: - device: cuda:0 - max: '7.185e-01' - mean: '7.185e-01' - min: '7.185e-01' - shape: [] - sum: '7.185e-01' diff --git 
a/.regression_files/project/algorithms/hf_example_test/test_initialization_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml b/.regression_files/project/algorithms/hf_example_test/test_initialization_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml deleted file mode 100644 index 528e67c0..00000000 --- a/.regression_files/project/algorithms/hf_example_test/test_initialization_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml +++ /dev/null @@ -1,228 +0,0 @@ -network.albert.embeddings.LayerNorm.bias: - device: cpu - max: '2.53e+00' - mean: '-3.477e-02' - min: '-1.398e+00' - shape: - - 128 - sum: '-4.451e+00' -network.albert.embeddings.LayerNorm.weight: - device: cpu - max: '3.675e+00' - mean: '3.264e+00' - min: '1.297e+00' - shape: - - 128 - sum: '4.178e+02' -network.albert.embeddings.position_embeddings.weight: - device: cpu - max: '2.774e-01' - mean: '1.058e-04' - min: '-2.344e-01' - shape: - - 512 - - 128 - sum: '6.933e+00' -network.albert.embeddings.token_type_embeddings.weight: - device: cpu - max: '4.431e-02' - mean: '1.339e-04' - min: '-8.033e-02' - shape: - - 2 - - 128 - sum: '3.429e-02' -network.albert.embeddings.word_embeddings.weight: - device: cpu - max: '2.003e-01' - mean: '-5.478e-03' - min: '-1.946e-01' - shape: - - 30000 - - 128 - sum: '-2.104e+04' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.bias: - device: cpu - max: '2.411e+00' - mean: '-6.698e-03' - min: '-3.421e+00' - shape: - - 768 - sum: '-5.144e+00' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.weight: - device: cpu - max: '2.478e+00' - mean: '5.703e-01' - min: '3.535e-01' - shape: - - 768 - sum: '4.38e+02' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.bias: - device: cpu - max: '5.149e+00' - mean: '-3.476e-03' - min: '-8.748e+00' - shape: - - 768 - sum: '-2.669e+00' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight: - device: cpu - max: '7.227e-01' - mean: '1.840e-06' - min: '-5.057e-01' - shape: - - 768 - - 768 - sum: '1.085e+00' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias: - device: cpu - max: '1.643e+00' - mean: '1.291e-02' - min: '-1.689e+00' - shape: - - 768 - sum: '9.916e+00' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight: - device: cpu - max: '2.669e-01' - mean: '1.060e-04' - min: '-3.136e-01' - shape: - - 768 - - 768 - sum: '6.253e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias: - device: cpu - max: '4.806e+00' - mean: '6.103e-02' - min: '-4.117e+00' - shape: - - 768 - sum: '4.687e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight: - device: cpu - max: '3.613e-01' - mean: '-2.149e-05' - min: '-2.743e-01' - shape: - - 768 - - 768 - sum: '-1.268e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.bias: - device: cpu - max: '5.064e-01' - mean: '8.661e-04' - min: '-6.153e-01' - shape: - - 768 - sum: '6.652e-01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight: - device: cpu - max: '2.998e-01' - mean: '-9.619e-05' - min: '-2.962e-01' - shape: - - 768 - - 768 - sum: '-5.674e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.bias: - device: cpu - max: '5.147e-01' - mean: '-5.56e-01' - min: '-9.e+00' - shape: - - 3072 - sum: '-1.708e+03' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.weight: - 
device: cpu - max: '1.932e+00' - mean: '-1.609e-05' - min: '-1.779e+00' - shape: - - 3072 - - 768 - sum: '-3.796e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.bias: - device: cpu - max: '1.906e+00' - mean: '-1.445e-02' - min: '-1.471e+01' - shape: - - 768 - sum: '-1.11e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.weight: - device: cpu - max: '1.226e+00' - mean: '-1.576e-05' - min: '-2.475e+00' - shape: - - 768 - - 3072 - sum: '-3.717e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias: - device: cpu - max: '4.331e+00' - mean: '-4.060e-02' - min: '-7.592e-01' - shape: - - 768 - sum: '-3.118e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight: - device: cpu - max: '3.067e+00' - mean: '1.35e+00' - min: '2.373e-01' - shape: - - 768 - sum: '1.037e+03' -network.albert.encoder.embedding_hidden_mapping_in.bias: - device: cpu - max: '2.250e+00' - mean: '-2.328e-02' - min: '-2.484e+00' - shape: - - 768 - sum: '-1.788e+01' -network.albert.encoder.embedding_hidden_mapping_in.weight: - device: cpu - max: '2.709e-01' - mean: '3.868e-04' - min: '-2.624e-01' - shape: - - 768 - - 128 - sum: '3.802e+01' -network.albert.pooler.bias: - device: cpu - max: '1.409e+00' - mean: '5.837e-03' - min: '-1.279e+00' - shape: - - 768 - sum: '4.483e+00' -network.albert.pooler.weight: - device: cpu - max: '2.83e-01' - mean: '-2.292e-05' - min: '-2.817e-01' - shape: - - 768 - - 768 - sum: '-1.352e+01' -network.classifier.bias: - device: cpu - max: '0.e+00' - mean: '0.e+00' - min: '0.e+00' - shape: - - 2 - sum: '0.e+00' -network.classifier.weight: - device: cpu - max: '6.891e-02' - mean: '8.459e-05' - min: '-6.203e-02' - shape: - - 2 - - 768 - sum: '1.299e-01' diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_cifar10_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_cifar10_image_classifier.yaml index b4b3f47e..8e762f3f 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.126e+00' mean: '-6.179e-03' min: '-1.989e+00' @@ -10,7 +10,7 @@ batch.0: - 32 sum: '-2.43e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.0.1.bias: - device: cpu + device: cuda:0 max: '6.107e-03' mean: '1.775e-04' min: '-5.292e-03' @@ -26,7 +26,7 @@ grads.network.0.1.bias: - 128 sum: '2.272e-02' grads.network.0.1.weight: - device: cpu + device: cuda:0 max: '1.307e-02' mean: '4.693e-05' min: '-1.141e-02' @@ -35,7 +35,7 @@ grads.network.0.1.weight: - 3072 sum: '1.845e+01' grads.network.1.0.bias: - device: cpu + device: cuda:0 max: '1.041e-02' mean: '6.975e-04' min: '-8.782e-03' @@ -43,7 +43,7 @@ grads.network.1.0.bias: - 128 sum: '8.928e-02' grads.network.1.0.weight: - device: cpu + device: cuda:0 max: '1.584e-02' mean: '1.481e-04' min: '-1.507e-02' @@ 
-52,7 +52,7 @@ grads.network.1.0.weight: - 128 sum: '2.426e+00' grads.network.2.0.bias: - device: cpu + device: cuda:0 max: '3.282e-02' mean: '-1.956e-09' min: '-2.134e-02' @@ -60,16 +60,16 @@ grads.network.2.0.bias: - 10 sum: '-1.956e-08' grads.network.2.0.weight: - device: cpu + device: cuda:0 max: '2.200e-02' - mean: '-2.874e-10' + mean: '-2.561e-10' min: '-5.831e-02' shape: - 10 - 128 - sum: '-3.679e-07' + sum: '-3.278e-07' outputs.logits: - device: cpu + device: cuda:0 max: '7.036e-01' mean: '-8.651e-03' min: '-8.180e-01' @@ -78,14 +78,14 @@ outputs.logits: - 10 sum: '-1.107e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.316e+00' mean: '2.316e+00' min: '2.316e+00' shape: [] sum: '2.316e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_fashion_mnist_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_fashion_mnist_image_classifier.yaml index ee70a8f8..8be326eb 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_fashion_mnist_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.821e+00' mean: '4.822e-01' min: '-4.242e-01' @@ -10,7 +10,7 @@ batch.0: - 28 sum: '4.839e+04' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.0.1.bias: - device: cpu + device: cuda:0 max: '6.875e-03' mean: '2.096e-04' min: '-8.370e-03' @@ -26,7 +26,7 @@ grads.network.0.1.bias: - 128 sum: '2.683e-02' grads.network.0.1.weight: - device: cpu + device: cuda:0 max: '1.948e-02' mean: '2.916e-04' min: '-2.213e-02' @@ -35,7 +35,7 @@ grads.network.0.1.weight: - 784 sum: '2.926e+01' grads.network.1.0.bias: - device: cpu + device: cuda:0 max: '1.109e-02' mean: '2.213e-04' min: '-1.267e-02' @@ -43,7 +43,7 @@ grads.network.1.0.bias: - 128 sum: '2.832e-02' grads.network.1.0.weight: - device: cpu + device: cuda:0 max: '2.374e-02' mean: '9.326e-05' min: '-2.32e-02' @@ -52,7 +52,7 @@ grads.network.1.0.weight: - 128 sum: '1.528e+00' grads.network.2.0.bias: - device: cpu + device: cuda:0 max: '3.847e-02' mean: '-3.353e-09' min: '-4.706e-02' @@ -60,16 +60,16 @@ grads.network.2.0.bias: - 10 sum: '-3.353e-08' grads.network.2.0.weight: - device: cpu + device: cuda:0 max: '5.741e-02' - mean: '-4.195e-10' + mean: '-3.929e-10' min: '-6.431e-02' shape: - 10 - 128 - sum: '-5.369e-07' + sum: '-5.029e-07' outputs.logits: - device: cpu + device: cuda:0 max: '9.872e-01' mean: '-1.288e-02' min: '-7.225e-01' @@ -78,14 +78,14 @@ outputs.logits: - 10 sum: '-1.648e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.311e+00' mean: '2.311e+00' min: '2.311e+00' shape: [] sum: '2.311e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml 
b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_mnist_image_classifier.yaml similarity index 81% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_mnist_image_classifier.yaml index 90b624d9..232a8e50 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_mnist_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.821e+00' mean: '1.432e-02' min: '-4.242e-01' @@ -10,7 +10,7 @@ batch.0: - 28 sum: '1.437e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.242e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 543 grads.network.0.1.bias: - device: cpu + device: cuda:0 max: '1.075e-02' mean: '2.421e-04' min: '-7.844e-03' @@ -26,7 +26,7 @@ grads.network.0.1.bias: - 128 sum: '3.099e-02' grads.network.0.1.weight: - device: cpu + device: cuda:0 max: '2.006e-02' mean: '5.258e-05' min: '-1.844e-02' @@ -35,7 +35,7 @@ grads.network.0.1.weight: - 784 sum: '5.277e+00' grads.network.1.0.bias: - device: cpu + device: cuda:0 max: '1.169e-02' mean: '4.285e-04' min: '-1.152e-02' @@ -43,7 +43,7 @@ grads.network.1.0.bias: - 128 sum: '5.485e-02' grads.network.1.0.weight: - device: cpu + device: cuda:0 max: '1.753e-02' mean: '1.016e-04' min: '-2.219e-02' @@ -52,24 +52,24 @@ grads.network.1.0.weight: - 128 sum: '1.665e+00' grads.network.2.0.bias: - device: cpu + device: cuda:0 max: '3.969e-02' - mean: '-1.304e-09' + mean: '-1.490e-09' min: '-7.979e-02' shape: - 10 - sum: '-1.304e-08' + sum: '-1.490e-08' grads.network.2.0.weight: - device: cpu + device: cuda:0 max: '3.221e-02' - mean: '-1.306e-10' + mean: '-1.928e-10' min: '-6.755e-02' shape: - 10 - 128 - sum: '-1.672e-07' + sum: '-2.468e-07' outputs.logits: - device: cpu + device: cuda:0 max: '7.029e-01' mean: '-3.564e-02' min: '-7.781e-01' @@ -78,14 +78,14 @@ outputs.logits: - 10 sum: '-4.562e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.304e+00' mean: '2.304e+00' min: '2.304e+00' shape: [] sum: '2.304e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.242e+00' min: 0 diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_cifar10_image_classifier.yaml similarity index 86% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_cifar10_image_classifier.yaml index f9556c68..1ada67d1 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.126e+00' mean: '-6.179e-03' min: '-1.989e+00' @@ -10,7 +10,7 @@ batch.0: - 32 sum: '-2.43e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 
grads.network.bn1.bias: - device: cpu + device: cuda:0 max: '4.94e-02' mean: '3.131e-04' min: '-4.549e-02' @@ -26,7 +26,7 @@ grads.network.bn1.bias: - 64 sum: '2.004e-02' grads.network.bn1.weight: - device: cpu + device: cuda:0 max: '7.001e-02' mean: '1.024e-03' min: '-7.857e-02' @@ -34,7 +34,7 @@ grads.network.bn1.weight: - 64 sum: '6.554e-02' grads.network.conv1.weight: - device: cpu + device: cuda:0 max: '6.192e-01' mean: '1.341e-03' min: '-7.564e-01' @@ -45,7 +45,7 @@ grads.network.conv1.weight: - 7 sum: '1.261e+01' grads.network.fc.bias: - device: cpu + device: cuda:0 max: '8.718e-02' mean: '-2.235e-09' min: '-7.594e-02' @@ -53,16 +53,16 @@ grads.network.fc.bias: - 10 sum: '-2.235e-08' grads.network.fc.weight: - device: cpu + device: cuda:0 max: '1.526e-01' - mean: '-8.327e-10' + mean: '-7.902e-10' min: '-1.636e-01' shape: - 10 - 512 - sum: '-4.264e-06' + sum: '-4.046e-06' grads.network.layer1.0.bn1.bias: - device: cpu + device: cuda:0 max: '4.809e-02' mean: '-6.887e-05' min: '-4.261e-02' @@ -70,15 +70,15 @@ grads.network.layer1.0.bn1.bias: - 64 sum: '-4.407e-03' grads.network.layer1.0.bn1.weight: - device: cpu + device: cuda:0 max: '5.681e-02' - mean: '-2.846e-08' + mean: '-2.87e-08' min: '-6.472e-02' shape: - 64 - sum: '-1.822e-06' + sum: '-1.837e-06' grads.network.layer1.0.bn2.bias: - device: cpu + device: cuda:0 max: '2.823e-02' mean: '6.060e-04' min: '-3.829e-02' @@ -86,7 +86,7 @@ grads.network.layer1.0.bn2.bias: - 64 sum: '3.878e-02' grads.network.layer1.0.bn2.weight: - device: cpu + device: cuda:0 max: '4.298e-02' mean: '-1.402e-03' min: '-5.307e-02' @@ -94,7 +94,7 @@ grads.network.layer1.0.bn2.weight: - 64 sum: '-8.975e-02' grads.network.layer1.0.conv1.weight: - device: cpu + device: cuda:0 max: '1.152e-01' mean: '2.658e-05' min: '-1.006e-01' @@ -105,7 +105,7 @@ grads.network.layer1.0.conv1.weight: - 3 sum: '9.8e-01' grads.network.layer1.0.conv2.weight: - device: cpu + device: cuda:0 max: '7.023e-02' mean: '2.208e-04' min: '-8.426e-02' @@ -116,7 +116,7 @@ grads.network.layer1.0.conv2.weight: - 3 sum: '8.138e+00' grads.network.layer1.1.bn1.bias: - device: cpu + device: cuda:0 max: '5.121e-02' mean: '1.57e-05' min: '-3.888e-02' @@ -124,15 +124,15 @@ grads.network.layer1.1.bn1.bias: - 64 sum: '1.005e-03' grads.network.layer1.1.bn1.weight: - device: cpu + device: cuda:0 max: '3.775e-02' - mean: '4.249e-09' + mean: '4.075e-09' min: '-3.404e-02' shape: - 64 - sum: '2.719e-07' + sum: '2.608e-07' grads.network.layer1.1.bn2.bias: - device: cpu + device: cuda:0 max: '2.051e-02' mean: '1.167e-03' min: '-2.095e-02' @@ -140,7 +140,7 @@ grads.network.layer1.1.bn2.bias: - 64 sum: '7.466e-02' grads.network.layer1.1.bn2.weight: - device: cpu + device: cuda:0 max: '3.145e-02' mean: '3.783e-04' min: '-3.695e-02' @@ -148,7 +148,7 @@ grads.network.layer1.1.bn2.weight: - 64 sum: '2.421e-02' grads.network.layer1.1.conv1.weight: - device: cpu + device: cuda:0 max: '7.035e-02' mean: '-9.996e-04' min: '-7.167e-02' @@ -159,7 +159,7 @@ grads.network.layer1.1.conv1.weight: - 3 sum: '-3.685e+01' grads.network.layer1.1.conv2.weight: - device: cpu + device: cuda:0 max: '7.708e-02' mean: '3.07e-04' min: '-5.375e-02' @@ -170,7 +170,7 @@ grads.network.layer1.1.conv2.weight: - 3 sum: '1.132e+01' grads.network.layer2.0.bn1.bias: - device: cpu + device: cuda:0 max: '2.687e-02' mean: '5.859e-04' min: '-2.458e-02' @@ -178,7 +178,7 @@ grads.network.layer2.0.bn1.bias: - 128 sum: '7.500e-02' grads.network.layer2.0.bn1.weight: - device: cpu + device: cuda:0 max: '2.383e-02' mean: '-1.983e-08' min: '-3.218e-02' @@ -186,7 
+186,7 @@ grads.network.layer2.0.bn1.weight: - 128 sum: '-2.539e-06' grads.network.layer2.0.bn2.bias: - device: cpu + device: cuda:0 max: '1.778e-02' mean: '-7.097e-04' min: '-2.318e-02' @@ -194,7 +194,7 @@ grads.network.layer2.0.bn2.bias: - 128 sum: '-9.084e-02' grads.network.layer2.0.bn2.weight: - device: cpu + device: cuda:0 max: '2.506e-02' mean: '-1.001e-03' min: '-2.575e-02' @@ -202,7 +202,7 @@ grads.network.layer2.0.bn2.weight: - 128 sum: '-1.281e-01' grads.network.layer2.0.conv1.weight: - device: cpu + device: cuda:0 max: '7.148e-02' mean: '8.56e-04' min: '-6.533e-02' @@ -213,7 +213,7 @@ grads.network.layer2.0.conv1.weight: - 3 sum: '6.311e+01' grads.network.layer2.0.conv2.weight: - device: cpu + device: cuda:0 max: '4.581e-02' mean: '5.887e-06' min: '-4.373e-02' @@ -224,7 +224,7 @@ grads.network.layer2.0.conv2.weight: - 3 sum: '8.681e-01' grads.network.layer2.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '5.408e-02' mean: '6.587e-05' min: '-6.218e-02' @@ -235,7 +235,7 @@ grads.network.layer2.0.downsample.0.weight: - 1 sum: '5.396e-01' grads.network.layer2.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '1.778e-02' mean: '-7.097e-04' min: '-2.318e-02' @@ -243,7 +243,7 @@ grads.network.layer2.0.downsample.1.bias: - 128 sum: '-9.084e-02' grads.network.layer2.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '2.67e-02' mean: '7.026e-04' min: '-2.834e-02' @@ -251,7 +251,7 @@ grads.network.layer2.0.downsample.1.weight: - 128 sum: '8.994e-02' grads.network.layer2.1.bn1.bias: - device: cpu + device: cuda:0 max: '2.282e-02' mean: '4.179e-04' min: '-1.989e-02' @@ -259,15 +259,15 @@ grads.network.layer2.1.bn1.bias: - 128 sum: '5.349e-02' grads.network.layer2.1.bn1.weight: - device: cpu + device: cuda:0 max: '2.738e-02' - mean: '3.405e-09' + mean: '3.492e-09' min: '-2.028e-02' shape: - 128 - sum: '4.359e-07' + sum: '4.470e-07' grads.network.layer2.1.bn2.bias: - device: cpu + device: cuda:0 max: '1.634e-02' mean: '4.516e-04' min: '-1.524e-02' @@ -275,7 +275,7 @@ grads.network.layer2.1.bn2.bias: - 128 sum: '5.78e-02' grads.network.layer2.1.bn2.weight: - device: cpu + device: cuda:0 max: '2.251e-02' mean: '2.985e-04' min: '-2.765e-02' @@ -283,7 +283,7 @@ grads.network.layer2.1.bn2.weight: - 128 sum: '3.821e-02' grads.network.layer2.1.conv1.weight: - device: cpu + device: cuda:0 max: '4.786e-02' mean: '-1.842e-04' min: '-4.788e-02' @@ -294,7 +294,7 @@ grads.network.layer2.1.conv1.weight: - 3 sum: '-2.716e+01' grads.network.layer2.1.conv2.weight: - device: cpu + device: cuda:0 max: '3.281e-02' mean: '-1.638e-05' min: '-3.597e-02' @@ -305,7 +305,7 @@ grads.network.layer2.1.conv2.weight: - 3 sum: '-2.415e+00' grads.network.layer3.0.bn1.bias: - device: cpu + device: cuda:0 max: '1.373e-02' mean: '-1.949e-05' min: '-1.339e-02' @@ -313,15 +313,15 @@ grads.network.layer3.0.bn1.bias: - 256 sum: '-4.989e-03' grads.network.layer3.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.651e-02' - mean: '-1.781e-08' + mean: '-1.778e-08' min: '-1.433e-02' shape: - 256 - sum: '-4.56e-06' + sum: '-4.552e-06' grads.network.layer3.0.bn2.bias: - device: cpu + device: cuda:0 max: '1.342e-02' mean: '-1.425e-04' min: '-1.272e-02' @@ -329,7 +329,7 @@ grads.network.layer3.0.bn2.bias: - 256 sum: '-3.647e-02' grads.network.layer3.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.591e-02' mean: '-4.350e-04' min: '-1.678e-02' @@ -337,7 +337,7 @@ grads.network.layer3.0.bn2.weight: - 256 sum: '-1.114e-01' grads.network.layer3.0.conv1.weight: - device: cpu + device: cuda:0 max: '3.91e-02' mean: 
'1.103e-04' min: '-3.65e-02' @@ -348,7 +348,7 @@ grads.network.layer3.0.conv1.weight: - 3 sum: '3.254e+01' grads.network.layer3.0.conv2.weight: - device: cpu + device: cuda:0 max: '2.947e-02' mean: '-2.338e-05' min: '-3.166e-02' @@ -359,7 +359,7 @@ grads.network.layer3.0.conv2.weight: - 3 sum: '-1.379e+01' grads.network.layer3.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '3.125e-02' mean: '-1.221e-06' min: '-2.705e-02' @@ -370,7 +370,7 @@ grads.network.layer3.0.downsample.0.weight: - 1 sum: '-4.002e-02' grads.network.layer3.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '1.342e-02' mean: '-1.425e-04' min: '-1.272e-02' @@ -378,7 +378,7 @@ grads.network.layer3.0.downsample.1.bias: - 256 sum: '-3.647e-02' grads.network.layer3.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.214e-02' mean: '5.825e-05' min: '-1.422e-02' @@ -386,7 +386,7 @@ grads.network.layer3.0.downsample.1.weight: - 256 sum: '1.491e-02' grads.network.layer3.1.bn1.bias: - device: cpu + device: cuda:0 max: '1.198e-02' mean: '1.985e-04' min: '-9.063e-03' @@ -394,15 +394,15 @@ grads.network.layer3.1.bn1.bias: - 256 sum: '5.082e-02' grads.network.layer3.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.364e-02' - mean: '1.122e-08' + mean: '1.119e-08' min: '-1.406e-02' shape: - 256 - sum: '2.874e-06' + sum: '2.865e-06' grads.network.layer3.1.bn2.bias: - device: cpu + device: cuda:0 max: '6.948e-03' mean: '1.387e-04' min: '-6.29e-03' @@ -410,7 +410,7 @@ grads.network.layer3.1.bn2.bias: - 256 sum: '3.551e-02' grads.network.layer3.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.099e-02' mean: '3.768e-04' min: '-1.145e-02' @@ -418,7 +418,7 @@ grads.network.layer3.1.bn2.weight: - 256 sum: '9.646e-02' grads.network.layer3.1.conv1.weight: - device: cpu + device: cuda:0 max: '2.413e-02' mean: '-6.619e-06' min: '-2.651e-02' @@ -429,7 +429,7 @@ grads.network.layer3.1.conv1.weight: - 3 sum: '-3.904e+00' grads.network.layer3.1.conv2.weight: - device: cpu + device: cuda:0 max: '2.347e-02' mean: '-3.211e-05' min: '-2.596e-02' @@ -440,7 +440,7 @@ grads.network.layer3.1.conv2.weight: - 3 sum: '-1.894e+01' grads.network.layer4.0.bn1.bias: - device: cpu + device: cuda:0 max: '6.987e-03' mean: '-5.95e-06' min: '-6.451e-03' @@ -448,7 +448,7 @@ grads.network.layer4.0.bn1.bias: - 512 sum: '-3.046e-03' grads.network.layer4.0.bn1.weight: - device: cpu + device: cuda:0 max: '8.782e-03' mean: '5.227e-08' min: '-8.326e-03' @@ -456,7 +456,7 @@ grads.network.layer4.0.bn1.weight: - 512 sum: '2.676e-05' grads.network.layer4.0.bn2.bias: - device: cpu + device: cuda:0 max: '7.944e-03' mean: '4.654e-04' min: '-5.159e-03' @@ -464,7 +464,7 @@ grads.network.layer4.0.bn2.bias: - 512 sum: '2.383e-01' grads.network.layer4.0.bn2.weight: - device: cpu + device: cuda:0 max: '7.365e-03' mean: '3.815e-04' min: '-7.759e-03' @@ -472,7 +472,7 @@ grads.network.layer4.0.bn2.weight: - 512 sum: '1.953e-01' grads.network.layer4.0.conv1.weight: - device: cpu + device: cuda:0 max: '3.395e-02' mean: '1.298e-05' min: '-3.451e-02' @@ -483,7 +483,7 @@ grads.network.layer4.0.conv1.weight: - 3 sum: '1.531e+01' grads.network.layer4.0.conv2.weight: - device: cpu + device: cuda:0 max: '2.825e-02' mean: '-1.254e-06' min: '-2.923e-02' @@ -494,7 +494,7 @@ grads.network.layer4.0.conv2.weight: - 3 sum: '-2.96e+00' grads.network.layer4.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '1.519e-02' mean: '2.644e-06' min: '-1.993e-02' @@ -505,7 +505,7 @@ grads.network.layer4.0.downsample.0.weight: - 1 sum: '3.466e-01' 
grads.network.layer4.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '7.944e-03' mean: '4.654e-04' min: '-5.159e-03' @@ -513,7 +513,7 @@ grads.network.layer4.0.downsample.1.bias: - 512 sum: '2.383e-01' grads.network.layer4.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '6.664e-03' mean: '3.273e-04' min: '-6.98e-03' @@ -521,7 +521,7 @@ grads.network.layer4.0.downsample.1.weight: - 512 sum: '1.676e-01' grads.network.layer4.1.bn1.bias: - device: cpu + device: cuda:0 max: '5.407e-03' mean: '9.024e-05' min: '-4.404e-03' @@ -529,15 +529,15 @@ grads.network.layer4.1.bn1.bias: - 512 sum: '4.620e-02' grads.network.layer4.1.bn1.weight: - device: cpu + device: cuda:0 max: '5.791e-03' - mean: '4.915e-08' + mean: '4.913e-08' min: '-5.188e-03' shape: - 512 - sum: '2.516e-05' + sum: '2.515e-05' grads.network.layer4.1.bn2.bias: - device: cpu + device: cuda:0 max: '8.746e-03' mean: '4.971e-04' min: '-9.116e-03' @@ -545,7 +545,7 @@ grads.network.layer4.1.bn2.bias: - 512 sum: '2.545e-01' grads.network.layer4.1.bn2.weight: - device: cpu + device: cuda:0 max: '6.717e-03' mean: '3.269e-04' min: '-5.782e-03' @@ -553,7 +553,7 @@ grads.network.layer4.1.bn2.weight: - 512 sum: '1.674e-01' grads.network.layer4.1.conv1.weight: - device: cpu + device: cuda:0 max: '2.951e-02' mean: '-5.57e-06' min: '-3.434e-02' @@ -564,7 +564,7 @@ grads.network.layer4.1.conv1.weight: - 3 sum: '-1.314e+01' grads.network.layer4.1.conv2.weight: - device: cpu + device: cuda:0 max: '2.492e-02' mean: '-1.259e-06' min: '-2.262e-02' @@ -575,7 +575,7 @@ grads.network.layer4.1.conv2.weight: - 3 sum: '-2.971e+00' outputs.logits: - device: cpu + device: cuda:0 max: '2.728e+00' mean: '8.106e-02' min: '-2.536e+00' @@ -584,14 +584,14 @@ outputs.logits: - 10 sum: '1.038e+02' outputs.loss: - device: cpu + device: cuda:0 max: '2.593e+00' mean: '2.593e+00' min: '2.593e+00' shape: [] sum: '2.593e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_imagenet_image_classifier.yaml new file mode 100644 index 00000000..938d81f2 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_imagenet_image_classifier.yaml @@ -0,0 +1,600 @@ +batch.0: + device: cuda:0 + max: '2.640e+00' + mean: '-6.663e-02' + min: '-2.118e+00' + shape: + - 64 + - 3 + - 224 + - 224 + sum: '-6.419e+05' +batch.1: + device: cuda:0 + max: 988 + mean: '5.182e+02' + min: 0 + shape: + - 64 + sum: 33166 +grads.network.bn1.bias: + device: cuda:0 + max: '1.433e-02' + mean: '1.035e-03' + min: '-1.257e-02' + shape: + - 64 + sum: '6.621e-02' +grads.network.bn1.weight: + device: cuda:0 + max: '1.866e-02' + mean: '9.764e-05' + min: '-2.028e-02' + shape: + - 64 + sum: '6.249e-03' +grads.network.conv1.weight: + device: cuda:0 + max: '1.798e-01' + mean: '6.264e-03' + min: '-1.354e-01' + shape: + - 64 + - 3 + - 7 + - 7 + sum: '5.893e+01' +grads.network.fc.bias: + device: cuda:0 + max: '3.523e-03' + mean: '2.235e-11' + min: '-3.062e-02' + shape: + - 1000 + sum: '2.235e-08' +grads.network.fc.weight: + device: cuda:0 + max: '4.594e-03' + mean: '1.490e-11' + min: '-8.777e-02' + shape: + - 1000 + - 512 + sum: '7.629e-06' +grads.network.layer1.0.bn1.bias: + device: cuda:0 + max: '1.035e-02' + mean: '-8.887e-05' + min: '-1.081e-02' + shape: + - 64 + sum: 
'-5.688e-03' +grads.network.layer1.0.bn1.weight: + device: cuda:0 + max: '1.322e-02' + mean: '3.085e-09' + min: '-1.446e-02' + shape: + - 64 + sum: '1.974e-07' +grads.network.layer1.0.bn2.bias: + device: cuda:0 + max: '5.771e-03' + mean: '2.727e-04' + min: '-8.209e-03' + shape: + - 64 + sum: '1.745e-02' +grads.network.layer1.0.bn2.weight: + device: cuda:0 + max: '9.735e-03' + mean: '3.428e-05' + min: '-7.881e-03' + shape: + - 64 + sum: '2.194e-03' +grads.network.layer1.0.conv1.weight: + device: cuda:0 + max: '3.228e-02' + mean: '-2.187e-04' + min: '-3.009e-02' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-8.063e+00' +grads.network.layer1.0.conv2.weight: + device: cuda:0 + max: '2.011e-02' + mean: '-8.082e-05' + min: '-2.321e-02' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-2.979e+00' +grads.network.layer1.1.bn1.bias: + device: cuda:0 + max: '8.757e-03' + mean: '3.335e-04' + min: '-8.009e-03' + shape: + - 64 + sum: '2.134e-02' +grads.network.layer1.1.bn1.weight: + device: cuda:0 + max: '1.031e-02' + mean: '-1.251e-09' + min: '-8.325e-03' + shape: + - 64 + sum: '-8.009e-08' +grads.network.layer1.1.bn2.bias: + device: cuda:0 + max: '3.688e-03' + mean: '-1.159e-04' + min: '-3.878e-03' + shape: + - 64 + sum: '-7.419e-03' +grads.network.layer1.1.bn2.weight: + device: cuda:0 + max: '7.533e-03' + mean: '-1.319e-04' + min: '-1.042e-02' + shape: + - 64 + sum: '-8.443e-03' +grads.network.layer1.1.conv1.weight: + device: cuda:0 + max: '1.682e-02' + mean: '7.859e-05' + min: '-1.756e-02' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '2.897e+00' +grads.network.layer1.1.conv2.weight: + device: cuda:0 + max: '1.164e-02' + mean: '-8.183e-05' + min: '-1.057e-02' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-3.017e+00' +grads.network.layer2.0.bn1.bias: + device: cuda:0 + max: '6.346e-03' + mean: '3.467e-04' + min: '-5.223e-03' + shape: + - 128 + sum: '4.438e-02' +grads.network.layer2.0.bn1.weight: + device: cuda:0 + max: '4.709e-03' + mean: '8.731e-11' + min: '-5.212e-03' + shape: + - 128 + sum: '1.118e-08' +grads.network.layer2.0.bn2.bias: + device: cuda:0 + max: '4.109e-03' + mean: '1.036e-04' + min: '-5.165e-03' + shape: + - 128 + sum: '1.326e-02' +grads.network.layer2.0.bn2.weight: + device: cuda:0 + max: '7.476e-03' + mean: '-1.799e-05' + min: '-5.677e-03' + shape: + - 128 + sum: '-2.302e-03' +grads.network.layer2.0.conv1.weight: + device: cuda:0 + max: '1.684e-02' + mean: '-1.249e-04' + min: '-1.531e-02' + shape: + - 128 + - 64 + - 3 + - 3 + sum: '-9.211e+00' +grads.network.layer2.0.conv2.weight: + device: cuda:0 + max: '9.979e-03' + mean: '-4.225e-05' + min: '-9.486e-03' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-6.229e+00' +grads.network.layer2.0.downsample.0.weight: + device: cuda:0 + max: '1.095e-02' + mean: '-1.596e-04' + min: '-1.44e-02' + shape: + - 128 + - 64 + - 1 + - 1 + sum: '-1.307e+00' +grads.network.layer2.0.downsample.1.bias: + device: cuda:0 + max: '4.109e-03' + mean: '1.036e-04' + min: '-5.165e-03' + shape: + - 128 + sum: '1.326e-02' +grads.network.layer2.0.downsample.1.weight: + device: cuda:0 + max: '5.643e-03' + mean: '-9.116e-05' + min: '-5.724e-03' + shape: + - 128 + sum: '-1.167e-02' +grads.network.layer2.1.bn1.bias: + device: cuda:0 + max: '3.875e-03' + mean: '2.269e-04' + min: '-3.296e-03' + shape: + - 128 + sum: '2.904e-02' +grads.network.layer2.1.bn1.weight: + device: cuda:0 + max: '3.931e-03' + mean: '1.222e-09' + min: '-5.433e-03' + shape: + - 128 + sum: '1.565e-07' +grads.network.layer2.1.bn2.bias: + device: cuda:0 + max: '3.029e-03' + mean: '1.229e-04' + min: '-2.608e-03' + 
shape: + - 128 + sum: '1.574e-02' +grads.network.layer2.1.bn2.weight: + device: cuda:0 + max: '4.324e-03' + mean: '1.091e-04' + min: '-4.632e-03' + shape: + - 128 + sum: '1.397e-02' +grads.network.layer2.1.conv1.weight: + device: cuda:0 + max: '8.457e-03' + mean: '-2.224e-05' + min: '-8.334e-03' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-3.279e+00' +grads.network.layer2.1.conv2.weight: + device: cuda:0 + max: '6.936e-03' + mean: '-2.779e-05' + min: '-6.811e-03' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-4.098e+00' +grads.network.layer3.0.bn1.bias: + device: cuda:0 + max: '2.770e-03' + mean: '5.8e-05' + min: '-3.176e-03' + shape: + - 256 + sum: '1.485e-02' +grads.network.layer3.0.bn1.weight: + device: cuda:0 + max: '4.501e-03' + mean: '-1.965e-09' + min: '-3.247e-03' + shape: + - 256 + sum: '-5.029e-07' +grads.network.layer3.0.bn2.bias: + device: cuda:0 + max: '2.85e-03' + mean: '2.536e-05' + min: '-3.149e-03' + shape: + - 256 + sum: '6.493e-03' +grads.network.layer3.0.bn2.weight: + device: cuda:0 + max: '3.689e-03' + mean: '-1.113e-04' + min: '-3.318e-03' + shape: + - 256 + sum: '-2.850e-02' +grads.network.layer3.0.conv1.weight: + device: cuda:0 + max: '8.373e-03' + mean: '1.589e-06' + min: '-8.216e-03' + shape: + - 256 + - 128 + - 3 + - 3 + sum: '4.685e-01' +grads.network.layer3.0.conv2.weight: + device: cuda:0 + max: '7.279e-03' + mean: '3.597e-07' + min: '-6.876e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '2.122e-01' +grads.network.layer3.0.downsample.0.weight: + device: cuda:0 + max: '7.642e-03' + mean: '7.352e-06' + min: '-6.323e-03' + shape: + - 256 + - 128 + - 1 + - 1 + sum: '2.409e-01' +grads.network.layer3.0.downsample.1.bias: + device: cuda:0 + max: '2.85e-03' + mean: '2.536e-05' + min: '-3.149e-03' + shape: + - 256 + sum: '6.493e-03' +grads.network.layer3.0.downsample.1.weight: + device: cuda:0 + max: '3.721e-03' + mean: '1.250e-04' + min: '-3.504e-03' + shape: + - 256 + sum: '3.201e-02' +grads.network.layer3.1.bn1.bias: + device: cuda:0 + max: '2.634e-03' + mean: '3.564e-05' + min: '-2.17e-03' + shape: + - 256 + sum: '9.124e-03' +grads.network.layer3.1.bn1.weight: + device: cuda:0 + max: '2.518e-03' + mean: '1.983e-10' + min: '-2.539e-03' + shape: + - 256 + sum: '5.076e-08' +grads.network.layer3.1.bn2.bias: + device: cuda:0 + max: '2.024e-03' + mean: '6.733e-05' + min: '-1.777e-03' + shape: + - 256 + sum: '1.724e-02' +grads.network.layer3.1.bn2.weight: + device: cuda:0 + max: '2.737e-03' + mean: '-1.37e-05' + min: '-2.669e-03' + shape: + - 256 + sum: '-3.507e-03' +grads.network.layer3.1.conv1.weight: + device: cuda:0 + max: '5.457e-03' + mean: '-1.498e-06' + min: '-5.48e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-8.836e-01' +grads.network.layer3.1.conv2.weight: + device: cuda:0 + max: '4.436e-03' + mean: '7.578e-07' + min: '-4.453e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '4.469e-01' +grads.network.layer4.0.bn1.bias: + device: cuda:0 + max: '1.529e-03' + mean: '4.731e-05' + min: '-1.600e-03' + shape: + - 512 + sum: '2.422e-02' +grads.network.layer4.0.bn1.weight: + device: cuda:0 + max: '2.836e-03' + mean: '3.382e-09' + min: '-1.948e-03' + shape: + - 512 + sum: '1.731e-06' +grads.network.layer4.0.bn2.bias: + device: cuda:0 + max: '4.572e-03' + mean: '2.561e-04' + min: '-3.552e-03' + shape: + - 512 + sum: '1.311e-01' +grads.network.layer4.0.bn2.weight: + device: cuda:0 + max: '4.103e-03' + mean: '2.118e-04' + min: '-2.870e-03' + shape: + - 512 + sum: '1.084e-01' +grads.network.layer4.0.conv1.weight: + device: cuda:0 + max: '5.52e-03' + mean: '-1.319e-05' 
+ min: '-5.398e-03' + shape: + - 512 + - 256 + - 3 + - 3 + sum: '-1.556e+01' +grads.network.layer4.0.conv2.weight: + device: cuda:0 + max: '3.6e-03' + mean: '-4.087e-06' + min: '-4.384e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-9.643e+00' +grads.network.layer4.0.downsample.0.weight: + device: cuda:0 + max: '4.390e-03' + mean: '-2.207e-06' + min: '-5.205e-03' + shape: + - 512 + - 256 + - 1 + - 1 + sum: '-2.893e-01' +grads.network.layer4.0.downsample.1.bias: + device: cuda:0 + max: '4.572e-03' + mean: '2.561e-04' + min: '-3.552e-03' + shape: + - 512 + sum: '1.311e-01' +grads.network.layer4.0.downsample.1.weight: + device: cuda:0 + max: '3.626e-03' + mean: '1.351e-04' + min: '-3.259e-03' + shape: + - 512 + sum: '6.917e-02' +grads.network.layer4.1.bn1.bias: + device: cuda:0 + max: '1.327e-03' + mean: '1.918e-05' + min: '-1.29e-03' + shape: + - 512 + sum: '9.818e-03' +grads.network.layer4.1.bn1.weight: + device: cuda:0 + max: '2.764e-03' + mean: '3.335e-09' + min: '-2.679e-03' + shape: + - 512 + sum: '1.707e-06' +grads.network.layer4.1.bn2.bias: + device: cuda:0 + max: '7.656e-03' + mean: '4.169e-04' + min: '-5.189e-03' + shape: + - 512 + sum: '2.134e-01' +grads.network.layer4.1.bn2.weight: + device: cuda:0 + max: '3.609e-03' + mean: '2.029e-04' + min: '-3.125e-03' + shape: + - 512 + sum: '1.039e-01' +grads.network.layer4.1.conv1.weight: + device: cuda:0 + max: '4.400e-03' + mean: '-9.705e-06' + min: '-3.475e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-2.29e+01' +grads.network.layer4.1.conv2.weight: + device: cuda:0 + max: '3.91e-03' + mean: '1.074e-05' + min: '-2.999e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '2.535e+01' +outputs.logits: + device: cuda:0 + max: '2.934e+00' + mean: '-8.071e-04' + min: '-2.896e+00' + shape: + - 64 + - 1000 + sum: '-5.165e+01' +outputs.loss: + device: cuda:0 + max: '7.073e+00' + mean: '7.073e+00' + min: '7.073e+00' + shape: [] + sum: '7.073e+00' +outputs.y: + device: cuda:0 + max: 988 + mean: '5.182e+02' + min: 0 + shape: + - 64 + sum: 33166 diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_cifar10_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_cifar10_image_classifier.yaml index fb60cb5a..3fafcadf 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.126e+00' mean: '-6.179e-03' min: '-1.989e+00' @@ -10,7 +10,7 @@ batch.0: - 32 sum: '-2.43e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.bn1.bias: - device: cpu + device: cuda:0 max: '9.205e-01' mean: '4.814e-02' min: '-1.080e+00' @@ -26,15 +26,15 @@ grads.network.bn1.bias: - 64 sum: '3.081e+00' grads.network.bn1.weight: - device: cpu + device: cuda:0 max: '1.441e+00' - mean: '3.663e-06' + mean: '3.662e-06' min: '-1.737e+00' shape: - 64 sum: '2.344e-04' grads.network.conv1.weight: - device: cpu + device: cuda:0 max: 
'1.895e+01' mean: '-8.353e-03' min: '-1.422e+01' @@ -45,24 +45,24 @@ grads.network.conv1.weight: - 7 sum: '-7.858e+01' grads.network.fc.bias: - device: cpu + device: cuda:0 max: '1.341e-01' - mean: '7.451e-10' + mean: '1.490e-09' min: '-6.681e-02' shape: - 10 - sum: '7.451e-09' + sum: '1.490e-08' grads.network.fc.weight: - device: cpu + device: cuda:0 max: '3.777e-01' - mean: '6.054e-10' + mean: '5.101e-10' min: '-2.029e-01' shape: - 10 - 2048 - sum: '1.24e-05' + sum: '1.045e-05' grads.network.layer1.0.bn1.bias: - device: cpu + device: cuda:0 max: '8.082e-01' mean: '1.893e-02' min: '-8.557e-01' @@ -70,15 +70,15 @@ grads.network.layer1.0.bn1.bias: - 64 sum: '1.211e+00' grads.network.layer1.0.bn1.weight: - device: cpu + device: cuda:0 max: '7.796e-01' - mean: '-1.29e-07' + mean: '-1.248e-07' min: '-9.923e-01' shape: - 64 - sum: '-8.255e-06' + sum: '-7.987e-06' grads.network.layer1.0.bn2.bias: - device: cpu + device: cuda:0 max: '6.138e-01' mean: '-3.147e-02' min: '-7.454e-01' @@ -86,15 +86,15 @@ grads.network.layer1.0.bn2.bias: - 64 sum: '-2.014e+00' grads.network.layer1.0.bn2.weight: - device: cpu + device: cuda:0 max: '8.566e-01' - mean: '-4.082e-06' + mean: '-4.075e-06' min: '-8.725e-01' shape: - 64 - sum: '-2.613e-04' + sum: '-2.608e-04' grads.network.layer1.0.bn3.bias: - device: cpu + device: cuda:0 max: '4.064e-01' mean: '-1.042e-04' min: '-4.231e-01' @@ -102,7 +102,7 @@ grads.network.layer1.0.bn3.bias: - 256 sum: '-2.667e-02' grads.network.layer1.0.bn3.weight: - device: cpu + device: cuda:0 max: '5.445e-01' mean: '-1.607e-02' min: '-5.301e-01' @@ -110,7 +110,7 @@ grads.network.layer1.0.bn3.weight: - 256 sum: '-4.115e+00' grads.network.layer1.0.conv1.weight: - device: cpu + device: cuda:0 max: '1.995e+00' mean: '5.037e-03' min: '-2.531e+00' @@ -121,7 +121,7 @@ grads.network.layer1.0.conv1.weight: - 1 sum: '2.063e+01' grads.network.layer1.0.conv2.weight: - device: cpu + device: cuda:0 max: '1.94e+00' mean: '9.205e-03' min: '-1.562e+00' @@ -132,7 +132,7 @@ grads.network.layer1.0.conv2.weight: - 3 sum: '3.393e+02' grads.network.layer1.0.conv3.weight: - device: cpu + device: cuda:0 max: '1.516e+00' mean: '1.730e-03' min: '-1.296e+00' @@ -143,7 +143,7 @@ grads.network.layer1.0.conv3.weight: - 1 sum: '2.835e+01' grads.network.layer1.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '1.394e+00' mean: '6.997e-03' min: '-1.394e+00' @@ -154,7 +154,7 @@ grads.network.layer1.0.downsample.0.weight: - 1 sum: '1.146e+02' grads.network.layer1.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '4.064e-01' mean: '-1.042e-04' min: '-4.231e-01' @@ -162,7 +162,7 @@ grads.network.layer1.0.downsample.1.bias: - 256 sum: '-2.667e-02' grads.network.layer1.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '7.517e-01' mean: '1.179e-02' min: '-4.804e-01' @@ -170,7 +170,7 @@ grads.network.layer1.0.downsample.1.weight: - 256 sum: '3.017e+00' grads.network.layer1.1.bn1.bias: - device: cpu + device: cuda:0 max: '5.352e-01' mean: '-5.139e-03' min: '-6.301e-01' @@ -178,15 +178,15 @@ grads.network.layer1.1.bn1.bias: - 64 sum: '-3.289e-01' grads.network.layer1.1.bn1.weight: - device: cpu + device: cuda:0 max: '7.305e-01' - mean: '-1.327e-07' + mean: '-1.322e-07' min: '-6.086e-01' shape: - 64 - sum: '-8.494e-06' + sum: '-8.464e-06' grads.network.layer1.1.bn2.bias: - device: cpu + device: cuda:0 max: '6.326e-01' mean: '-2.056e-03' min: '-4.814e-01' @@ -194,15 +194,15 @@ grads.network.layer1.1.bn2.bias: - 64 sum: '-1.316e-01' grads.network.layer1.1.bn2.weight: - device: cpu + device: cuda:0 max: 
'7.657e-01' - mean: '2.468e-08' + mean: '2.328e-08' min: '-5.989e-01' shape: - 64 - sum: '1.58e-06' + sum: '1.490e-06' grads.network.layer1.1.bn3.bias: - device: cpu + device: cuda:0 max: '2.399e-01' mean: '5.205e-03' min: '-1.858e-01' @@ -210,7 +210,7 @@ grads.network.layer1.1.bn3.bias: - 256 sum: '1.333e+00' grads.network.layer1.1.bn3.weight: - device: cpu + device: cuda:0 max: '3.889e-01' mean: '2.229e-03' min: '-3.122e-01' @@ -218,7 +218,7 @@ grads.network.layer1.1.bn3.weight: - 256 sum: '5.706e-01' grads.network.layer1.1.conv1.weight: - device: cpu + device: cuda:0 max: '6.541e-01' mean: '6.722e-04' min: '-6.24e-01' @@ -229,7 +229,7 @@ grads.network.layer1.1.conv1.weight: - 1 sum: '1.101e+01' grads.network.layer1.1.conv2.weight: - device: cpu + device: cuda:0 max: '1.279e+00' mean: '6.102e-03' min: '-1.024e+00' @@ -240,7 +240,7 @@ grads.network.layer1.1.conv2.weight: - 3 sum: '2.249e+02' grads.network.layer1.1.conv3.weight: - device: cpu + device: cuda:0 max: '9.491e-01' mean: '2.511e-03' min: '-9.537e-01' @@ -251,7 +251,7 @@ grads.network.layer1.1.conv3.weight: - 1 sum: '4.114e+01' grads.network.layer1.2.bn1.bias: - device: cpu + device: cuda:0 max: '4.21e-01' mean: '-1.548e-02' min: '-4.326e-01' @@ -259,7 +259,7 @@ grads.network.layer1.2.bn1.bias: - 64 sum: '-9.907e-01' grads.network.layer1.2.bn1.weight: - device: cpu + device: cuda:0 max: '5.188e-01' mean: '1.397e-08' min: '-3.354e-01' @@ -267,7 +267,7 @@ grads.network.layer1.2.bn1.weight: - 64 sum: '8.941e-07' grads.network.layer1.2.bn2.bias: - device: cpu + device: cuda:0 max: '4.175e-01' mean: '-7.536e-03' min: '-3.544e-01' @@ -275,15 +275,15 @@ grads.network.layer1.2.bn2.bias: - 64 sum: '-4.823e-01' grads.network.layer1.2.bn2.weight: - device: cpu + device: cuda:0 max: '2.97e-01' - mean: '5.030e-07' + mean: '5.048e-07' min: '-3.822e-01' shape: - 64 - sum: '3.219e-05' + sum: '3.231e-05' grads.network.layer1.2.bn3.bias: - device: cpu + device: cuda:0 max: '1.238e-01' mean: '2.877e-03' min: '-1.060e-01' @@ -291,7 +291,7 @@ grads.network.layer1.2.bn3.bias: - 256 sum: '7.366e-01' grads.network.layer1.2.bn3.weight: - device: cpu + device: cuda:0 max: '2.316e-01' mean: '2.059e-03' min: '-2.506e-01' @@ -299,7 +299,7 @@ grads.network.layer1.2.bn3.weight: - 256 sum: '5.272e-01' grads.network.layer1.2.conv1.weight: - device: cpu + device: cuda:0 max: '3.633e-01' mean: '3.658e-03' min: '-4.331e-01' @@ -310,7 +310,7 @@ grads.network.layer1.2.conv1.weight: - 1 sum: '5.993e+01' grads.network.layer1.2.conv2.weight: - device: cpu + device: cuda:0 max: '6.992e-01' mean: '2.97e-03' min: '-7.175e-01' @@ -321,7 +321,7 @@ grads.network.layer1.2.conv2.weight: - 3 sum: '1.095e+02' grads.network.layer1.2.conv3.weight: - device: cpu + device: cuda:0 max: '5.388e-01' mean: '-1.901e-04' min: '-6.321e-01' @@ -332,7 +332,7 @@ grads.network.layer1.2.conv3.weight: - 1 sum: '-3.115e+00' grads.network.layer2.0.bn1.bias: - device: cpu + device: cuda:0 max: '2.419e-01' mean: '-5.441e-03' min: '-2.731e-01' @@ -340,15 +340,15 @@ grads.network.layer2.0.bn1.bias: - 128 sum: '-6.964e-01' grads.network.layer2.0.bn1.weight: - device: cpu + device: cuda:0 max: '3.249e-01' - mean: '2.375e-08' + mean: '2.258e-08' min: '-2.792e-01' shape: - 128 - sum: '3.04e-06' + sum: '2.891e-06' grads.network.layer2.0.bn2.bias: - device: cpu + device: cuda:0 max: '1.974e-01' mean: '-7.017e-03' min: '-2.037e-01' @@ -356,15 +356,15 @@ grads.network.layer2.0.bn2.bias: - 128 sum: '-8.981e-01' grads.network.layer2.0.bn2.weight: - device: cpu + device: cuda:0 max: '3.613e-01' - mean: 
'6.624e-08' + mean: '6.775e-08' min: '-2.713e-01' shape: - 128 - sum: '8.479e-06' + sum: '8.672e-06' grads.network.layer2.0.bn3.bias: - device: cpu + device: cuda:0 max: '1.091e-01' mean: '6.263e-04' min: '-1.059e-01' @@ -372,7 +372,7 @@ grads.network.layer2.0.bn3.bias: - 512 sum: '3.207e-01' grads.network.layer2.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.658e-01' mean: '-1.899e-04' min: '-1.353e-01' @@ -380,7 +380,7 @@ grads.network.layer2.0.bn3.weight: - 512 sum: '-9.725e-02' grads.network.layer2.0.conv1.weight: - device: cpu + device: cuda:0 max: '3.953e-01' mean: '1.031e-03' min: '-3.708e-01' @@ -391,7 +391,7 @@ grads.network.layer2.0.conv1.weight: - 1 sum: '3.38e+01' grads.network.layer2.0.conv2.weight: - device: cpu + device: cuda:0 max: '4.388e-01' mean: '1.736e-03' min: '-4.009e-01' @@ -402,7 +402,7 @@ grads.network.layer2.0.conv2.weight: - 3 sum: '2.560e+02' grads.network.layer2.0.conv3.weight: - device: cpu + device: cuda:0 max: '3.455e-01' mean: '8.466e-04' min: '-3.519e-01' @@ -413,7 +413,7 @@ grads.network.layer2.0.conv3.weight: - 1 sum: '5.548e+01' grads.network.layer2.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '2.479e-01' mean: '3.199e-04' min: '-2.569e-01' @@ -424,7 +424,7 @@ grads.network.layer2.0.downsample.0.weight: - 1 sum: '4.193e+01' grads.network.layer2.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '1.091e-01' mean: '6.263e-04' min: '-1.059e-01' @@ -432,7 +432,7 @@ grads.network.layer2.0.downsample.1.bias: - 512 sum: '3.207e-01' grads.network.layer2.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.697e-01' mean: '1.416e-03' min: '-1.327e-01' @@ -440,7 +440,7 @@ grads.network.layer2.0.downsample.1.weight: - 512 sum: '7.250e-01' grads.network.layer2.1.bn1.bias: - device: cpu + device: cuda:0 max: '1.482e-01' mean: '-1.673e-03' min: '-1.761e-01' @@ -448,15 +448,15 @@ grads.network.layer2.1.bn1.bias: - 128 sum: '-2.141e-01' grads.network.layer2.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.848e-01' - mean: '-3.946e-08' + mean: '-3.888e-08' min: '-2.179e-01' shape: - 128 - sum: '-5.051e-06' + sum: '-4.977e-06' grads.network.layer2.1.bn2.bias: - device: cpu + device: cuda:0 max: '1.764e-01' mean: '5.389e-03' min: '-1.466e-01' @@ -464,15 +464,15 @@ grads.network.layer2.1.bn2.bias: - 128 sum: '6.898e-01' grads.network.layer2.1.bn2.weight: - device: cpu + device: cuda:0 max: '2.348e-01' - mean: '-1.397e-07' + mean: '-1.404e-07' min: '-2.435e-01' shape: - 128 - sum: '-1.788e-05' + sum: '-1.797e-05' grads.network.layer2.1.bn3.bias: - device: cpu + device: cuda:0 max: '8.049e-02' mean: '-1.62e-04' min: '-6.643e-02' @@ -480,7 +480,7 @@ grads.network.layer2.1.bn3.bias: - 512 sum: '-8.292e-02' grads.network.layer2.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.130e-01' mean: '1.227e-04' min: '-9.870e-02' @@ -488,7 +488,7 @@ grads.network.layer2.1.bn3.weight: - 512 sum: '6.285e-02' grads.network.layer2.1.conv1.weight: - device: cpu + device: cuda:0 max: '2.100e-01' mean: '-3.326e-04' min: '-1.831e-01' @@ -499,7 +499,7 @@ grads.network.layer2.1.conv1.weight: - 1 sum: '-2.18e+01' grads.network.layer2.1.conv2.weight: - device: cpu + device: cuda:0 max: '3.447e-01' mean: '-9.641e-04' min: '-3.505e-01' @@ -510,7 +510,7 @@ grads.network.layer2.1.conv2.weight: - 3 sum: '-1.422e+02' grads.network.layer2.1.conv3.weight: - device: cpu + device: cuda:0 max: '2.356e-01' mean: '-1.869e-04' min: '-2.254e-01' @@ -521,7 +521,7 @@ grads.network.layer2.1.conv3.weight: - 1 sum: '-1.225e+01' grads.network.layer2.2.bn1.bias: - device: cpu + 
device: cuda:0 max: '1.512e-01' mean: '-1.99e-03' min: '-1.240e-01' @@ -529,15 +529,15 @@ grads.network.layer2.2.bn1.bias: - 128 sum: '-2.547e-01' grads.network.layer2.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.999e-01' - mean: '2.258e-08' + mean: '2.270e-08' min: '-1.396e-01' shape: - 128 - sum: '2.891e-06' + sum: '2.906e-06' grads.network.layer2.2.bn2.bias: - device: cpu + device: cuda:0 max: '1.029e-01' mean: '-3.850e-04' min: '-1.010e-01' @@ -545,15 +545,15 @@ grads.network.layer2.2.bn2.bias: - 128 sum: '-4.928e-02' grads.network.layer2.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.463e-01' - mean: '-1.159e-07' + mean: '-1.162e-07' min: '-1.46e-01' shape: - 128 - sum: '-1.484e-05' + sum: '-1.487e-05' grads.network.layer2.2.bn3.bias: - device: cpu + device: cuda:0 max: '4.505e-02' mean: '-9.093e-05' min: '-3.943e-02' @@ -561,7 +561,7 @@ grads.network.layer2.2.bn3.bias: - 512 sum: '-4.656e-02' grads.network.layer2.2.bn3.weight: - device: cpu + device: cuda:0 max: '8.137e-02' mean: '-4.692e-04' min: '-6.764e-02' @@ -569,7 +569,7 @@ grads.network.layer2.2.bn3.weight: - 512 sum: '-2.402e-01' grads.network.layer2.2.conv1.weight: - device: cpu + device: cuda:0 max: '1.230e-01' mean: '2.737e-04' min: '-1.255e-01' @@ -580,7 +580,7 @@ grads.network.layer2.2.conv1.weight: - 1 sum: '1.794e+01' grads.network.layer2.2.conv2.weight: - device: cpu + device: cuda:0 max: '2.359e-01' mean: '4.964e-04' min: '-2.379e-01' @@ -591,7 +591,7 @@ grads.network.layer2.2.conv2.weight: - 3 sum: '7.32e+01' grads.network.layer2.2.conv3.weight: - device: cpu + device: cuda:0 max: '1.738e-01' mean: '4.385e-04' min: '-1.777e-01' @@ -602,7 +602,7 @@ grads.network.layer2.2.conv3.weight: - 1 sum: '2.874e+01' grads.network.layer2.3.bn1.bias: - device: cpu + device: cuda:0 max: '1.279e-01' mean: '6.022e-03' min: '-8.782e-02' @@ -610,15 +610,15 @@ grads.network.layer2.3.bn1.bias: - 128 sum: '7.708e-01' grads.network.layer2.3.bn1.weight: - device: cpu + device: cuda:0 max: '1.222e-01' - mean: '1.257e-08' + mean: '1.199e-08' min: '-1.526e-01' shape: - 128 - sum: '1.609e-06' + sum: '1.535e-06' grads.network.layer2.3.bn2.bias: - device: cpu + device: cuda:0 max: '9.101e-02' mean: '-1.522e-03' min: '-7.893e-02' @@ -626,15 +626,15 @@ grads.network.layer2.3.bn2.bias: - 128 sum: '-1.948e-01' grads.network.layer2.3.bn2.weight: - device: cpu + device: cuda:0 max: '8.481e-02' - mean: '-1.930e-07' + mean: '-1.932e-07' min: '-8.458e-02' shape: - 128 - sum: '-2.471e-05' + sum: '-2.474e-05' grads.network.layer2.3.bn3.bias: - device: cpu + device: cuda:0 max: '2.302e-02' mean: '1.906e-05' min: '-3.022e-02' @@ -642,7 +642,7 @@ grads.network.layer2.3.bn3.bias: - 512 sum: '9.761e-03' grads.network.layer2.3.bn3.weight: - device: cpu + device: cuda:0 max: '4.318e-02' mean: '-8.797e-04' min: '-4.599e-02' @@ -650,7 +650,7 @@ grads.network.layer2.3.bn3.weight: - 512 sum: '-4.504e-01' grads.network.layer2.3.conv1.weight: - device: cpu + device: cuda:0 max: '8.230e-02' mean: '-3.507e-04' min: '-9.358e-02' @@ -661,7 +661,7 @@ grads.network.layer2.3.conv1.weight: - 1 sum: '-2.298e+01' grads.network.layer2.3.conv2.weight: - device: cpu + device: cuda:0 max: '1.666e-01' mean: '8.926e-04' min: '-1.69e-01' @@ -672,7 +672,7 @@ grads.network.layer2.3.conv2.weight: - 3 sum: '1.316e+02' grads.network.layer2.3.conv3.weight: - device: cpu + device: cuda:0 max: '1.444e-01' mean: '1.829e-04' min: '-1.152e-01' @@ -683,7 +683,7 @@ grads.network.layer2.3.conv3.weight: - 1 sum: '1.199e+01' grads.network.layer3.0.bn1.bias: - device: cpu + device: cuda:0 
max: '6.992e-02' mean: '1.721e-03' min: '-8.225e-02' @@ -691,15 +691,15 @@ grads.network.layer3.0.bn1.bias: - 256 sum: '4.405e-01' grads.network.layer3.0.bn1.weight: - device: cpu + device: cuda:0 max: '8.985e-02' - mean: '-2.648e-09' + mean: '-2.561e-09' min: '-1.042e-01' shape: - 256 - sum: '-6.780e-07' + sum: '-6.557e-07' grads.network.layer3.0.bn2.bias: - device: cpu + device: cuda:0 max: '6.940e-02' mean: '5.335e-04' min: '-5.311e-02' @@ -707,15 +707,15 @@ grads.network.layer3.0.bn2.bias: - 256 sum: '1.366e-01' grads.network.layer3.0.bn2.weight: - device: cpu + device: cuda:0 max: '5.623e-02' - mean: '-2.305e-08' + mean: '-2.282e-08' min: '-7.762e-02' shape: - 256 - sum: '-5.901e-06' + sum: '-5.841e-06' grads.network.layer3.0.bn3.bias: - device: cpu + device: cuda:0 max: '3.228e-02' mean: '-1.181e-04' min: '-2.608e-02' @@ -723,7 +723,7 @@ grads.network.layer3.0.bn3.bias: - 1024 sum: '-1.209e-01' grads.network.layer3.0.bn3.weight: - device: cpu + device: cuda:0 max: '3.652e-02' mean: '-7.228e-05' min: '-4.893e-02' @@ -731,7 +731,7 @@ grads.network.layer3.0.bn3.weight: - 1024 sum: '-7.401e-02' grads.network.layer3.0.conv1.weight: - device: cpu + device: cuda:0 max: '9.913e-02' mean: '-3.902e-04' min: '-9.101e-02' @@ -742,7 +742,7 @@ grads.network.layer3.0.conv1.weight: - 1 sum: '-5.114e+01' grads.network.layer3.0.conv2.weight: - device: cpu + device: cuda:0 max: '1.257e-01' mean: '-8.546e-05' min: '-1.265e-01' @@ -753,7 +753,7 @@ grads.network.layer3.0.conv2.weight: - 3 sum: '-5.040e+01' grads.network.layer3.0.conv3.weight: - device: cpu + device: cuda:0 max: '9.508e-02' mean: '4.733e-05' min: '-1.04e-01' @@ -764,7 +764,7 @@ grads.network.layer3.0.conv3.weight: - 1 sum: '1.241e+01' grads.network.layer3.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '7.85e-02' mean: '-3.186e-05' min: '-9.409e-02' @@ -775,7 +775,7 @@ grads.network.layer3.0.downsample.0.weight: - 1 sum: '-1.671e+01' grads.network.layer3.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '3.228e-02' mean: '-1.181e-04' min: '-2.608e-02' @@ -783,7 +783,7 @@ grads.network.layer3.0.downsample.1.bias: - 1024 sum: '-1.209e-01' grads.network.layer3.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '3.657e-02' mean: '-7.938e-05' min: '-3.968e-02' @@ -791,7 +791,7 @@ grads.network.layer3.0.downsample.1.weight: - 1024 sum: '-8.128e-02' grads.network.layer3.1.bn1.bias: - device: cpu + device: cuda:0 max: '5.199e-02' mean: '-3.091e-04' min: '-6.523e-02' @@ -799,15 +799,15 @@ grads.network.layer3.1.bn1.bias: - 256 sum: '-7.912e-02' grads.network.layer3.1.bn1.weight: - device: cpu + device: cuda:0 max: '7.237e-02' - mean: '1.156e-08' + mean: '1.141e-08' min: '-5.789e-02' shape: - 256 - sum: '2.959e-06' + sum: '2.921e-06' grads.network.layer3.1.bn2.bias: - device: cpu + device: cuda:0 max: '4.225e-02' mean: '7.41e-04' min: '-4.171e-02' @@ -815,15 +815,15 @@ grads.network.layer3.1.bn2.bias: - 256 sum: '1.897e-01' grads.network.layer3.1.bn2.weight: - device: cpu + device: cuda:0 max: '3.798e-02' - mean: '3.897e-08' + mean: '3.9e-08' min: '-5.021e-02' shape: - 256 - sum: '9.976e-06' + sum: '9.984e-06' grads.network.layer3.1.bn3.bias: - device: cpu + device: cuda:0 max: '1.976e-02' mean: '-1.692e-04' min: '-2.215e-02' @@ -831,7 +831,7 @@ grads.network.layer3.1.bn3.bias: - 1024 sum: '-1.733e-01' grads.network.layer3.1.bn3.weight: - device: cpu + device: cuda:0 max: '2.348e-02' mean: '1.549e-04' min: '-2.379e-02' @@ -839,7 +839,7 @@ grads.network.layer3.1.bn3.weight: - 1024 sum: '1.587e-01' 
grads.network.layer3.1.conv1.weight: - device: cpu + device: cuda:0 max: '4.929e-02' mean: '4.316e-05' min: '-4.696e-02' @@ -850,7 +850,7 @@ grads.network.layer3.1.conv1.weight: - 1 sum: '1.131e+01' grads.network.layer3.1.conv2.weight: - device: cpu + device: cuda:0 max: '1.156e-01' mean: '-8.390e-05' min: '-1.048e-01' @@ -861,7 +861,7 @@ grads.network.layer3.1.conv2.weight: - 3 sum: '-4.949e+01' grads.network.layer3.1.conv3.weight: - device: cpu + device: cuda:0 max: '6.757e-02' mean: '3.39e-05' min: '-6.879e-02' @@ -872,7 +872,7 @@ grads.network.layer3.1.conv3.weight: - 1 sum: '8.886e+00' grads.network.layer3.2.bn1.bias: - device: cpu + device: cuda:0 max: '3.715e-02' mean: '-3.498e-04' min: '-4.113e-02' @@ -880,15 +880,15 @@ grads.network.layer3.2.bn1.bias: - 256 sum: '-8.956e-02' grads.network.layer3.2.bn1.weight: - device: cpu + device: cuda:0 max: '4.569e-02' - mean: '2.794e-09' + mean: '2.867e-09' min: '-4.962e-02' shape: - 256 - sum: '7.153e-07' + sum: '7.339e-07' grads.network.layer3.2.bn2.bias: - device: cpu + device: cuda:0 max: '3.029e-02' mean: '-4.436e-04' min: '-2.692e-02' @@ -896,15 +896,15 @@ grads.network.layer3.2.bn2.bias: - 256 sum: '-1.135e-01' grads.network.layer3.2.bn2.weight: - device: cpu + device: cuda:0 max: '3.397e-02' - mean: '-1.458e-08' + mean: '-1.461e-08' min: '-3.55e-02' shape: - 256 - sum: '-3.733e-06' + sum: '-3.740e-06' grads.network.layer3.2.bn3.bias: - device: cpu + device: cuda:0 max: '1.074e-02' mean: '-9.653e-05' min: '-1.428e-02' @@ -912,7 +912,7 @@ grads.network.layer3.2.bn3.bias: - 1024 sum: '-9.884e-02' grads.network.layer3.2.bn3.weight: - device: cpu + device: cuda:0 max: '2.000e-02' mean: '-7.752e-05' min: '-1.676e-02' @@ -920,7 +920,7 @@ grads.network.layer3.2.bn3.weight: - 1024 sum: '-7.938e-02' grads.network.layer3.2.conv1.weight: - device: cpu + device: cuda:0 max: '3.134e-02' mean: '6.29e-05' min: '-3.177e-02' @@ -931,7 +931,7 @@ grads.network.layer3.2.conv1.weight: - 1 sum: '1.649e+01' grads.network.layer3.2.conv2.weight: - device: cpu + device: cuda:0 max: '7.868e-02' mean: '7.155e-06' min: '-7.522e-02' @@ -942,7 +942,7 @@ grads.network.layer3.2.conv2.weight: - 3 sum: '4.220e+00' grads.network.layer3.2.conv3.weight: - device: cpu + device: cuda:0 max: '4.457e-02' mean: '-6.326e-05' min: '-4.720e-02' @@ -953,7 +953,7 @@ grads.network.layer3.2.conv3.weight: - 1 sum: '-1.658e+01' grads.network.layer3.3.bn1.bias: - device: cpu + device: cuda:0 max: '4.017e-02' mean: '6.214e-05' min: '-2.511e-02' @@ -961,15 +961,15 @@ grads.network.layer3.3.bn1.bias: - 256 sum: '1.591e-02' grads.network.layer3.3.bn1.weight: - device: cpu + device: cuda:0 max: '3.217e-02' - mean: '-1.31e-10' + mean: '-2.183e-10' min: '-3.779e-02' shape: - 256 - sum: '-3.353e-08' + sum: '-5.588e-08' grads.network.layer3.3.bn2.bias: - device: cpu + device: cuda:0 max: '2.313e-02' mean: '-2.275e-06' min: '-2.476e-02' @@ -977,15 +977,15 @@ grads.network.layer3.3.bn2.bias: - 256 sum: '-5.825e-04' grads.network.layer3.3.bn2.weight: - device: cpu + device: cuda:0 max: '2.436e-02' - mean: '-1.283e-08' + mean: '-1.279e-08' min: '-2.400e-02' shape: - 256 - sum: '-3.286e-06' + sum: '-3.275e-06' grads.network.layer3.3.bn3.bias: - device: cpu + device: cuda:0 max: '9.701e-03' mean: '-4.152e-05' min: '-8.985e-03' @@ -993,7 +993,7 @@ grads.network.layer3.3.bn3.bias: - 1024 sum: '-4.251e-02' grads.network.layer3.3.bn3.weight: - device: cpu + device: cuda:0 max: '1.274e-02' mean: '-5.492e-05' min: '-1.673e-02' @@ -1001,7 +1001,7 @@ grads.network.layer3.3.bn3.weight: - 1024 sum: '-5.623e-02' 
grads.network.layer3.3.conv1.weight: - device: cpu + device: cuda:0 max: '2.719e-02' mean: '-4.864e-05' min: '-2.668e-02' @@ -1012,7 +1012,7 @@ grads.network.layer3.3.conv1.weight: - 1 sum: '-1.275e+01' grads.network.layer3.3.conv2.weight: - device: cpu + device: cuda:0 max: '6.36e-02' mean: '7.046e-05' min: '-5.796e-02' @@ -1023,7 +1023,7 @@ grads.network.layer3.3.conv2.weight: - 3 sum: '4.156e+01' grads.network.layer3.3.conv3.weight: - device: cpu + device: cuda:0 max: '4.141e-02' mean: '1.489e-05' min: '-3.670e-02' @@ -1034,7 +1034,7 @@ grads.network.layer3.3.conv3.weight: - 1 sum: '3.903e+00' grads.network.layer3.4.bn1.bias: - device: cpu + device: cuda:0 max: '2.147e-02' mean: '3.403e-05' min: '-2.25e-02' @@ -1042,7 +1042,7 @@ grads.network.layer3.4.bn1.bias: - 256 sum: '8.711e-03' grads.network.layer3.4.bn1.weight: - device: cpu + device: cuda:0 max: '3.626e-02' mean: '-1.892e-09' min: '-2.356e-02' @@ -1050,7 +1050,7 @@ grads.network.layer3.4.bn1.weight: - 256 sum: '-4.843e-07' grads.network.layer3.4.bn2.bias: - device: cpu + device: cuda:0 max: '1.518e-02' mean: '3.233e-04' min: '-1.562e-02' @@ -1058,7 +1058,7 @@ grads.network.layer3.4.bn2.bias: - 256 sum: '8.277e-02' grads.network.layer3.4.bn2.weight: - device: cpu + device: cuda:0 max: '2.106e-02' mean: '4.386e-08' min: '-2.206e-02' @@ -1066,7 +1066,7 @@ grads.network.layer3.4.bn2.weight: - 256 sum: '1.123e-05' grads.network.layer3.4.bn3.bias: - device: cpu + device: cuda:0 max: '6.997e-03' mean: '-6.533e-05' min: '-7.944e-03' @@ -1074,7 +1074,7 @@ grads.network.layer3.4.bn3.bias: - 1024 sum: '-6.689e-02' grads.network.layer3.4.bn3.weight: - device: cpu + device: cuda:0 max: '1.064e-02' mean: '1.463e-04' min: '-9.902e-03' @@ -1082,7 +1082,7 @@ grads.network.layer3.4.bn3.weight: - 1024 sum: '1.498e-01' grads.network.layer3.4.conv1.weight: - device: cpu + device: cuda:0 max: '1.904e-02' mean: '-2.754e-05' min: '-1.891e-02' @@ -1093,7 +1093,7 @@ grads.network.layer3.4.conv1.weight: - 1 sum: '-7.22e+00' grads.network.layer3.4.conv2.weight: - device: cpu + device: cuda:0 max: '4.254e-02' mean: '-2.627e-05' min: '-5.017e-02' @@ -1104,7 +1104,7 @@ grads.network.layer3.4.conv2.weight: - 3 sum: '-1.549e+01' grads.network.layer3.4.conv3.weight: - device: cpu + device: cuda:0 max: '2.563e-02' mean: '-3.938e-06' min: '-2.833e-02' @@ -1115,7 +1115,7 @@ grads.network.layer3.4.conv3.weight: - 1 sum: '-1.032e+00' grads.network.layer3.5.bn1.bias: - device: cpu + device: cuda:0 max: '1.901e-02' mean: '2.356e-04' min: '-1.961e-02' @@ -1123,7 +1123,7 @@ grads.network.layer3.5.bn1.bias: - 256 sum: '6.031e-02' grads.network.layer3.5.bn1.weight: - device: cpu + device: cuda:0 max: '2.546e-02' mean: '-9.313e-10' min: '-2.608e-02' @@ -1131,7 +1131,7 @@ grads.network.layer3.5.bn1.weight: - 256 sum: '-2.384e-07' grads.network.layer3.5.bn2.bias: - device: cpu + device: cuda:0 max: '1.274e-02' mean: '-1.438e-04' min: '-1.364e-02' @@ -1139,15 +1139,15 @@ grads.network.layer3.5.bn2.bias: - 256 sum: '-3.680e-02' grads.network.layer3.5.bn2.weight: - device: cpu + device: cuda:0 max: '1.536e-02' - mean: '-3.049e-09' + mean: '-3.012e-09' min: '-2.043e-02' shape: - 256 - sum: '-7.804e-07' + sum: '-7.711e-07' grads.network.layer3.5.bn3.bias: - device: cpu + device: cuda:0 max: '4.202e-03' mean: '-2.573e-05' min: '-4.034e-03' @@ -1155,7 +1155,7 @@ grads.network.layer3.5.bn3.bias: - 1024 sum: '-2.634e-02' grads.network.layer3.5.bn3.weight: - device: cpu + device: cuda:0 max: '9.836e-03' mean: '-1.711e-05' min: '-8.328e-03' @@ -1163,7 +1163,7 @@ 
grads.network.layer3.5.bn3.weight: - 1024 sum: '-1.752e-02' grads.network.layer3.5.conv1.weight: - device: cpu + device: cuda:0 max: '1.525e-02' mean: '-3.503e-05' min: '-1.432e-02' @@ -1174,7 +1174,7 @@ grads.network.layer3.5.conv1.weight: - 1 sum: '-9.184e+00' grads.network.layer3.5.conv2.weight: - device: cpu + device: cuda:0 max: '4.67e-02' mean: '-7.542e-05' min: '-3.959e-02' @@ -1185,7 +1185,7 @@ grads.network.layer3.5.conv2.weight: - 3 sum: '-4.448e+01' grads.network.layer3.5.conv3.weight: - device: cpu + device: cuda:0 max: '2.486e-02' mean: '-4.622e-05' min: '-2.199e-02' @@ -1196,7 +1196,7 @@ grads.network.layer3.5.conv3.weight: - 1 sum: '-1.212e+01' grads.network.layer4.0.bn1.bias: - device: cpu + device: cuda:0 max: '1.216e-02' mean: '1.105e-04' min: '-1.527e-02' @@ -1204,15 +1204,15 @@ grads.network.layer4.0.bn1.bias: - 512 sum: '5.66e-02' grads.network.layer4.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.341e-02' - mean: '2.485e-09' + mean: '2.454e-09' min: '-1.568e-02' shape: - 512 - sum: '1.272e-06' + sum: '1.256e-06' grads.network.layer4.0.bn2.bias: - device: cpu + device: cuda:0 max: '1.081e-02' mean: '-9.498e-06' min: '-1.008e-02' @@ -1220,15 +1220,15 @@ grads.network.layer4.0.bn2.bias: - 512 sum: '-4.863e-03' grads.network.layer4.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.896e-02' - mean: '3.363e-08' + mean: '3.362e-08' min: '-1.575e-02' shape: - 512 - sum: '1.722e-05' + sum: '1.721e-05' grads.network.layer4.0.bn3.bias: - device: cpu + device: cuda:0 max: '6.932e-03' mean: '1.369e-04' min: '-6.060e-03' @@ -1236,7 +1236,7 @@ grads.network.layer4.0.bn3.bias: - 2048 sum: '2.805e-01' grads.network.layer4.0.bn3.weight: - device: cpu + device: cuda:0 max: '8.164e-03' mean: '1.423e-04' min: '-7.306e-03' @@ -1244,7 +1244,7 @@ grads.network.layer4.0.bn3.weight: - 2048 sum: '2.915e-01' grads.network.layer4.0.conv1.weight: - device: cpu + device: cuda:0 max: '1.748e-02' mean: '-2.425e-05' min: '-1.699e-02' @@ -1255,7 +1255,7 @@ grads.network.layer4.0.conv1.weight: - 1 sum: '-1.271e+01' grads.network.layer4.0.conv2.weight: - device: cpu + device: cuda:0 max: '4.355e-02' mean: '-2.123e-06' min: '-4.091e-02' @@ -1266,7 +1266,7 @@ grads.network.layer4.0.conv2.weight: - 3 sum: '-5.008e+00' grads.network.layer4.0.conv3.weight: - device: cpu + device: cuda:0 max: '1.988e-02' mean: '2.471e-05' min: '-2.667e-02' @@ -1277,7 +1277,7 @@ grads.network.layer4.0.conv3.weight: - 1 sum: '2.591e+01' grads.network.layer4.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '1.62e-02' mean: '1.449e-05' min: '-2.14e-02' @@ -1288,7 +1288,7 @@ grads.network.layer4.0.downsample.0.weight: - 1 sum: '3.038e+01' grads.network.layer4.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '6.932e-03' mean: '1.369e-04' min: '-6.060e-03' @@ -1296,7 +1296,7 @@ grads.network.layer4.0.downsample.1.bias: - 2048 sum: '2.805e-01' grads.network.layer4.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '7.480e-03' mean: '2.966e-05' min: '-7.067e-03' @@ -1304,7 +1304,7 @@ grads.network.layer4.0.downsample.1.weight: - 2048 sum: '6.073e-02' grads.network.layer4.1.bn1.bias: - device: cpu + device: cuda:0 max: '8.244e-03' mean: '2.764e-05' min: '-1.008e-02' @@ -1312,15 +1312,15 @@ grads.network.layer4.1.bn1.bias: - 512 sum: '1.415e-02' grads.network.layer4.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.030e-02' - mean: '7.105e-09' + mean: '7.094e-09' min: '-1.473e-02' shape: - 512 - sum: '3.638e-06' + sum: '3.632e-06' grads.network.layer4.1.bn2.bias: - device: cpu + device: cuda:0 max: 
'9.241e-03' mean: '1.883e-05' min: '-6.795e-03' @@ -1328,15 +1328,15 @@ grads.network.layer4.1.bn2.bias: - 512 sum: '9.642e-03' grads.network.layer4.1.bn2.weight: - device: cpu + device: cuda:0 max: '9.995e-03' - mean: '2.547e-08' + mean: '2.548e-08' min: '-9.566e-03' shape: - 512 - sum: '1.304e-05' + sum: '1.305e-05' grads.network.layer4.1.bn3.bias: - device: cpu + device: cuda:0 max: '5.288e-03' mean: '1.693e-04' min: '-5.143e-03' @@ -1344,7 +1344,7 @@ grads.network.layer4.1.bn3.bias: - 2048 sum: '3.468e-01' grads.network.layer4.1.bn3.weight: - device: cpu + device: cuda:0 max: '5.510e-03' mean: '1.148e-04' min: '-4.869e-03' @@ -1352,7 +1352,7 @@ grads.network.layer4.1.bn3.weight: - 2048 sum: '2.352e-01' grads.network.layer4.1.conv1.weight: - device: cpu + device: cuda:0 max: '1.323e-02' mean: '-7.145e-06' min: '-1.063e-02' @@ -1363,7 +1363,7 @@ grads.network.layer4.1.conv1.weight: - 1 sum: '-7.492e+00' grads.network.layer4.1.conv2.weight: - device: cpu + device: cuda:0 max: '4.482e-02' mean: '4.064e-06' min: '-4.435e-02' @@ -1374,7 +1374,7 @@ grads.network.layer4.1.conv2.weight: - 3 sum: '9.588e+00' grads.network.layer4.1.conv3.weight: - device: cpu + device: cuda:0 max: '1.372e-02' mean: '-7.804e-07' min: '-1.28e-02' @@ -1385,7 +1385,7 @@ grads.network.layer4.1.conv3.weight: - 1 sum: '-8.183e-01' grads.network.layer4.2.bn1.bias: - device: cpu + device: cuda:0 max: '5.947e-03' mean: '3.877e-05' min: '-7.937e-03' @@ -1393,15 +1393,15 @@ grads.network.layer4.2.bn1.bias: - 512 sum: '1.985e-02' grads.network.layer4.2.bn1.weight: - device: cpu + device: cuda:0 max: '8.022e-03' - mean: '1.703e-09' + mean: '1.71e-09' min: '-9.428e-03' shape: - 512 - sum: '8.717e-07' + sum: '8.754e-07' grads.network.layer4.2.bn2.bias: - device: cpu + device: cuda:0 max: '5.880e-03' mean: '9.59e-05' min: '-4.611e-03' @@ -1409,15 +1409,15 @@ grads.network.layer4.2.bn2.bias: - 512 sum: '4.91e-02' grads.network.layer4.2.bn2.weight: - device: cpu + device: cuda:0 max: '7.32e-03' - mean: '2.75e-08' + mean: '2.751e-08' min: '-5.822e-03' shape: - 512 - sum: '1.408e-05' + sum: '1.409e-05' grads.network.layer4.2.bn3.bias: - device: cpu + device: cuda:0 max: '6.23e-03' mean: '2.174e-04' min: '-6.104e-03' @@ -1425,7 +1425,7 @@ grads.network.layer4.2.bn3.bias: - 2048 sum: '4.453e-01' grads.network.layer4.2.bn3.weight: - device: cpu + device: cuda:0 max: '4.123e-03' mean: '1.086e-04' min: '-4.657e-03' @@ -1433,7 +1433,7 @@ grads.network.layer4.2.bn3.weight: - 2048 sum: '2.225e-01' grads.network.layer4.2.conv1.weight: - device: cpu + device: cuda:0 max: '8.671e-03' mean: '-1.917e-05' min: '-8.358e-03' @@ -1444,7 +1444,7 @@ grads.network.layer4.2.conv1.weight: - 1 sum: '-2.010e+01' grads.network.layer4.2.conv2.weight: - device: cpu + device: cuda:0 max: '3.57e-02' mean: '-5.759e-06' min: '-3.629e-02' @@ -1455,7 +1455,7 @@ grads.network.layer4.2.conv2.weight: - 3 sum: '-1.359e+01' grads.network.layer4.2.conv3.weight: - device: cpu + device: cuda:0 max: '9.38e-03' mean: '2.033e-05' min: '-1.081e-02' @@ -1466,7 +1466,7 @@ grads.network.layer4.2.conv3.weight: - 1 sum: '2.131e+01' outputs.logits: - device: cpu + device: cuda:0 max: '5.678e+00' mean: '-2.389e-03' min: '-5.650e+00' @@ -1475,14 +1475,14 @@ outputs.logits: - 10 sum: '-3.058e+00' outputs.loss: - device: cpu + device: cuda:0 max: '2.735e+00' mean: '2.735e+00' min: '2.735e+00' shape: [] sum: '2.735e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git 
a/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_imagenet_image_classifier.yaml new file mode 100644 index 00000000..6da0613a --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_imagenet_image_classifier.yaml @@ -0,0 +1,1491 @@ +batch.0: + device: cuda:0 + max: '2.640e+00' + mean: '-6.663e-02' + min: '-2.118e+00' + shape: + - 64 + - 3 + - 224 + - 224 + sum: '-6.419e+05' +batch.1: + device: cuda:0 + max: 988 + mean: '5.182e+02' + min: 0 + shape: + - 64 + sum: 33166 +grads.network.bn1.bias: + device: cuda:0 + max: '2.068e-01' + mean: '-9.46e-03' + min: '-2.002e-01' + shape: + - 64 + sum: '-6.054e-01' +grads.network.bn1.weight: + device: cuda:0 + max: '2.498e-01' + mean: '2.254e-07' + min: '-3.246e-01' + shape: + - 64 + sum: '1.442e-05' +grads.network.conv1.weight: + device: cuda:0 + max: '4.087e+00' + mean: '2.056e-01' + min: '-2.608e+00' + shape: + - 64 + - 3 + - 7 + - 7 + sum: '1.934e+03' +grads.network.fc.bias: + device: cuda:0 + max: '4.933e-03' + mean: '-2.235e-11' + min: '-3.081e-02' + shape: + - 1000 + sum: '-2.235e-08' +grads.network.fc.weight: + device: cuda:0 + max: '9.717e-03' + mean: '-1.118e-11' + min: '-9.624e-02' + shape: + - 1000 + - 2048 + sum: '-2.289e-05' +grads.network.layer1.0.bn1.bias: + device: cuda:0 + max: '1.701e-01' + mean: '-1.097e-02' + min: '-2.24e-01' + shape: + - 64 + sum: '-7.022e-01' +grads.network.layer1.0.bn1.weight: + device: cuda:0 + max: '2.153e-01' + mean: '-6.054e-09' + min: '-2.101e-01' + shape: + - 64 + sum: '-3.874e-07' +grads.network.layer1.0.bn2.bias: + device: cuda:0 + max: '2.238e-01' + mean: '2.082e-03' + min: '-1.410e-01' + shape: + - 64 + sum: '1.333e-01' +grads.network.layer1.0.bn2.weight: + device: cuda:0 + max: '1.821e-01' + mean: '-9.057e-08' + min: '-2.169e-01' + shape: + - 64 + sum: '-5.797e-06' +grads.network.layer1.0.bn3.bias: + device: cuda:0 + max: '6.3e-02' + mean: '-6.664e-04' + min: '-6.507e-02' + shape: + - 256 + sum: '-1.706e-01' +grads.network.layer1.0.bn3.weight: + device: cuda:0 + max: '9.049e-02' + mean: '-6.014e-04' + min: '-9.014e-02' + shape: + - 256 + sum: '-1.539e-01' +grads.network.layer1.0.conv1.weight: + device: cuda:0 + max: '3.310e-01' + mean: '-6.233e-04' + min: '-4.917e-01' + shape: + - 64 + - 64 + - 1 + - 1 + sum: '-2.553e+00' +grads.network.layer1.0.conv2.weight: + device: cuda:0 + max: '2.914e-01' + mean: '1.291e-03' + min: '-3.517e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '4.760e+01' +grads.network.layer1.0.conv3.weight: + device: cuda:0 + max: '2.922e-01' + mean: '9.76e-04' + min: '-2.715e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '1.599e+01' +grads.network.layer1.0.downsample.0.weight: + device: cuda:0 + max: '3.240e-01' + mean: '6.147e-04' + min: '-4.201e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '1.007e+01' +grads.network.layer1.0.downsample.1.bias: + device: cuda:0 + max: '6.3e-02' + mean: '-6.664e-04' + min: '-6.507e-02' + shape: + - 256 + sum: '-1.706e-01' +grads.network.layer1.0.downsample.1.weight: + device: cuda:0 + max: '1.168e-01' + mean: '8.313e-04' + min: '-7.264e-02' + shape: + - 256 + sum: '2.128e-01' +grads.network.layer1.1.bn1.bias: + device: cuda:0 + max: '1.160e-01' + mean: '9.456e-04' + min: '-1.079e-01' + shape: + - 64 + sum: '6.052e-02' +grads.network.layer1.1.bn1.weight: + device: cuda:0 + max: '1.274e-01' + 
mean: '3.097e-08' + min: '-1.296e-01' + shape: + - 64 + sum: '1.982e-06' +grads.network.layer1.1.bn2.bias: + device: cuda:0 + max: '9.845e-02' + mean: '5.403e-03' + min: '-7.661e-02' + shape: + - 64 + sum: '3.458e-01' +grads.network.layer1.1.bn2.weight: + device: cuda:0 + max: '1.274e-01' + mean: '-4.994e-08' + min: '-1.105e-01' + shape: + - 64 + sum: '-3.196e-06' +grads.network.layer1.1.bn3.bias: + device: cuda:0 + max: '4.778e-02' + mean: '9.509e-04' + min: '-3.793e-02' + shape: + - 256 + sum: '2.434e-01' +grads.network.layer1.1.bn3.weight: + device: cuda:0 + max: '7.710e-02' + mean: '2.718e-04' + min: '-5.506e-02' + shape: + - 256 + sum: '6.959e-02' +grads.network.layer1.1.conv1.weight: + device: cuda:0 + max: '1.421e-01' + mean: '3.867e-04' + min: '-1.254e-01' + shape: + - 64 + - 256 + - 1 + - 1 + sum: '6.335e+00' +grads.network.layer1.1.conv2.weight: + device: cuda:0 + max: '2.049e-01' + mean: '-3.724e-04' + min: '-2.049e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-1.373e+01' +grads.network.layer1.1.conv3.weight: + device: cuda:0 + max: '1.850e-01' + mean: '-1.549e-04' + min: '-1.803e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '-2.539e+00' +grads.network.layer1.2.bn1.bias: + device: cuda:0 + max: '5.462e-02' + mean: '-5.246e-04' + min: '-8.094e-02' + shape: + - 64 + sum: '-3.358e-02' +grads.network.layer1.2.bn1.weight: + device: cuda:0 + max: '1.337e-01' + mean: '9.662e-09' + min: '-7.616e-02' + shape: + - 64 + sum: '6.184e-07' +grads.network.layer1.2.bn2.bias: + device: cuda:0 + max: '5.837e-02' + mean: '-2.464e-04' + min: '-6.975e-02' + shape: + - 64 + sum: '-1.577e-02' +grads.network.layer1.2.bn2.weight: + device: cuda:0 + max: '7.667e-02' + mean: '-1.267e-07' + min: '-6.187e-02' + shape: + - 64 + sum: '-8.106e-06' +grads.network.layer1.2.bn3.bias: + device: cuda:0 + max: '2.286e-02' + mean: '7.026e-04' + min: '-2.327e-02' + shape: + - 256 + sum: '1.799e-01' +grads.network.layer1.2.bn3.weight: + device: cuda:0 + max: '4.287e-02' + mean: '-5.017e-04' + min: '-4.000e-02' + shape: + - 256 + sum: '-1.284e-01' +grads.network.layer1.2.conv1.weight: + device: cuda:0 + max: '8.545e-02' + mean: '-3.494e-04' + min: '-9.286e-02' + shape: + - 64 + - 256 + - 1 + - 1 + sum: '-5.725e+00' +grads.network.layer1.2.conv2.weight: + device: cuda:0 + max: '1.467e-01' + mean: '-1.392e-04' + min: '-1.282e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-5.132e+00' +grads.network.layer1.2.conv3.weight: + device: cuda:0 + max: '1.048e-01' + mean: '-1.928e-04' + min: '-1.267e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '-3.16e+00' +grads.network.layer2.0.bn1.bias: + device: cuda:0 + max: '4.211e-02' + mean: '1.735e-03' + min: '-5.167e-02' + shape: + - 128 + sum: '2.221e-01' +grads.network.layer2.0.bn1.weight: + device: cuda:0 + max: '4.957e-02' + mean: '8.149e-09' + min: '-4.993e-02' + shape: + - 128 + sum: '1.043e-06' +grads.network.layer2.0.bn2.bias: + device: cuda:0 + max: '3.316e-02' + mean: '7.625e-04' + min: '-3.657e-02' + shape: + - 128 + sum: '9.760e-02' +grads.network.layer2.0.bn2.weight: + device: cuda:0 + max: '5.121e-02' + mean: '-4.243e-08' + min: '-4.316e-02' + shape: + - 128 + sum: '-5.431e-06' +grads.network.layer2.0.bn3.bias: + device: cuda:0 + max: '2.226e-02' + mean: '1.177e-04' + min: '-1.811e-02' + shape: + - 512 + sum: '6.026e-02' +grads.network.layer2.0.bn3.weight: + device: cuda:0 + max: '2.429e-02' + mean: '-2.402e-04' + min: '-2.550e-02' + shape: + - 512 + sum: '-1.230e-01' +grads.network.layer2.0.conv1.weight: + device: cuda:0 + max: '8.179e-02' + mean: '-1.704e-05' + 
min: '-7.493e-02' + shape: + - 128 + - 256 + - 1 + - 1 + sum: '-5.582e-01' +grads.network.layer2.0.conv2.weight: + device: cuda:0 + max: '8.488e-02' + mean: '-2.583e-04' + min: '-8.498e-02' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-3.809e+01' +grads.network.layer2.0.conv3.weight: + device: cuda:0 + max: '7.02e-02' + mean: '1.67e-05' + min: '-7.408e-02' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '1.094e+00' +grads.network.layer2.0.downsample.0.weight: + device: cuda:0 + max: '5.65e-02' + mean: '3.045e-05' + min: '-5.636e-02' + shape: + - 512 + - 256 + - 1 + - 1 + sum: '3.991e+00' +grads.network.layer2.0.downsample.1.bias: + device: cuda:0 + max: '2.226e-02' + mean: '1.177e-04' + min: '-1.811e-02' + shape: + - 512 + sum: '6.026e-02' +grads.network.layer2.0.downsample.1.weight: + device: cuda:0 + max: '2.814e-02' + mean: '4.625e-04' + min: '-2.305e-02' + shape: + - 512 + sum: '2.368e-01' +grads.network.layer2.1.bn1.bias: + device: cuda:0 + max: '3.645e-02' + mean: '-7.118e-04' + min: '-3.115e-02' + shape: + - 128 + sum: '-9.111e-02' +grads.network.layer2.1.bn1.weight: + device: cuda:0 + max: '4.458e-02' + mean: '-6.869e-09' + min: '-3.865e-02' + shape: + - 128 + sum: '-8.792e-07' +grads.network.layer2.1.bn2.bias: + device: cuda:0 + max: '2.695e-02' + mean: '-9.38e-04' + min: '-2.543e-02' + shape: + - 128 + sum: '-1.201e-01' +grads.network.layer2.1.bn2.weight: + device: cuda:0 + max: '2.824e-02' + mean: '-1.768e-08' + min: '-2.943e-02' + shape: + - 128 + sum: '-2.263e-06' +grads.network.layer2.1.bn3.bias: + device: cuda:0 + max: '1.148e-02' + mean: '2.42e-04' + min: '-9.819e-03' + shape: + - 512 + sum: '1.239e-01' +grads.network.layer2.1.bn3.weight: + device: cuda:0 + max: '1.542e-02' + mean: '-9.633e-05' + min: '-1.593e-02' + shape: + - 512 + sum: '-4.932e-02' +grads.network.layer2.1.conv1.weight: + device: cuda:0 + max: '3.077e-02' + mean: '3.157e-04' + min: '-3.122e-02' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '2.069e+01' +grads.network.layer2.1.conv2.weight: + device: cuda:0 + max: '5.878e-02' + mean: '5.832e-05' + min: '-5.409e-02' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '8.600e+00' +grads.network.layer2.1.conv3.weight: + device: cuda:0 + max: '5.426e-02' + mean: '6.567e-05' + min: '-3.881e-02' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '4.303e+00' +grads.network.layer2.2.bn1.bias: + device: cuda:0 + max: '3.436e-02' + mean: '1.063e-05' + min: '-2.625e-02' + shape: + - 128 + sum: '1.361e-03' +grads.network.layer2.2.bn1.weight: + device: cuda:0 + max: '2.442e-02' + mean: '-6.228e-09' + min: '-3.548e-02' + shape: + - 128 + sum: '-7.972e-07' +grads.network.layer2.2.bn2.bias: + device: cuda:0 + max: '1.91e-02' + mean: '8.820e-05' + min: '-1.719e-02' + shape: + - 128 + sum: '1.129e-02' +grads.network.layer2.2.bn2.weight: + device: cuda:0 + max: '2.045e-02' + mean: '7.683e-09' + min: '-2.136e-02' + shape: + - 128 + sum: '9.835e-07' +grads.network.layer2.2.bn3.bias: + device: cuda:0 + max: '7.928e-03' + mean: '-9.574e-05' + min: '-7.345e-03' + shape: + - 512 + sum: '-4.902e-02' +grads.network.layer2.2.bn3.weight: + device: cuda:0 + max: '1.170e-02' + mean: '2.873e-05' + min: '-1.136e-02' + shape: + - 512 + sum: '1.471e-02' +grads.network.layer2.2.conv1.weight: + device: cuda:0 + max: '2.182e-02' + mean: '5.088e-05' + min: '-2.084e-02' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '3.334e+00' +grads.network.layer2.2.conv2.weight: + device: cuda:0 + max: '4.288e-02' + mean: '-5.458e-05' + min: '-4.216e-02' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-8.048e+00' 
+grads.network.layer2.2.conv3.weight: + device: cuda:0 + max: '3.284e-02' + mean: '4.204e-05' + min: '-3.245e-02' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '2.755e+00' +grads.network.layer2.3.bn1.bias: + device: cuda:0 + max: '1.834e-02' + mean: '4.186e-04' + min: '-2.066e-02' + shape: + - 128 + sum: '5.358e-02' +grads.network.layer2.3.bn1.weight: + device: cuda:0 + max: '2.448e-02' + mean: '-2.095e-09' + min: '-2.123e-02' + shape: + - 128 + sum: '-2.682e-07' +grads.network.layer2.3.bn2.bias: + device: cuda:0 + max: '1.283e-02' + mean: '2.229e-04' + min: '-1.321e-02' + shape: + - 128 + sum: '2.853e-02' +grads.network.layer2.3.bn2.weight: + device: cuda:0 + max: '1.610e-02' + mean: '-3.396e-08' + min: '-2.095e-02' + shape: + - 128 + sum: '-4.347e-06' +grads.network.layer2.3.bn3.bias: + device: cuda:0 + max: '4.654e-03' + mean: '-2.983e-05' + min: '-5.059e-03' + shape: + - 512 + sum: '-1.527e-02' +grads.network.layer2.3.bn3.weight: + device: cuda:0 + max: '1.013e-02' + mean: '-1.547e-04' + min: '-1.059e-02' + shape: + - 512 + sum: '-7.918e-02' +grads.network.layer2.3.conv1.weight: + device: cuda:0 + max: '1.884e-02' + mean: '1.101e-04' + min: '-1.608e-02' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '7.213e+00' +grads.network.layer2.3.conv2.weight: + device: cuda:0 + max: '2.661e-02' + mean: '6.131e-05' + min: '-2.643e-02' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '9.040e+00' +grads.network.layer2.3.conv3.weight: + device: cuda:0 + max: '2.310e-02' + mean: '4.181e-05' + min: '-2.429e-02' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '2.74e+00' +grads.network.layer3.0.bn1.bias: + device: cuda:0 + max: '1.159e-02' + mean: '6.957e-05' + min: '-1.154e-02' + shape: + - 256 + sum: '1.781e-02' +grads.network.layer3.0.bn1.weight: + device: cuda:0 + max: '1.38e-02' + mean: '-4.657e-10' + min: '-1.321e-02' + shape: + - 256 + sum: '-1.192e-07' +grads.network.layer3.0.bn2.bias: + device: cuda:0 + max: '1.036e-02' + mean: '1.608e-04' + min: '-1.092e-02' + shape: + - 256 + sum: '4.116e-02' +grads.network.layer3.0.bn2.weight: + device: cuda:0 + max: '1.286e-02' + mean: '-9.262e-09' + min: '-1.329e-02' + shape: + - 256 + sum: '-2.371e-06' +grads.network.layer3.0.bn3.bias: + device: cuda:0 + max: '4.818e-03' + mean: '1.895e-05' + min: '-4.491e-03' + shape: + - 1024 + sum: '1.940e-02' +grads.network.layer3.0.bn3.weight: + device: cuda:0 + max: '6.393e-03' + mean: '-5.269e-05' + min: '-5.746e-03' + shape: + - 1024 + sum: '-5.396e-02' +grads.network.layer3.0.conv1.weight: + device: cuda:0 + max: '1.654e-02' + mean: '-4.966e-05' + min: '-1.824e-02' + shape: + - 256 + - 512 + - 1 + - 1 + sum: '-6.51e+00' +grads.network.layer3.0.conv2.weight: + device: cuda:0 + max: '1.841e-02' + mean: '-1.719e-05' + min: '-1.882e-02' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.014e+01' +grads.network.layer3.0.conv3.weight: + device: cuda:0 + max: '1.641e-02' + mean: '-2.978e-05' + min: '-1.824e-02' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-7.806e+00' +grads.network.layer3.0.downsample.0.weight: + device: cuda:0 + max: '1.271e-02' + mean: '-2.944e-05' + min: '-1.281e-02' + shape: + - 1024 + - 512 + - 1 + - 1 + sum: '-1.544e+01' +grads.network.layer3.0.downsample.1.bias: + device: cuda:0 + max: '4.818e-03' + mean: '1.895e-05' + min: '-4.491e-03' + shape: + - 1024 + sum: '1.940e-02' +grads.network.layer3.0.downsample.1.weight: + device: cuda:0 + max: '7.039e-03' + mean: '-1.403e-05' + min: '-5.472e-03' + shape: + - 1024 + sum: '-1.437e-02' +grads.network.layer3.1.bn1.bias: + device: cuda:0 + max: '1.027e-02' + mean: 
'-7.899e-05' + min: '-7.042e-03' + shape: + - 256 + sum: '-2.022e-02' +grads.network.layer3.1.bn1.weight: + device: cuda:0 + max: '9.592e-03' + mean: '-1.186e-09' + min: '-9.877e-03' + shape: + - 256 + sum: '-3.036e-07' +grads.network.layer3.1.bn2.bias: + device: cuda:0 + max: '5.802e-03' + mean: '-1.144e-04' + min: '-6.516e-03' + shape: + - 256 + sum: '-2.929e-02' +grads.network.layer3.1.bn2.weight: + device: cuda:0 + max: '7.174e-03' + mean: '1.312e-08' + min: '-7.594e-03' + shape: + - 256 + sum: '3.359e-06' +grads.network.layer3.1.bn3.bias: + device: cuda:0 + max: '2.986e-03' + mean: '-8.18e-06' + min: '-3.319e-03' + shape: + - 1024 + sum: '-8.376e-03' +grads.network.layer3.1.bn3.weight: + device: cuda:0 + max: '4.028e-03' + mean: '6.062e-05' + min: '-3.991e-03' + shape: + - 1024 + sum: '6.207e-02' +grads.network.layer3.1.conv1.weight: + device: cuda:0 + max: '8.729e-03' + mean: '-2.166e-05' + min: '-7.953e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-5.678e+00' +grads.network.layer3.1.conv2.weight: + device: cuda:0 + max: '1.39e-02' + mean: '-2.612e-05' + min: '-1.387e-02' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.541e+01' +grads.network.layer3.1.conv3.weight: + device: cuda:0 + max: '1.024e-02' + mean: '-1.092e-05' + min: '-1.074e-02' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-2.863e+00' +grads.network.layer3.2.bn1.bias: + device: cuda:0 + max: '7.474e-03' + mean: '1.205e-04' + min: '-6.481e-03' + shape: + - 256 + sum: '3.085e-02' +grads.network.layer3.2.bn1.weight: + device: cuda:0 + max: '9.865e-03' + mean: '-9.313e-10' + min: '-7.930e-03' + shape: + - 256 + sum: '-2.384e-07' +grads.network.layer3.2.bn2.bias: + device: cuda:0 + max: '5.072e-03' + mean: '1.298e-04' + min: '-4.838e-03' + shape: + - 256 + sum: '3.323e-02' +grads.network.layer3.2.bn2.weight: + device: cuda:0 + max: '6.424e-03' + mean: '9.468e-09' + min: '-5.991e-03' + shape: + - 256 + sum: '2.424e-06' +grads.network.layer3.2.bn3.bias: + device: cuda:0 + max: '1.696e-03' + mean: '2.526e-05' + min: '-1.766e-03' + shape: + - 1024 + sum: '2.587e-02' +grads.network.layer3.2.bn3.weight: + device: cuda:0 + max: '3.010e-03' + mean: '3.859e-05' + min: '-2.832e-03' + shape: + - 1024 + sum: '3.952e-02' +grads.network.layer3.2.conv1.weight: + device: cuda:0 + max: '6.116e-03' + mean: '-1.069e-05' + min: '-6.560e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-2.802e+00' +grads.network.layer3.2.conv2.weight: + device: cuda:0 + max: '9.867e-03' + mean: '-6.347e-06' + min: '-9.511e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-3.744e+00' +grads.network.layer3.2.conv3.weight: + device: cuda:0 + max: '7.406e-03' + mean: '-2.159e-05' + min: '-7.51e-03' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-5.66e+00' +grads.network.layer3.3.bn1.bias: + device: cuda:0 + max: '3.839e-03' + mean: '4.194e-05' + min: '-4.033e-03' + shape: + - 256 + sum: '1.074e-02' +grads.network.layer3.3.bn1.weight: + device: cuda:0 + max: '5.956e-03' + mean: '1.382e-10' + min: '-5.073e-03' + shape: + - 256 + sum: '3.539e-08' +grads.network.layer3.3.bn2.bias: + device: cuda:0 + max: '4.210e-03' + mean: '3.714e-05' + min: '-3.497e-03' + shape: + - 256 + sum: '9.507e-03' +grads.network.layer3.3.bn2.weight: + device: cuda:0 + max: '4.847e-03' + mean: '-6.614e-09' + min: '-4.154e-03' + shape: + - 256 + sum: '-1.693e-06' +grads.network.layer3.3.bn3.bias: + device: cuda:0 + max: '1.448e-03' + mean: '1.18e-05' + min: '-1.585e-03' + shape: + - 1024 + sum: '1.208e-02' +grads.network.layer3.3.bn3.weight: + device: cuda:0 + max: '2.472e-03' + mean: 
'-3.084e-05' + min: '-2.461e-03' + shape: + - 1024 + sum: '-3.158e-02' +grads.network.layer3.3.conv1.weight: + device: cuda:0 + max: '4.561e-03' + mean: '-1.505e-06' + min: '-4.213e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-3.946e-01' +grads.network.layer3.3.conv2.weight: + device: cuda:0 + max: '7.155e-03' + mean: '-1.727e-05' + min: '-7.462e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.019e+01' +grads.network.layer3.3.conv3.weight: + device: cuda:0 + max: '7.199e-03' + mean: '-1.848e-05' + min: '-6.481e-03' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-4.844e+00' +grads.network.layer3.4.bn1.bias: + device: cuda:0 + max: '3.403e-03' + mean: '2.286e-05' + min: '-3.422e-03' + shape: + - 256 + sum: '5.853e-03' +grads.network.layer3.4.bn1.weight: + device: cuda:0 + max: '3.392e-03' + mean: '7.512e-10' + min: '-4.168e-03' + shape: + - 256 + sum: '1.923e-07' +grads.network.layer3.4.bn2.bias: + device: cuda:0 + max: '2.511e-03' + mean: '5.277e-05' + min: '-3.381e-03' + shape: + - 256 + sum: '1.351e-02' +grads.network.layer3.4.bn2.weight: + device: cuda:0 + max: '4.038e-03' + mean: '3.572e-09' + min: '-3.609e-03' + shape: + - 256 + sum: '9.146e-07' +grads.network.layer3.4.bn3.bias: + device: cuda:0 + max: '1.408e-03' + mean: '1.227e-05' + min: '-8.456e-04' + shape: + - 1024 + sum: '1.256e-02' +grads.network.layer3.4.bn3.weight: + device: cuda:0 + max: '1.611e-03' + mean: '1.336e-05' + min: '-1.889e-03' + shape: + - 1024 + sum: '1.368e-02' +grads.network.layer3.4.conv1.weight: + device: cuda:0 + max: '3.532e-03' + mean: '-8.469e-06' + min: '-4.099e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-2.220e+00' +grads.network.layer3.4.conv2.weight: + device: cuda:0 + max: '5.658e-03' + mean: '-1.714e-05' + min: '-5.384e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.011e+01' +grads.network.layer3.4.conv3.weight: + device: cuda:0 + max: '4.909e-03' + mean: '-1.151e-05' + min: '-4.874e-03' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-3.016e+00' +grads.network.layer3.5.bn1.bias: + device: cuda:0 + max: '2.425e-03' + mean: '-1.526e-05' + min: '-2.448e-03' + shape: + - 256 + sum: '-3.906e-03' +grads.network.layer3.5.bn1.weight: + device: cuda:0 + max: '3.617e-03' + mean: '7.203e-10' + min: '-2.678e-03' + shape: + - 256 + sum: '1.844e-07' +grads.network.layer3.5.bn2.bias: + device: cuda:0 + max: '2.354e-03' + mean: '5.188e-05' + min: '-3.471e-03' + shape: + - 256 + sum: '1.328e-02' +grads.network.layer3.5.bn2.weight: + device: cuda:0 + max: '2.992e-03' + mean: '-3.147e-09' + min: '-2.420e-03' + shape: + - 256 + sum: '-8.056e-07' +grads.network.layer3.5.bn3.bias: + device: cuda:0 + max: '6.43e-04' + mean: '8.147e-06' + min: '-6.512e-04' + shape: + - 1024 + sum: '8.342e-03' +grads.network.layer3.5.bn3.weight: + device: cuda:0 + max: '1.439e-03' + mean: '-1.501e-05' + min: '-1.433e-03' + shape: + - 1024 + sum: '-1.537e-02' +grads.network.layer3.5.conv1.weight: + device: cuda:0 + max: '2.588e-03' + mean: '-1.225e-05' + min: '-3.101e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-3.211e+00' +grads.network.layer3.5.conv2.weight: + device: cuda:0 + max: '4.908e-03' + mean: '-1.443e-05' + min: '-4.324e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-8.509e+00' +grads.network.layer3.5.conv3.weight: + device: cuda:0 + max: '4.695e-03' + mean: '-1.048e-05' + min: '-4.000e-03' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-2.746e+00' +grads.network.layer4.0.bn1.bias: + device: cuda:0 + max: '2.172e-03' + mean: '-1.531e-06' + min: '-2.475e-03' + shape: + - 512 + sum: '-7.838e-04' 
+grads.network.layer4.0.bn1.weight: + device: cuda:0 + max: '2.885e-03' + mean: '1.164e-10' + min: '-3.367e-03' + shape: + - 512 + sum: '5.960e-08' +grads.network.layer4.0.bn2.bias: + device: cuda:0 + max: '1.743e-03' + mean: '4.506e-05' + min: '-1.865e-03' + shape: + - 512 + sum: '2.307e-02' +grads.network.layer4.0.bn2.weight: + device: cuda:0 + max: '2.32e-03' + mean: '1.145e-08' + min: '-3.617e-03' + shape: + - 512 + sum: '5.864e-06' +grads.network.layer4.0.bn3.bias: + device: cuda:0 + max: '2.545e-03' + mean: '8.033e-05' + min: '-2.183e-03' + shape: + - 2048 + sum: '1.645e-01' +grads.network.layer4.0.bn3.weight: + device: cuda:0 + max: '2.965e-03' + mean: '4.471e-05' + min: '-2.004e-03' + shape: + - 2048 + sum: '9.156e-02' +grads.network.layer4.0.conv1.weight: + device: cuda:0 + max: '3.048e-03' + mean: '-1.777e-05' + min: '-2.91e-03' + shape: + - 512 + - 1024 + - 1 + - 1 + sum: '-9.317e+00' +grads.network.layer4.0.conv2.weight: + device: cuda:0 + max: '4.142e-03' + mean: '-8.243e-06' + min: '-3.973e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-1.945e+01' +grads.network.layer4.0.conv3.weight: + device: cuda:0 + max: '3.856e-03' + mean: '-4.106e-06' + min: '-4.645e-03' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '-4.306e+00' +grads.network.layer4.0.downsample.0.weight: + device: cuda:0 + max: '3.427e-03' + mean: '1.003e-06' + min: '-3.696e-03' + shape: + - 2048 + - 1024 + - 1 + - 1 + sum: '2.104e+00' +grads.network.layer4.0.downsample.1.bias: + device: cuda:0 + max: '2.545e-03' + mean: '8.033e-05' + min: '-2.183e-03' + shape: + - 2048 + sum: '1.645e-01' +grads.network.layer4.0.downsample.1.weight: + device: cuda:0 + max: '2.177e-03' + mean: '3.785e-05' + min: '-2.256e-03' + shape: + - 2048 + sum: '7.751e-02' +grads.network.layer4.1.bn1.bias: + device: cuda:0 + max: '1.501e-03' + mean: '2.144e-05' + min: '-1.368e-03' + shape: + - 512 + sum: '1.098e-02' +grads.network.layer4.1.bn1.weight: + device: cuda:0 + max: '2.379e-03' + mean: '7.913e-11' + min: '-2.5e-03' + shape: + - 512 + sum: '4.051e-08' +grads.network.layer4.1.bn2.bias: + device: cuda:0 + max: '1.778e-03' + mean: '4.209e-05' + min: '-1.812e-03' + shape: + - 512 + sum: '2.155e-02' +grads.network.layer4.1.bn2.weight: + device: cuda:0 + max: '2.058e-03' + mean: '1.25e-08' + min: '-2.322e-03' + shape: + - 512 + sum: '6.399e-06' +grads.network.layer4.1.bn3.bias: + device: cuda:0 + max: '2.914e-03' + mean: '1.136e-04' + min: '-3.222e-03' + shape: + - 2048 + sum: '2.327e-01' +grads.network.layer4.1.bn3.weight: + device: cuda:0 + max: '2.364e-03' + mean: '5.421e-05' + min: '-2.150e-03' + shape: + - 2048 + sum: '1.110e-01' +grads.network.layer4.1.conv1.weight: + device: cuda:0 + max: '1.885e-03' + mean: '-2.997e-06' + min: '-1.927e-03' + shape: + - 512 + - 2048 + - 1 + - 1 + sum: '-3.143e+00' +grads.network.layer4.1.conv2.weight: + device: cuda:0 + max: '3.744e-03' + mean: '-1.002e-05' + min: '-3.811e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-2.364e+01' +grads.network.layer4.1.conv3.weight: + device: cuda:0 + max: '5.011e-03' + mean: '2.916e-07' + min: '-3.704e-03' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '3.058e-01' +grads.network.layer4.2.bn1.bias: + device: cuda:0 + max: '1.331e-03' + mean: '2.21e-05' + min: '-1.425e-03' + shape: + - 512 + sum: '1.131e-02' +grads.network.layer4.2.bn1.weight: + device: cuda:0 + max: '2.19e-03' + mean: '2.183e-10' + min: '-2.435e-03' + shape: + - 512 + sum: '1.118e-07' +grads.network.layer4.2.bn2.bias: + device: cuda:0 + max: '1.404e-03' + mean: '9.475e-06' + min: '-1.412e-03' + shape: 
+ - 512 + sum: '4.851e-03' +grads.network.layer4.2.bn2.weight: + device: cuda:0 + max: '3.054e-03' + mean: '1.17e-08' + min: '-2.907e-03' + shape: + - 512 + sum: '5.990e-06' +grads.network.layer4.2.bn3.bias: + device: cuda:0 + max: '4.169e-03' + mean: '1.393e-04' + min: '-4.317e-03' + shape: + - 2048 + sum: '2.852e-01' +grads.network.layer4.2.bn3.weight: + device: cuda:0 + max: '2.599e-03' + mean: '5.148e-05' + min: '-1.775e-03' + shape: + - 2048 + sum: '1.054e-01' +grads.network.layer4.2.conv1.weight: + device: cuda:0 + max: '1.832e-03' + mean: '-4.348e-06' + min: '-1.785e-03' + shape: + - 512 + - 2048 + - 1 + - 1 + sum: '-4.559e+00' +grads.network.layer4.2.conv2.weight: + device: cuda:0 + max: '4.026e-03' + mean: '4.673e-06' + min: '-3.410e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '1.102e+01' +grads.network.layer4.2.conv3.weight: + device: cuda:0 + max: '4.736e-03' + mean: '-5.085e-06' + min: '-4.618e-03' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '-5.332e+00' +outputs.logits: + device: cuda:0 + max: '4.058e+00' + mean: '1.188e-02' + min: '-4.237e+00' + shape: + - 64 + - 1000 + sum: '7.600e+02' +outputs.loss: + device: cuda:0 + max: '7.112e+00' + mean: '7.112e+00' + min: '7.112e+00' + shape: [] + sum: '7.112e+00' +outputs.y: + device: cuda:0 + max: 988 + mean: '5.182e+02' + min: 0 + shape: + - 64 + sum: 33166 diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_fashion_mnist_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_fashion_mnist_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_mnist_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_mnist_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/resnet18_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/resnet18_cifar10_example.yaml rename to 
.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml diff --git a/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml new file mode 100644 index 00000000..071379c4 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.640e+00' + mean: '-6.663e-02' + min: '-2.118e+00' + shape: + - 64 + - 3 + - 224 + - 224 + sum: '-6.419e+05' +out: + device: cuda:0 + max: '2.934e+00' + mean: '-8.071e-04' + min: '-2.896e+00' + shape: + - 64 + - 1000 + sum: '-5.165e+01' diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/resnet50_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/resnet50_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml diff --git a/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml new file mode 100644 index 00000000..bfd8d4f6 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.640e+00' + mean: '-6.663e-02' + min: '-2.118e+00' + shape: + - 64 + - 3 + - 224 + - 224 + sum: '-6.419e+05' +out: + device: cuda:0 + max: '4.058e+00' + mean: '1.188e-02' + min: '-4.237e+00' + shape: + - 64 + - 1000 + sum: '7.600e+02' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml new file mode 100644 index 00000000..1018428b --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml @@ -0,0 +1,51 @@ +network.0.1.bias: + device: cuda:0 + max: '1.801e-02' + mean: '1.029e-03' + min: '-1.784e-02' + shape: + - 128 + sum: '1.317e-01' +network.0.1.weight: + device: cuda:0 + max: '1.804e-02' + mean: '1.616e-05' + min: '-1.804e-02' + shape: + - 128 + - 3072 + sum: '6.354e+00' +network.1.0.bias: + device: cuda:0 + max: '8.781e-02' + mean: '4.829e-04' + min: '-8.787e-02' + shape: + - 128 + sum: '6.181e-02' +network.1.0.weight: + device: cuda:0 + max: '8.837e-02' + mean: '-9.613e-04' + min: '-8.837e-02' + shape: + - 128 + - 128 + sum: '-1.575e+01' +network.2.0.bias: + device: cuda:0 + max: '8.495e-02' + mean: '-9.068e-04' + min: '-8.834e-02' + shape: + - 10 + sum: '-9.068e-03' +network.2.0.weight: + device: cuda:0 + max: '8.826e-02' + mean: '-3.724e-04' + min: '-8.834e-02' 
+ shape: + - 10 + - 128 + sum: '-4.767e-01' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml new file mode 100644 index 00000000..c85a5f80 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml @@ -0,0 +1,51 @@ +network.0.1.bias: + device: cuda:0 + max: '3.530e-02' + mean: '1.341e-03' + min: '-3.541e-02' + shape: + - 128 + sum: '1.716e-01' +network.0.1.weight: + device: cuda:0 + max: '3.571e-02' + mean: '9.349e-05' + min: '-3.571e-02' + shape: + - 128 + - 784 + sum: '9.382e+00' +network.1.0.bias: + device: cuda:0 + max: '8.268e-02' + mean: '-6.752e-03' + min: '-8.591e-02' + shape: + - 128 + sum: '-8.642e-01' +network.1.0.weight: + device: cuda:0 + max: '8.837e-02' + mean: '1.286e-04' + min: '-8.838e-02' + shape: + - 128 + - 128 + sum: '2.107e+00' +network.2.0.bias: + device: cuda:0 + max: '4.038e-02' + mean: '-3.545e-02' + min: '-7.938e-02' + shape: + - 10 + sum: '-3.545e-01' +network.2.0.weight: + device: cuda:0 + max: '8.829e-02' + mean: '-5.307e-04' + min: '-8.835e-02' + shape: + - 10 + - 128 + sum: '-6.793e-01' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml new file mode 100644 index 00000000..c85a5f80 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml @@ -0,0 +1,51 @@ +network.0.1.bias: + device: cuda:0 + max: '3.530e-02' + mean: '1.341e-03' + min: '-3.541e-02' + shape: + - 128 + sum: '1.716e-01' +network.0.1.weight: + device: cuda:0 + max: '3.571e-02' + mean: '9.349e-05' + min: '-3.571e-02' + shape: + - 128 + - 784 + sum: '9.382e+00' +network.1.0.bias: + device: cuda:0 + max: '8.268e-02' + mean: '-6.752e-03' + min: '-8.591e-02' + shape: + - 128 + sum: '-8.642e-01' +network.1.0.weight: + device: cuda:0 + max: '8.837e-02' + mean: '1.286e-04' + min: '-8.838e-02' + shape: + - 128 + - 128 + sum: '2.107e+00' +network.2.0.bias: + device: cuda:0 + max: '4.038e-02' + mean: '-3.545e-02' + min: '-7.938e-02' + shape: + - 10 + sum: '-3.545e-01' +network.2.0.weight: + device: cuda:0 + max: '8.829e-02' + mean: '-5.307e-04' + min: '-8.835e-02' + shape: + - 10 + - 128 + sum: '-6.793e-01' diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet18_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml similarity index 76% rename from .regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet18_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml index ba0cad92..61ccf18e 100644 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet18_cifar10_example.yaml +++ 
b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ network.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -7,14 +7,14 @@ network.bn1.bias: - 64 sum: '0.e+00' network.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -22,7 +22,7 @@ network.bn1.running_mean: - 64 sum: '0.e+00' network.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -30,7 +30,7 @@ network.bn1.running_var: - 64 sum: '6.4e+01' network.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -38,35 +38,35 @@ network.bn1.weight: - 64 sum: '6.4e+01' network.conv1.weight: - device: cpu - max: '1.098e-01' - mean: '1.139e-04' - min: '-8.341e-02' + device: cuda:0 + max: '8.688e-02' + mean: '5.299e-04' + min: '-9.862e-02' shape: - 64 - 3 - 7 - 7 - sum: '1.072e+00' + sum: '4.986e+00' network.fc.bias: - device: cpu - max: '3.715e-02' - mean: '-1.094e-02' - min: '-3.341e-02' + device: cuda:0 + max: '4.314e-02' + mean: '2.057e-04' + min: '-3.14e-02' shape: - 10 - sum: '-1.094e-01' + sum: '2.057e-03' network.fc.weight: - device: cpu + device: cuda:0 max: '4.418e-02' - mean: '-4.792e-04' - min: '-4.418e-02' + mean: '1.848e-04' + min: '-4.414e-02' shape: - 10 - 512 - sum: '-2.454e+00' + sum: '9.461e-01' network.layer1.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -74,14 +74,14 @@ network.layer1.0.bn1.bias: - 64 sum: '0.e+00' network.layer1.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -89,7 +89,7 @@ network.layer1.0.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -97,7 +97,7 @@ network.layer1.0.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -105,7 +105,7 @@ network.layer1.0.bn1.weight: - 64 sum: '6.4e+01' network.layer1.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -113,14 +113,14 @@ network.layer1.0.bn2.bias: - 64 sum: '0.e+00' network.layer1.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -128,7 +128,7 @@ network.layer1.0.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -136,7 +136,7 @@ network.layer1.0.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -144,29 +144,29 @@ network.layer1.0.bn2.weight: - 64 sum: '6.4e+01' network.layer1.0.conv1.weight: - device: cpu - max: '2.499e-01' - mean: '2.448e-04' - min: '-2.519e-01' + device: cuda:0 + max: '2.433e-01' + mean: '1.396e-04' + min: '-2.501e-01' shape: - 64 - 64 - 3 - 3 - sum: '9.024e+00' + sum: '5.148e+00' network.layer1.0.conv2.weight: - device: cpu - max: '2.35e-01' - mean: '-2.816e-04' - min: '-2.581e-01' + device: 
cuda:0 + max: '2.442e-01' + mean: '1.259e-04' + min: '-2.666e-01' shape: - 64 - 64 - 3 - 3 - sum: '-1.038e+01' + sum: '4.642e+00' network.layer1.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -174,14 +174,14 @@ network.layer1.1.bn1.bias: - 64 sum: '0.e+00' network.layer1.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -189,7 +189,7 @@ network.layer1.1.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -197,7 +197,7 @@ network.layer1.1.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -205,7 +205,7 @@ network.layer1.1.bn1.weight: - 64 sum: '6.4e+01' network.layer1.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -213,14 +213,14 @@ network.layer1.1.bn2.bias: - 64 sum: '0.e+00' network.layer1.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -228,7 +228,7 @@ network.layer1.1.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -236,7 +236,7 @@ network.layer1.1.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -244,29 +244,29 @@ network.layer1.1.bn2.weight: - 64 sum: '6.4e+01' network.layer1.1.conv1.weight: - device: cpu - max: '2.130e-01' - mean: '-9.64e-05' - min: '-2.213e-01' + device: cuda:0 + max: '2.456e-01' + mean: '1.807e-04' + min: '-2.376e-01' shape: - 64 - 64 - 3 - 3 - sum: '-3.554e+00' + sum: '6.660e+00' network.layer1.1.conv2.weight: - device: cpu - max: '2.414e-01' - mean: '1.006e-04' - min: '-2.212e-01' + device: cuda:0 + max: '2.338e-01' + mean: '-3.408e-04' + min: '-2.402e-01' shape: - 64 - 64 - 3 - 3 - sum: '3.709e+00' + sum: '-1.256e+01' network.layer2.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -274,14 +274,14 @@ network.layer2.0.bn1.bias: - 128 sum: '0.e+00' network.layer2.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -289,7 +289,7 @@ network.layer2.0.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -297,7 +297,7 @@ network.layer2.0.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -305,7 +305,7 @@ network.layer2.0.bn1.weight: - 128 sum: '1.28e+02' network.layer2.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -313,14 +313,14 @@ network.layer2.0.bn2.bias: - 128 sum: '0.e+00' network.layer2.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -328,7 +328,7 @@ network.layer2.0.bn2.running_mean: 
- 128 sum: '0.e+00' network.layer2.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -336,7 +336,7 @@ network.layer2.0.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -344,40 +344,40 @@ network.layer2.0.bn2.weight: - 128 sum: '1.28e+02' network.layer2.0.conv1.weight: - device: cpu - max: '1.781e-01' - mean: '-2.81e-04' - min: '-1.729e-01' + device: cuda:0 + max: '1.681e-01' + mean: '2.319e-04' + min: '-1.830e-01' shape: - 128 - 64 - 3 - 3 - sum: '-2.072e+01' + sum: '1.71e+01' network.layer2.0.conv2.weight: - device: cpu - max: '1.949e-01' - mean: '-2.364e-04' - min: '-1.890e-01' + device: cuda:0 + max: '2.008e-01' + mean: '-6.267e-05' + min: '-1.870e-01' shape: - 128 - 128 - 3 - 3 - sum: '-3.485e+01' + sum: '-9.240e+00' network.layer2.0.downsample.0.weight: - device: cpu - max: '5.532e-01' - mean: '2.595e-04' - min: '-4.129e-01' + device: cuda:0 + max: '5.180e-01' + mean: '-2.705e-03' + min: '-5.316e-01' shape: - 128 - 64 - 1 - 1 - sum: '2.126e+00' + sum: '-2.216e+01' network.layer2.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -385,14 +385,14 @@ network.layer2.0.downsample.1.bias: - 128 sum: '0.e+00' network.layer2.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -400,7 +400,7 @@ network.layer2.0.downsample.1.running_mean: - 128 sum: '0.e+00' network.layer2.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -408,7 +408,7 @@ network.layer2.0.downsample.1.running_var: - 128 sum: '1.28e+02' network.layer2.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -416,7 +416,7 @@ network.layer2.0.downsample.1.weight: - 128 sum: '1.28e+02' network.layer2.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -424,14 +424,14 @@ network.layer2.1.bn1.bias: - 128 sum: '0.e+00' network.layer2.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -439,7 +439,7 @@ network.layer2.1.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -447,7 +447,7 @@ network.layer2.1.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -455,7 +455,7 @@ network.layer2.1.bn1.weight: - 128 sum: '1.28e+02' network.layer2.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -463,14 +463,14 @@ network.layer2.1.bn2.bias: - 128 sum: '0.e+00' network.layer2.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -478,7 +478,7 @@ network.layer2.1.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -486,7 +486,7 @@ network.layer2.1.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.1.bn2.weight: 
- device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -494,29 +494,29 @@ network.layer2.1.bn2.weight: - 128 sum: '1.28e+02' network.layer2.1.conv1.weight: - device: cpu - max: '1.921e-01' - mean: '3.336e-05' - min: '-1.785e-01' + device: cuda:0 + max: '1.750e-01' + mean: '7.981e-05' + min: '-1.909e-01' shape: - 128 - 128 - 3 - 3 - sum: '4.92e+00' + sum: '1.177e+01' network.layer2.1.conv2.weight: - device: cpu - max: '1.825e-01' - mean: '-3.207e-05' - min: '-1.989e-01' + device: cuda:0 + max: '1.714e-01' + mean: '6.508e-05' + min: '-1.811e-01' shape: - 128 - 128 - 3 - 3 - sum: '-4.729e+00' + sum: '9.597e+00' network.layer3.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -524,14 +524,14 @@ network.layer3.0.bn1.bias: - 256 sum: '0.e+00' network.layer3.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -539,7 +539,7 @@ network.layer3.0.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -547,7 +547,7 @@ network.layer3.0.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -555,7 +555,7 @@ network.layer3.0.bn1.weight: - 256 sum: '2.56e+02' network.layer3.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -563,14 +563,14 @@ network.layer3.0.bn2.bias: - 256 sum: '0.e+00' network.layer3.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -578,7 +578,7 @@ network.layer3.0.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -586,7 +586,7 @@ network.layer3.0.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -594,40 +594,40 @@ network.layer3.0.bn2.weight: - 256 sum: '2.56e+02' network.layer3.0.conv1.weight: - device: cpu - max: '1.418e-01' - mean: '4.759e-05' - min: '-1.425e-01' + device: cuda:0 + max: '1.186e-01' + mean: '-5.228e-06' + min: '-1.308e-01' shape: - 256 - 128 - 3 - 3 - sum: '1.403e+01' + sum: '-1.542e+00' network.layer3.0.conv2.weight: - device: cpu - max: '1.464e-01' - mean: '3.416e-05' - min: '-1.367e-01' + device: cuda:0 + max: '1.360e-01' + mean: '-1.566e-05' + min: '-1.442e-01' shape: - 256 - 256 - 3 - 3 - sum: '2.015e+01' + sum: '-9.235e+00' network.layer3.0.downsample.0.weight: - device: cpu - max: '3.724e-01' - mean: '-3.193e-04' - min: '-4.37e-01' + device: cuda:0 + max: '4.034e-01' + mean: '-7.003e-06' + min: '-3.510e-01' shape: - 256 - 128 - 1 - 1 - sum: '-1.046e+01' + sum: '-2.295e-01' network.layer3.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -635,14 +635,14 @@ network.layer3.0.downsample.1.bias: - 256 sum: '0.e+00' network.layer3.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -650,7 +650,7 @@ network.layer3.0.downsample.1.running_mean: - 256 
sum: '0.e+00' network.layer3.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -658,7 +658,7 @@ network.layer3.0.downsample.1.running_var: - 256 sum: '2.56e+02' network.layer3.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -666,7 +666,7 @@ network.layer3.0.downsample.1.weight: - 256 sum: '2.56e+02' network.layer3.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -674,14 +674,14 @@ network.layer3.1.bn1.bias: - 256 sum: '0.e+00' network.layer3.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -689,7 +689,7 @@ network.layer3.1.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -697,7 +697,7 @@ network.layer3.1.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -705,7 +705,7 @@ network.layer3.1.bn1.weight: - 256 sum: '2.56e+02' network.layer3.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -713,14 +713,14 @@ network.layer3.1.bn2.bias: - 256 sum: '0.e+00' network.layer3.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -728,7 +728,7 @@ network.layer3.1.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -736,7 +736,7 @@ network.layer3.1.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -744,29 +744,29 @@ network.layer3.1.bn2.weight: - 256 sum: '2.56e+02' network.layer3.1.conv1.weight: - device: cpu - max: '1.478e-01' - mean: '-4.980e-05' - min: '-1.411e-01' + device: cuda:0 + max: '1.435e-01' + mean: '1.374e-05' + min: '-1.476e-01' shape: - 256 - 256 - 3 - 3 - sum: '-2.938e+01' + sum: '8.106e+00' network.layer3.1.conv2.weight: - device: cpu - max: '1.369e-01' - mean: '-3.677e-05' - min: '-1.348e-01' + device: cuda:0 + max: '1.273e-01' + mean: '8.978e-05' + min: '-1.346e-01' shape: - 256 - 256 - 3 - 3 - sum: '-2.169e+01' + sum: '5.295e+01' network.layer4.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -774,14 +774,14 @@ network.layer4.0.bn1.bias: - 512 sum: '0.e+00' network.layer4.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -789,7 +789,7 @@ network.layer4.0.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -797,7 +797,7 @@ network.layer4.0.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -805,7 +805,7 @@ network.layer4.0.bn1.weight: - 512 sum: '5.12e+02' network.layer4.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -813,14 +813,14 @@ network.layer4.0.bn2.bias: - 512 
sum: '0.e+00' network.layer4.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -828,7 +828,7 @@ network.layer4.0.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -836,7 +836,7 @@ network.layer4.0.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -844,40 +844,40 @@ network.layer4.0.bn2.weight: - 512 sum: '5.12e+02' network.layer4.0.conv1.weight: - device: cpu - max: '9.989e-02' - mean: '-7.283e-06' - min: '-1.006e-01' + device: cuda:0 + max: '1.020e-01' + mean: '-2.986e-06' + min: '-1.011e-01' shape: - 512 - 256 - 3 - 3 - sum: '-8.591e+00' + sum: '-3.522e+00' network.layer4.0.conv2.weight: - device: cpu - max: '1.023e-01' - mean: '2.838e-06' - min: '-1.135e-01' + device: cuda:0 + max: '1.049e-01' + mean: '-2.121e-05' + min: '-1.011e-01' shape: - 512 - 512 - 3 - 3 - sum: '6.696e+00' + sum: '-5.004e+01' network.layer4.0.downsample.0.weight: - device: cpu - max: '2.664e-01' - mean: '1.458e-04' - min: '-2.861e-01' + device: cuda:0 + max: '2.638e-01' + mean: '-1.538e-05' + min: '-2.893e-01' shape: - 512 - 256 - 1 - 1 - sum: '1.911e+01' + sum: '-2.016e+00' network.layer4.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -885,14 +885,14 @@ network.layer4.0.downsample.1.bias: - 512 sum: '0.e+00' network.layer4.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -900,7 +900,7 @@ network.layer4.0.downsample.1.running_mean: - 512 sum: '0.e+00' network.layer4.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -908,7 +908,7 @@ network.layer4.0.downsample.1.running_var: - 512 sum: '5.12e+02' network.layer4.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -916,7 +916,7 @@ network.layer4.0.downsample.1.weight: - 512 sum: '5.12e+02' network.layer4.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -924,14 +924,14 @@ network.layer4.1.bn1.bias: - 512 sum: '0.e+00' network.layer4.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -939,7 +939,7 @@ network.layer4.1.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -947,7 +947,7 @@ network.layer4.1.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -955,7 +955,7 @@ network.layer4.1.bn1.weight: - 512 sum: '5.12e+02' network.layer4.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -963,14 +963,14 @@ network.layer4.1.bn2.bias: - 512 sum: '0.e+00' network.layer4.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' 
min: '0.e+00' @@ -978,7 +978,7 @@ network.layer4.1.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -986,7 +986,7 @@ network.layer4.1.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -994,24 +994,24 @@ network.layer4.1.bn2.weight: - 512 sum: '5.12e+02' network.layer4.1.conv1.weight: - device: cpu - max: '1.172e-01' - mean: '-1.526e-05' - min: '-1.015e-01' + device: cuda:0 + max: '1.056e-01' + mean: '4.031e-06' + min: '-1.011e-01' shape: - 512 - 512 - 3 - 3 - sum: '-3.601e+01' + sum: '9.511e+00' network.layer4.1.conv2.weight: - device: cpu - max: '9.908e-02' - mean: '8.558e-06' - min: '-1.071e-01' + device: cuda:0 + max: '1.072e-01' + mean: '-1.993e-05' + min: '-9.954e-02' shape: - 512 - 512 - 3 - 3 - sum: '2.019e+01' + sum: '-4.701e+01' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml new file mode 100644 index 00000000..a3a1a99d --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml @@ -0,0 +1,1017 @@ +network.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.conv1.weight: + device: cuda:0 + max: '9.327e-02' + mean: '4.984e-04' + min: '-1.072e-01' + shape: + - 64 + - 3 + - 7 + - 7 + sum: '4.689e+00' +network.fc.bias: + device: cuda:0 + max: '4.419e-02' + mean: '1.212e-06' + min: '-4.419e-02' + shape: + - 1000 + sum: '1.212e-03' +network.fc.weight: + device: cuda:0 + max: '4.419e-02' + mean: '-6.997e-07' + min: '-4.419e-02' + shape: + - 1000 + - 512 + sum: '-3.583e-01' +network.layer1.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn2.running_var: + device: cuda:0 + 
max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.conv1.weight: + device: cuda:0 + max: '2.442e-01' + mean: '1.259e-04' + min: '-2.666e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '4.642e+00' +network.layer1.0.conv2.weight: + device: cuda:0 + max: '2.456e-01' + mean: '1.807e-04' + min: '-2.376e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '6.660e+00' +network.layer1.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.conv1.weight: + device: cuda:0 + max: '2.338e-01' + mean: '-3.408e-04' + min: '-2.402e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-1.256e+01' +network.layer1.1.conv2.weight: + device: cuda:0 + max: '2.224e-01' + mean: '2.189e-04' + min: '-2.588e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '8.07e+00' +network.layer2.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.conv1.weight: + device: cuda:0 + max: '2.008e-01' + mean: '8.513e-05' + min: '-1.854e-01' + shape: + - 128 + - 
64 + - 3 + - 3 + sum: '6.276e+00' +network.layer2.0.conv2.weight: + device: cuda:0 + max: '1.766e-01' + mean: '1.21e-04' + min: '-1.79e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '1.784e+01' +network.layer2.0.downsample.0.weight: + device: cuda:0 + max: '5.054e-01' + mean: '-9.048e-04' + min: '-4.751e-01' + shape: + - 128 + - 64 + - 1 + - 1 + sum: '-7.412e+00' +network.layer2.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.conv1.weight: + device: cuda:0 + max: '1.714e-01' + mean: '6.508e-05' + min: '-1.811e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '9.597e+00' +network.layer2.1.conv2.weight: + device: cuda:0 + max: '1.677e-01' + mean: '-1.988e-05' + min: '-1.746e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-2.932e+00' +network.layer3.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: 
[] + sum: 0 +network.layer3.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.conv1.weight: + device: cuda:0 + max: '1.360e-01' + mean: '3.475e-05' + min: '-1.442e-01' + shape: + - 256 + - 128 + - 3 + - 3 + sum: '1.025e+01' +network.layer3.0.conv2.weight: + device: cuda:0 + max: '1.345e-01' + mean: '-1.856e-05' + min: '-1.299e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.095e+01' +network.layer3.0.downsample.0.weight: + device: cuda:0 + max: '3.523e-01' + mean: '1.2e-04' + min: '-3.863e-01' + shape: + - 256 + - 128 + - 1 + - 1 + sum: '3.931e+00' +network.layer3.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.conv1.weight: + device: cuda:0 + max: '1.395e-01' + mean: '6.754e-05' + min: '-1.476e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '3.984e+01' +network.layer3.1.conv2.weight: + device: cuda:0 + max: '1.443e-01' + mean: '4.953e-05' + min: '-1.376e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '2.921e+01' +network.layer4.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + 
min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.conv1.weight: + device: cuda:0 + max: '1.003e-01' + mean: '-1.587e-05' + min: '-1.011e-01' + shape: + - 512 + - 256 + - 3 + - 3 + sum: '-1.872e+01' +network.layer4.0.conv2.weight: + device: cuda:0 + max: '1.049e-01' + mean: '-1.442e-05' + min: '-1.011e-01' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-3.403e+01' +network.layer4.0.downsample.0.weight: + device: cuda:0 + max: '2.673e-01' + mean: '2.869e-04' + min: '-3.001e-01' + shape: + - 512 + - 256 + - 1 + - 1 + sum: '3.761e+01' +network.layer4.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.conv1.weight: + device: cuda:0 + max: '1.056e-01' + mean: '1.585e-06' + min: '-1.011e-01' + 
shape: + - 512 + - 512 + - 3 + - 3 + sum: '3.74e+00' +network.layer4.1.conv2.weight: + device: cuda:0 + max: '1.072e-01' + mean: '-2.285e-05' + min: '-1.042e-01' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-5.392e+01' diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet50_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml similarity index 77% rename from .regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet50_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml index e6ed0e92..d0fb1b94 100644 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet50_cifar10_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ network.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -7,14 +7,14 @@ network.bn1.bias: - 64 sum: '0.e+00' network.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -22,7 +22,7 @@ network.bn1.running_mean: - 64 sum: '0.e+00' network.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -30,7 +30,7 @@ network.bn1.running_var: - 64 sum: '6.4e+01' network.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -38,35 +38,35 @@ network.bn1.weight: - 64 sum: '6.4e+01' network.conv1.weight: - device: cpu - max: '1.063e-01' - mean: '4.928e-04' - min: '-9.805e-02' + device: cuda:0 + max: '9.646e-02' + mean: '3.162e-04' + min: '-9.585e-02' shape: - 64 - 3 - 7 - 7 - sum: '4.636e+00' + sum: '2.975e+00' network.fc.bias: - device: cpu - max: '2.104e-02' - mean: '3.192e-04' - min: '-2.160e-02' + device: cuda:0 + max: '2.199e-02' + mean: '3.231e-03' + min: '-2.176e-02' shape: - 10 - sum: '3.192e-03' + sum: '3.231e-02' network.fc.weight: - device: cpu - max: '2.209e-02' - mean: '1.247e-04' + device: cuda:0 + max: '2.21e-02' + mean: '-7.184e-06' min: '-2.21e-02' shape: - 10 - 2048 - sum: '2.554e+00' + sum: '-1.471e-01' network.layer1.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -74,14 +74,14 @@ network.layer1.0.bn1.bias: - 64 sum: '0.e+00' network.layer1.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -89,7 +89,7 @@ network.layer1.0.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -97,7 +97,7 @@ network.layer1.0.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -105,7 +105,7 @@ network.layer1.0.bn1.weight: - 64 sum: '6.4e+01' network.layer1.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -113,14 +113,14 @@ network.layer1.0.bn2.bias: - 64 sum: '0.e+00' network.layer1.0.bn2.num_batches_tracked: - device: cpu + 
device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -128,7 +128,7 @@ network.layer1.0.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -136,7 +136,7 @@ network.layer1.0.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -144,7 +144,7 @@ network.layer1.0.bn2.weight: - 64 sum: '6.4e+01' network.layer1.0.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -152,14 +152,14 @@ network.layer1.0.bn3.bias: - 256 sum: '0.e+00' network.layer1.0.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -167,7 +167,7 @@ network.layer1.0.bn3.running_mean: - 256 sum: '0.e+00' network.layer1.0.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -175,7 +175,7 @@ network.layer1.0.bn3.running_var: - 256 sum: '2.56e+02' network.layer1.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -183,51 +183,51 @@ network.layer1.0.bn3.weight: - 256 sum: '2.56e+02' network.layer1.0.conv1.weight: - device: cpu - max: '5.941e-01' - mean: '-1.580e-03' - min: '-6.47e-01' + device: cuda:0 + max: '7.081e-01' + mean: '-3.220e-03' + min: '-6.607e-01' shape: - 64 - 64 - 1 - 1 - sum: '-6.472e+00' + sum: '-1.319e+01' network.layer1.0.conv2.weight: - device: cpu - max: '2.475e-01' - mean: '1.651e-05' - min: '-2.377e-01' + device: cuda:0 + max: '2.489e-01' + mean: '-3.557e-04' + min: '-2.330e-01' shape: - 64 - 64 - 3 - 3 - sum: '6.087e-01' + sum: '-1.311e+01' network.layer1.0.conv3.weight: - device: cpu - max: '3.290e-01' - mean: '-1.486e-04' - min: '-3.494e-01' + device: cuda:0 + max: '3.157e-01' + mean: '2.669e-04' + min: '-3.577e-01' shape: - 256 - 64 - 1 - 1 - sum: '-2.435e+00' + sum: '4.374e+00' network.layer1.0.downsample.0.weight: - device: cpu - max: '3.666e-01' - mean: '3.372e-04' - min: '-3.401e-01' + device: cuda:0 + max: '3.370e-01' + mean: '4.294e-04' + min: '-3.389e-01' shape: - 256 - 64 - 1 - 1 - sum: '5.525e+00' + sum: '7.036e+00' network.layer1.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -235,14 +235,14 @@ network.layer1.0.downsample.1.bias: - 256 sum: '0.e+00' network.layer1.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -250,7 +250,7 @@ network.layer1.0.downsample.1.running_mean: - 256 sum: '0.e+00' network.layer1.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -258,7 +258,7 @@ network.layer1.0.downsample.1.running_var: - 256 sum: '2.56e+02' network.layer1.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -266,7 +266,7 @@ network.layer1.0.downsample.1.weight: - 256 sum: '2.56e+02' network.layer1.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -274,14 +274,14 @@ network.layer1.1.bn1.bias: - 64 sum: '0.e+00' network.layer1.1.bn1.num_batches_tracked: - device: 
cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -289,7 +289,7 @@ network.layer1.1.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -297,7 +297,7 @@ network.layer1.1.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -305,7 +305,7 @@ network.layer1.1.bn1.weight: - 64 sum: '6.4e+01' network.layer1.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -313,14 +313,14 @@ network.layer1.1.bn2.bias: - 64 sum: '0.e+00' network.layer1.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -328,7 +328,7 @@ network.layer1.1.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -336,7 +336,7 @@ network.layer1.1.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -344,7 +344,7 @@ network.layer1.1.bn2.weight: - 64 sum: '6.4e+01' network.layer1.1.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -352,14 +352,14 @@ network.layer1.1.bn3.bias: - 256 sum: '0.e+00' network.layer1.1.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -367,7 +367,7 @@ network.layer1.1.bn3.running_mean: - 256 sum: '0.e+00' network.layer1.1.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -375,7 +375,7 @@ network.layer1.1.bn3.running_var: - 256 sum: '2.56e+02' network.layer1.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -383,40 +383,40 @@ network.layer1.1.bn3.weight: - 256 sum: '2.56e+02' network.layer1.1.conv1.weight: - device: cpu - max: '6.431e-01' - mean: '-6.870e-05' - min: '-7.341e-01' + device: cuda:0 + max: '7.008e-01' + mean: '3.792e-04' + min: '-6.543e-01' shape: - 64 - 256 - 1 - 1 - sum: '-1.126e+00' + sum: '6.214e+00' network.layer1.1.conv2.weight: - device: cpu - max: '2.367e-01' - mean: '-7.922e-05' - min: '-2.362e-01' + device: cuda:0 + max: '2.569e-01' + mean: '-2.808e-06' + min: '-2.296e-01' shape: - 64 - 64 - 3 - 3 - sum: '-2.920e+00' + sum: '-1.035e-01' network.layer1.1.conv3.weight: - device: cpu - max: '3.581e-01' - mean: '3.216e-04' - min: '-3.573e-01' + device: cuda:0 + max: '3.335e-01' + mean: '-1.113e-03' + min: '-3.427e-01' shape: - 256 - 64 - 1 - 1 - sum: '5.268e+00' + sum: '-1.824e+01' network.layer1.2.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -424,14 +424,14 @@ network.layer1.2.bn1.bias: - 64 sum: '0.e+00' network.layer1.2.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.2.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -439,7 +439,7 @@ network.layer1.2.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.2.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' 
mean: '1.e+00' min: '1.e+00' @@ -447,7 +447,7 @@ network.layer1.2.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -455,7 +455,7 @@ network.layer1.2.bn1.weight: - 64 sum: '6.4e+01' network.layer1.2.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -463,14 +463,14 @@ network.layer1.2.bn2.bias: - 64 sum: '0.e+00' network.layer1.2.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.2.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -478,7 +478,7 @@ network.layer1.2.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.2.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -486,7 +486,7 @@ network.layer1.2.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -494,7 +494,7 @@ network.layer1.2.bn2.weight: - 64 sum: '6.4e+01' network.layer1.2.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -502,14 +502,14 @@ network.layer1.2.bn3.bias: - 256 sum: '0.e+00' network.layer1.2.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.2.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -517,7 +517,7 @@ network.layer1.2.bn3.running_mean: - 256 sum: '0.e+00' network.layer1.2.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -525,7 +525,7 @@ network.layer1.2.bn3.running_var: - 256 sum: '2.56e+02' network.layer1.2.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -533,40 +533,40 @@ network.layer1.2.bn3.weight: - 256 sum: '2.56e+02' network.layer1.2.conv1.weight: - device: cpu - max: '6.670e-01' - mean: '-1.511e-03' - min: '-7.024e-01' + device: cuda:0 + max: '7.078e-01' + mean: '2.205e-03' + min: '-6.688e-01' shape: - 64 - 256 - 1 - 1 - sum: '-2.476e+01' + sum: '3.613e+01' network.layer1.2.conv2.weight: - device: cpu - max: '2.378e-01' - mean: '-2.972e-04' - min: '-2.387e-01' + device: cuda:0 + max: '2.568e-01' + mean: '2.909e-04' + min: '-2.361e-01' shape: - 64 - 64 - 3 - 3 - sum: '-1.095e+01' + sum: '1.072e+01' network.layer1.2.conv3.weight: - device: cpu - max: '3.828e-01' - mean: '-2.277e-04' - min: '-3.256e-01' + device: cuda:0 + max: '3.423e-01' + mean: '-6.033e-04' + min: '-3.476e-01' shape: - 256 - 64 - 1 - 1 - sum: '-3.730e+00' + sum: '-9.884e+00' network.layer2.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -574,14 +574,14 @@ network.layer2.0.bn1.bias: - 128 sum: '0.e+00' network.layer2.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -589,7 +589,7 @@ network.layer2.0.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -597,7 +597,7 @@ network.layer2.0.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -605,7 +605,7 @@ network.layer2.0.bn1.weight: - 128 sum: '1.28e+02' network.layer2.0.bn2.bias: - device: cpu + device: 
cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -613,14 +613,14 @@ network.layer2.0.bn2.bias: - 128 sum: '0.e+00' network.layer2.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -628,7 +628,7 @@ network.layer2.0.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -636,7 +636,7 @@ network.layer2.0.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -644,7 +644,7 @@ network.layer2.0.bn2.weight: - 128 sum: '1.28e+02' network.layer2.0.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -652,14 +652,14 @@ network.layer2.0.bn3.bias: - 512 sum: '0.e+00' network.layer2.0.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -667,7 +667,7 @@ network.layer2.0.bn3.running_mean: - 512 sum: '0.e+00' network.layer2.0.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -675,7 +675,7 @@ network.layer2.0.bn3.running_var: - 512 sum: '5.12e+02' network.layer2.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -683,51 +683,51 @@ network.layer2.0.bn3.weight: - 512 sum: '5.12e+02' network.layer2.0.conv1.weight: - device: cpu - max: '4.811e-01' - mean: '1.971e-04' - min: '-5.037e-01' + device: cuda:0 + max: '5.195e-01' + mean: '7.903e-06' + min: '-5.187e-01' shape: - 128 - 256 - 1 - 1 - sum: '6.458e+00' + sum: '2.59e-01' network.layer2.0.conv2.weight: - device: cpu - max: '1.834e-01' - mean: '-1.511e-05' - min: '-1.870e-01' + device: cuda:0 + max: '1.880e-01' + mean: '2.495e-04' + min: '-1.736e-01' shape: - 128 - 128 - 3 - 3 - sum: '-2.228e+00' + sum: '3.678e+01' network.layer2.0.conv3.weight: - device: cpu - max: '2.532e-01' - mean: '-9.596e-05' - min: '-2.615e-01' + device: cuda:0 + max: '2.546e-01' + mean: '2.444e-04' + min: '-2.541e-01' shape: - 512 - 128 - 1 - 1 - sum: '-6.289e+00' + sum: '1.602e+01' network.layer2.0.downsample.0.weight: - device: cpu - max: '2.66e-01' - mean: '3.258e-04' - min: '-2.709e-01' + device: cuda:0 + max: '3.065e-01' + mean: '3.991e-05' + min: '-2.480e-01' shape: - 512 - 256 - 1 - 1 - sum: '4.270e+01' + sum: '5.231e+00' network.layer2.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -735,14 +735,14 @@ network.layer2.0.downsample.1.bias: - 512 sum: '0.e+00' network.layer2.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -750,7 +750,7 @@ network.layer2.0.downsample.1.running_mean: - 512 sum: '0.e+00' network.layer2.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -758,7 +758,7 @@ network.layer2.0.downsample.1.running_var: - 512 sum: '5.12e+02' network.layer2.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -766,7 +766,7 @@ network.layer2.0.downsample.1.weight: - 512 sum: '5.12e+02' network.layer2.1.bn1.bias: - device: cpu 
+ device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -774,14 +774,14 @@ network.layer2.1.bn1.bias: - 128 sum: '0.e+00' network.layer2.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -789,7 +789,7 @@ network.layer2.1.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -797,7 +797,7 @@ network.layer2.1.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -805,7 +805,7 @@ network.layer2.1.bn1.weight: - 128 sum: '1.28e+02' network.layer2.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -813,14 +813,14 @@ network.layer2.1.bn2.bias: - 128 sum: '0.e+00' network.layer2.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -828,7 +828,7 @@ network.layer2.1.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -836,7 +836,7 @@ network.layer2.1.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -844,7 +844,7 @@ network.layer2.1.bn2.weight: - 128 sum: '1.28e+02' network.layer2.1.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -852,14 +852,14 @@ network.layer2.1.bn3.bias: - 512 sum: '0.e+00' network.layer2.1.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -867,7 +867,7 @@ network.layer2.1.bn3.running_mean: - 512 sum: '0.e+00' network.layer2.1.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -875,7 +875,7 @@ network.layer2.1.bn3.running_var: - 512 sum: '5.12e+02' network.layer2.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -883,40 +883,40 @@ network.layer2.1.bn3.weight: - 512 sum: '5.12e+02' network.layer2.1.conv1.weight: - device: cpu - max: '5.121e-01' - mean: '-1.819e-04' - min: '-5.277e-01' + device: cuda:0 + max: '5.655e-01' + mean: '-1.772e-04' + min: '-5.812e-01' shape: - 128 - 512 - 1 - 1 - sum: '-1.192e+01' + sum: '-1.161e+01' network.layer2.1.conv2.weight: - device: cpu - max: '1.973e-01' - mean: '6.795e-05' - min: '-1.822e-01' + device: cuda:0 + max: '1.912e-01' + mean: '-1.939e-04' + min: '-1.828e-01' shape: - 128 - 128 - 3 - 3 - sum: '1.002e+01' + sum: '-2.859e+01' network.layer2.1.conv3.weight: - device: cpu - max: '2.505e-01' - mean: '-7.241e-04' - min: '-2.531e-01' + device: cuda:0 + max: '2.647e-01' + mean: '1.202e-04' + min: '-2.835e-01' shape: - 512 - 128 - 1 - 1 - sum: '-4.745e+01' + sum: '7.879e+00' network.layer2.2.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -924,14 +924,14 @@ network.layer2.2.bn1.bias: - 128 sum: '0.e+00' network.layer2.2.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.2.bn1.running_mean: - device: cpu + device: cuda:0 
max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -939,7 +939,7 @@ network.layer2.2.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.2.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -947,7 +947,7 @@ network.layer2.2.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -955,7 +955,7 @@ network.layer2.2.bn1.weight: - 128 sum: '1.28e+02' network.layer2.2.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -963,14 +963,14 @@ network.layer2.2.bn2.bias: - 128 sum: '0.e+00' network.layer2.2.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.2.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -978,7 +978,7 @@ network.layer2.2.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.2.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -986,7 +986,7 @@ network.layer2.2.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -994,7 +994,7 @@ network.layer2.2.bn2.weight: - 128 sum: '1.28e+02' network.layer2.2.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1002,14 +1002,14 @@ network.layer2.2.bn3.bias: - 512 sum: '0.e+00' network.layer2.2.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.2.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1017,7 +1017,7 @@ network.layer2.2.bn3.running_mean: - 512 sum: '0.e+00' network.layer2.2.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1025,7 +1025,7 @@ network.layer2.2.bn3.running_var: - 512 sum: '5.12e+02' network.layer2.2.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1033,40 +1033,40 @@ network.layer2.2.bn3.weight: - 512 sum: '5.12e+02' network.layer2.2.conv1.weight: - device: cpu - max: '5.326e-01' - mean: '2.855e-04' - min: '-4.874e-01' + device: cuda:0 + max: '5.352e-01' + mean: '1.514e-04' + min: '-4.77e-01' shape: - 128 - 512 - 1 - 1 - sum: '1.871e+01' + sum: '9.922e+00' network.layer2.2.conv2.weight: - device: cpu - max: '1.926e-01' - mean: '1.28e-05' - min: '-1.865e-01' + device: cuda:0 + max: '1.992e-01' + mean: '-3.131e-05' + min: '-1.781e-01' shape: - 128 - 128 - 3 - 3 - sum: '1.887e+00' + sum: '-4.617e+00' network.layer2.2.conv3.weight: - device: cpu - max: '2.606e-01' - mean: '-1.18e-04' - min: '-2.621e-01' + device: cuda:0 + max: '3.018e-01' + mean: '8.808e-05' + min: '-2.617e-01' shape: - 512 - 128 - 1 - 1 - sum: '-7.731e+00' + sum: '5.772e+00' network.layer2.3.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1074,14 +1074,14 @@ network.layer2.3.bn1.bias: - 128 sum: '0.e+00' network.layer2.3.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.3.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1089,7 +1089,7 @@ network.layer2.3.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.3.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1097,7 +1097,7 @@ network.layer2.3.bn1.running_var: - 128 sum: '1.28e+02' 
network.layer2.3.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1105,7 +1105,7 @@ network.layer2.3.bn1.weight: - 128 sum: '1.28e+02' network.layer2.3.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1113,14 +1113,14 @@ network.layer2.3.bn2.bias: - 128 sum: '0.e+00' network.layer2.3.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.3.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1128,7 +1128,7 @@ network.layer2.3.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.3.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1136,7 +1136,7 @@ network.layer2.3.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.3.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1144,7 +1144,7 @@ network.layer2.3.bn2.weight: - 128 sum: '1.28e+02' network.layer2.3.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1152,14 +1152,14 @@ network.layer2.3.bn3.bias: - 512 sum: '0.e+00' network.layer2.3.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.3.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1167,7 +1167,7 @@ network.layer2.3.bn3.running_mean: - 512 sum: '0.e+00' network.layer2.3.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1175,7 +1175,7 @@ network.layer2.3.bn3.running_var: - 512 sum: '5.12e+02' network.layer2.3.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1183,40 +1183,40 @@ network.layer2.3.bn3.weight: - 512 sum: '5.12e+02' network.layer2.3.conv1.weight: - device: cpu - max: '5.012e-01' - mean: '-7.271e-04' - min: '-5.501e-01' + device: cuda:0 + max: '5.314e-01' + mean: '-3.536e-04' + min: '-5.475e-01' shape: - 128 - 512 - 1 - 1 - sum: '-4.765e+01' + sum: '-2.318e+01' network.layer2.3.conv2.weight: - device: cpu - max: '1.814e-01' - mean: '5.993e-05' - min: '-2.048e-01' + device: cuda:0 + max: '1.754e-01' + mean: '7.783e-05' + min: '-1.808e-01' shape: - 128 - 128 - 3 - 3 - sum: '8.837e+00' + sum: '1.148e+01' network.layer2.3.conv3.weight: - device: cpu - max: '2.943e-01' - mean: '-2.147e-04' - min: '-2.827e-01' + device: cuda:0 + max: '2.382e-01' + mean: '-1.054e-05' + min: '-2.517e-01' shape: - 512 - 128 - 1 - 1 - sum: '-1.407e+01' + sum: '-6.906e-01' network.layer3.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1224,14 +1224,14 @@ network.layer3.0.bn1.bias: - 256 sum: '0.e+00' network.layer3.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1239,7 +1239,7 @@ network.layer3.0.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1247,7 +1247,7 @@ network.layer3.0.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1255,7 +1255,7 @@ network.layer3.0.bn1.weight: - 256 sum: '2.56e+02' network.layer3.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1263,14 
+1263,14 @@ network.layer3.0.bn2.bias: - 256 sum: '0.e+00' network.layer3.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1278,7 +1278,7 @@ network.layer3.0.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1286,7 +1286,7 @@ network.layer3.0.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1294,7 +1294,7 @@ network.layer3.0.bn2.weight: - 256 sum: '2.56e+02' network.layer3.0.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1302,14 +1302,14 @@ network.layer3.0.bn3.bias: - 1024 sum: '0.e+00' network.layer3.0.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1317,7 +1317,7 @@ network.layer3.0.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.0.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1325,7 +1325,7 @@ network.layer3.0.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1333,51 +1333,51 @@ network.layer3.0.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.0.conv1.weight: - device: cpu - max: '3.887e-01' - mean: '2.347e-04' - min: '-3.860e-01' + device: cuda:0 + max: '3.667e-01' + mean: '-1.312e-04' + min: '-3.741e-01' shape: - 256 - 512 - 1 - 1 - sum: '3.076e+01' + sum: '-1.72e+01' network.layer3.0.conv2.weight: - device: cpu - max: '1.372e-01' - mean: '-1.56e-05' - min: '-1.419e-01' + device: cuda:0 + max: '1.525e-01' + mean: '3.130e-05' + min: '-1.458e-01' shape: - 256 - 256 - 3 - 3 - sum: '-9.199e+00' + sum: '1.846e+01' network.layer3.0.conv3.weight: - device: cpu - max: '1.974e-01' - mean: '-2.099e-05' - min: '-2.157e-01' + device: cuda:0 + max: '2.06e-01' + mean: '1.398e-05' + min: '-2.206e-01' shape: - 1024 - 256 - 1 - 1 - sum: '-5.501e+00' + sum: '3.665e+00' network.layer3.0.downsample.0.weight: - device: cpu - max: '2.111e-01' - mean: '-1.147e-05' - min: '-2.026e-01' + device: cuda:0 + max: '1.988e-01' + mean: '2.828e-05' + min: '-2.006e-01' shape: - 1024 - 512 - 1 - 1 - sum: '-6.012e+00' + sum: '1.483e+01' network.layer3.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1385,14 +1385,14 @@ network.layer3.0.downsample.1.bias: - 1024 sum: '0.e+00' network.layer3.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1400,7 +1400,7 @@ network.layer3.0.downsample.1.running_mean: - 1024 sum: '0.e+00' network.layer3.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1408,7 +1408,7 @@ network.layer3.0.downsample.1.running_var: - 1024 sum: '1.024e+03' network.layer3.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1416,7 +1416,7 @@ network.layer3.0.downsample.1.weight: - 1024 sum: '1.024e+03' network.layer3.1.bn1.bias: - device: cpu + device: cuda:0 
max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1424,14 +1424,14 @@ network.layer3.1.bn1.bias: - 256 sum: '0.e+00' network.layer3.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1439,7 +1439,7 @@ network.layer3.1.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1447,7 +1447,7 @@ network.layer3.1.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1455,7 +1455,7 @@ network.layer3.1.bn1.weight: - 256 sum: '2.56e+02' network.layer3.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1463,14 +1463,14 @@ network.layer3.1.bn2.bias: - 256 sum: '0.e+00' network.layer3.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1478,7 +1478,7 @@ network.layer3.1.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1486,7 +1486,7 @@ network.layer3.1.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1494,7 +1494,7 @@ network.layer3.1.bn2.weight: - 256 sum: '2.56e+02' network.layer3.1.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1502,14 +1502,14 @@ network.layer3.1.bn3.bias: - 1024 sum: '0.e+00' network.layer3.1.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1517,7 +1517,7 @@ network.layer3.1.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.1.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1525,7 +1525,7 @@ network.layer3.1.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1533,40 +1533,40 @@ network.layer3.1.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.1.conv1.weight: - device: cpu - max: '4.004e-01' - mean: '1.076e-04' - min: '-3.917e-01' + device: cuda:0 + max: '3.843e-01' + mean: '2.675e-04' + min: '-3.99e-01' shape: - 256 - 1024 - 1 - 1 - sum: '2.822e+01' + sum: '7.013e+01' network.layer3.1.conv2.weight: - device: cpu - max: '1.322e-01' - mean: '-7.433e-06' - min: '-1.435e-01' + device: cuda:0 + max: '1.38e-01' + mean: '-3.53e-06' + min: '-1.294e-01' shape: - 256 - 256 - 3 - 3 - sum: '-4.384e+00' + sum: '-2.082e+00' network.layer3.1.conv3.weight: - device: cpu - max: '2.148e-01' - mean: '-2.367e-05' - min: '-2.066e-01' + device: cuda:0 + max: '2.052e-01' + mean: '-7.496e-06' + min: '-1.973e-01' shape: - 1024 - 256 - 1 - 1 - sum: '-6.205e+00' + sum: '-1.965e+00' network.layer3.2.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1574,14 +1574,14 @@ network.layer3.2.bn1.bias: - 256 sum: '0.e+00' network.layer3.2.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.2.bn1.running_mean: - device: cpu + 
device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1589,7 +1589,7 @@ network.layer3.2.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.2.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1597,7 +1597,7 @@ network.layer3.2.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1605,7 +1605,7 @@ network.layer3.2.bn1.weight: - 256 sum: '2.56e+02' network.layer3.2.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1613,14 +1613,14 @@ network.layer3.2.bn2.bias: - 256 sum: '0.e+00' network.layer3.2.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.2.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1628,7 +1628,7 @@ network.layer3.2.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.2.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1636,7 +1636,7 @@ network.layer3.2.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1644,7 +1644,7 @@ network.layer3.2.bn2.weight: - 256 sum: '2.56e+02' network.layer3.2.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1652,14 +1652,14 @@ network.layer3.2.bn3.bias: - 1024 sum: '0.e+00' network.layer3.2.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.2.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1667,7 +1667,7 @@ network.layer3.2.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.2.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1675,7 +1675,7 @@ network.layer3.2.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.2.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1683,40 +1683,40 @@ network.layer3.2.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.2.conv1.weight: - device: cpu - max: '4.098e-01' - mean: '7.033e-06' - min: '-4.186e-01' + device: cuda:0 + max: '4.040e-01' + mean: '5.938e-06' + min: '-4.109e-01' shape: - 256 - 1024 - 1 - 1 - sum: '1.844e+00' + sum: '1.557e+00' network.layer3.2.conv2.weight: - device: cpu - max: '1.384e-01' - mean: '5.707e-05' - min: '-1.45e-01' + device: cuda:0 + max: '1.381e-01' + mean: '-1.49e-05' + min: '-1.505e-01' shape: - 256 - 256 - 3 - 3 - sum: '3.366e+01' + sum: '-8.787e+00' network.layer3.2.conv3.weight: - device: cpu - max: '1.963e-01' - mean: '-1.181e-05' - min: '-1.884e-01' + device: cuda:0 + max: '1.964e-01' + mean: '8.209e-05' + min: '-1.861e-01' shape: - 1024 - 256 - 1 - 1 - sum: '-3.096e+00' + sum: '2.152e+01' network.layer3.3.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1724,14 +1724,14 @@ network.layer3.3.bn1.bias: - 256 sum: '0.e+00' network.layer3.3.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.3.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1739,7 +1739,7 @@ network.layer3.3.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.3.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1747,7 +1747,7 @@ 
network.layer3.3.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.3.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1755,7 +1755,7 @@ network.layer3.3.bn1.weight: - 256 sum: '2.56e+02' network.layer3.3.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1763,14 +1763,14 @@ network.layer3.3.bn2.bias: - 256 sum: '0.e+00' network.layer3.3.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.3.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1778,7 +1778,7 @@ network.layer3.3.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.3.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1786,7 +1786,7 @@ network.layer3.3.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.3.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1794,7 +1794,7 @@ network.layer3.3.bn2.weight: - 256 sum: '2.56e+02' network.layer3.3.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1802,14 +1802,14 @@ network.layer3.3.bn3.bias: - 1024 sum: '0.e+00' network.layer3.3.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.3.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1817,7 +1817,7 @@ network.layer3.3.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.3.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1825,7 +1825,7 @@ network.layer3.3.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.3.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1833,40 +1833,40 @@ network.layer3.3.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.3.conv1.weight: - device: cpu - max: '4.032e-01' - mean: '6.746e-06' - min: '-4.411e-01' + device: cuda:0 + max: '3.85e-01' + mean: '-1.446e-04' + min: '-4.104e-01' shape: - 256 - 1024 - 1 - 1 - sum: '1.768e+00' + sum: '-3.789e+01' network.layer3.3.conv2.weight: - device: cpu - max: '1.377e-01' - mean: '4.517e-05' - min: '-1.378e-01' + device: cuda:0 + max: '1.48e-01' + mean: '-4.522e-05' + min: '-1.423e-01' shape: - 256 - 256 - 3 - 3 - sum: '2.664e+01' + sum: '-2.667e+01' network.layer3.3.conv3.weight: - device: cpu - max: '2.2e-01' - mean: '8.760e-05' - min: '-1.877e-01' + device: cuda:0 + max: '1.972e-01' + mean: '-4.765e-05' + min: '-2.067e-01' shape: - 1024 - 256 - 1 - 1 - sum: '2.296e+01' + sum: '-1.249e+01' network.layer3.4.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1874,14 +1874,14 @@ network.layer3.4.bn1.bias: - 256 sum: '0.e+00' network.layer3.4.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.4.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1889,7 +1889,7 @@ network.layer3.4.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.4.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1897,7 +1897,7 @@ network.layer3.4.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.4.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1905,7 +1905,7 @@ network.layer3.4.bn1.weight: - 256 sum: '2.56e+02' network.layer3.4.bn2.bias: - device: cpu + device: cuda:0 
max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1913,14 +1913,14 @@ network.layer3.4.bn2.bias: - 256 sum: '0.e+00' network.layer3.4.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.4.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1928,7 +1928,7 @@ network.layer3.4.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.4.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1936,7 +1936,7 @@ network.layer3.4.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.4.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1944,7 +1944,7 @@ network.layer3.4.bn2.weight: - 256 sum: '2.56e+02' network.layer3.4.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1952,14 +1952,14 @@ network.layer3.4.bn3.bias: - 1024 sum: '0.e+00' network.layer3.4.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.4.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1967,7 +1967,7 @@ network.layer3.4.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.4.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1975,7 +1975,7 @@ network.layer3.4.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.4.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1983,40 +1983,40 @@ network.layer3.4.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.4.conv1.weight: - device: cpu - max: '4.246e-01' - mean: '5.362e-06' - min: '-4.278e-01' + device: cuda:0 + max: '4.356e-01' + mean: '9.811e-05' + min: '-3.892e-01' shape: - 256 - 1024 - 1 - 1 - sum: '1.406e+00' + sum: '2.572e+01' network.layer3.4.conv2.weight: - device: cpu - max: '1.393e-01' - mean: '2.222e-06' - min: '-1.434e-01' + device: cuda:0 + max: '1.430e-01' + mean: '-3.322e-05' + min: '-1.325e-01' shape: - 256 - 256 - 3 - 3 - sum: '1.311e+00' + sum: '-1.959e+01' network.layer3.4.conv3.weight: - device: cpu - max: '2.e-01' - mean: '9.206e-05' - min: '-2.008e-01' + device: cuda:0 + max: '1.993e-01' + mean: '3.794e-05' + min: '-2.046e-01' shape: - 1024 - 256 - 1 - 1 - sum: '2.413e+01' + sum: '9.945e+00' network.layer3.5.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2024,14 +2024,14 @@ network.layer3.5.bn1.bias: - 256 sum: '0.e+00' network.layer3.5.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.5.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2039,7 +2039,7 @@ network.layer3.5.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.5.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2047,7 +2047,7 @@ network.layer3.5.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.5.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2055,7 +2055,7 @@ network.layer3.5.bn1.weight: - 256 sum: '2.56e+02' network.layer3.5.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2063,14 +2063,14 @@ network.layer3.5.bn2.bias: - 256 sum: '0.e+00' network.layer3.5.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.5.bn2.running_mean: - device: cpu + device: 
cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2078,7 +2078,7 @@ network.layer3.5.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.5.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2086,7 +2086,7 @@ network.layer3.5.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.5.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2094,7 +2094,7 @@ network.layer3.5.bn2.weight: - 256 sum: '2.56e+02' network.layer3.5.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2102,14 +2102,14 @@ network.layer3.5.bn3.bias: - 1024 sum: '0.e+00' network.layer3.5.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.5.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2117,7 +2117,7 @@ network.layer3.5.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.5.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2125,7 +2125,7 @@ network.layer3.5.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.5.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2133,40 +2133,40 @@ network.layer3.5.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.5.conv1.weight: - device: cpu - max: '4.474e-01' - mean: '-1.600e-05' - min: '-4.060e-01' + device: cuda:0 + max: '4.095e-01' + mean: '4.100e-05' + min: '-3.786e-01' shape: - 256 - 1024 - 1 - 1 - sum: '-4.194e+00' + sum: '1.075e+01' network.layer3.5.conv2.weight: - device: cpu - max: '1.359e-01' - mean: '3.909e-05' - min: '-1.454e-01' + device: cuda:0 + max: '1.341e-01' + mean: '-1.609e-05' + min: '-1.361e-01' shape: - 256 - 256 - 3 - 3 - sum: '2.306e+01' + sum: '-9.492e+00' network.layer3.5.conv3.weight: - device: cpu - max: '2.021e-01' - mean: '8.33e-05' - min: '-1.915e-01' + device: cuda:0 + max: '1.988e-01' + mean: '-1.139e-04' + min: '-2.040e-01' shape: - 1024 - 256 - 1 - 1 - sum: '2.184e+01' + sum: '-2.986e+01' network.layer4.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2174,14 +2174,14 @@ network.layer4.0.bn1.bias: - 512 sum: '0.e+00' network.layer4.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2189,7 +2189,7 @@ network.layer4.0.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2197,7 +2197,7 @@ network.layer4.0.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2205,7 +2205,7 @@ network.layer4.0.bn1.weight: - 512 sum: '5.12e+02' network.layer4.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2213,14 +2213,14 @@ network.layer4.0.bn2.bias: - 512 sum: '0.e+00' network.layer4.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2228,7 +2228,7 @@ network.layer4.0.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2236,7 +2236,7 @@ 
network.layer4.0.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2244,7 +2244,7 @@ network.layer4.0.bn2.weight: - 512 sum: '5.12e+02' network.layer4.0.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2252,14 +2252,14 @@ network.layer4.0.bn3.bias: - 2048 sum: '0.e+00' network.layer4.0.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2267,7 +2267,7 @@ network.layer4.0.bn3.running_mean: - 2048 sum: '0.e+00' network.layer4.0.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2275,7 +2275,7 @@ network.layer4.0.bn3.running_var: - 2048 sum: '2.048e+03' network.layer4.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2283,51 +2283,51 @@ network.layer4.0.bn3.weight: - 2048 sum: '2.048e+03' network.layer4.0.conv1.weight: - device: cpu - max: '3.176e-01' - mean: '-1.807e-05' - min: '-3.028e-01' + device: cuda:0 + max: '2.970e-01' + mean: '5.637e-05' + min: '-2.903e-01' shape: - 512 - 1024 - 1 - 1 - sum: '-9.476e+00' + sum: '2.955e+01' network.layer4.0.conv2.weight: - device: cpu - max: '9.886e-02' - mean: '1.319e-05' - min: '-1.076e-01' + device: cuda:0 + max: '9.993e-02' + mean: '1.64e-05' + min: '-1.102e-01' shape: - 512 - 512 - 3 - 3 - sum: '3.112e+01' + sum: '3.869e+01' network.layer4.0.conv3.weight: - device: cpu - max: '1.626e-01' - mean: '-1.957e-05' - min: '-1.542e-01' + device: cuda:0 + max: '1.534e-01' + mean: '-2.382e-06' + min: '-1.673e-01' shape: - 2048 - 512 - 1 - 1 - sum: '-2.052e+01' + sum: '-2.498e+00' network.layer4.0.downsample.0.weight: - device: cpu - max: '1.639e-01' - mean: '4.621e-05' - min: '-1.535e-01' + device: cuda:0 + max: '1.475e-01' + mean: '-6.343e-06' + min: '-1.472e-01' shape: - 2048 - 1024 - 1 - 1 - sum: '9.69e+01' + sum: '-1.330e+01' network.layer4.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2335,14 +2335,14 @@ network.layer4.0.downsample.1.bias: - 2048 sum: '0.e+00' network.layer4.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2350,7 +2350,7 @@ network.layer4.0.downsample.1.running_mean: - 2048 sum: '0.e+00' network.layer4.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2358,7 +2358,7 @@ network.layer4.0.downsample.1.running_var: - 2048 sum: '2.048e+03' network.layer4.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2366,7 +2366,7 @@ network.layer4.0.downsample.1.weight: - 2048 sum: '2.048e+03' network.layer4.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2374,14 +2374,14 @@ network.layer4.1.bn1.bias: - 512 sum: '0.e+00' network.layer4.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2389,7 +2389,7 @@ network.layer4.1.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.1.bn1.running_var: - device: cpu + device: cuda:0 max: 
'1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2397,7 +2397,7 @@ network.layer4.1.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2405,7 +2405,7 @@ network.layer4.1.bn1.weight: - 512 sum: '5.12e+02' network.layer4.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2413,14 +2413,14 @@ network.layer4.1.bn2.bias: - 512 sum: '0.e+00' network.layer4.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2428,7 +2428,7 @@ network.layer4.1.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2436,7 +2436,7 @@ network.layer4.1.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2444,7 +2444,7 @@ network.layer4.1.bn2.weight: - 512 sum: '5.12e+02' network.layer4.1.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2452,14 +2452,14 @@ network.layer4.1.bn3.bias: - 2048 sum: '0.e+00' network.layer4.1.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2467,7 +2467,7 @@ network.layer4.1.bn3.running_mean: - 2048 sum: '0.e+00' network.layer4.1.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2475,7 +2475,7 @@ network.layer4.1.bn3.running_var: - 2048 sum: '2.048e+03' network.layer4.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2483,40 +2483,40 @@ network.layer4.1.bn3.weight: - 2048 sum: '2.048e+03' network.layer4.1.conv1.weight: - device: cpu - max: '3.065e-01' - mean: '-6.068e-05' - min: '-2.977e-01' + device: cuda:0 + max: '3.285e-01' + mean: '5.911e-05' + min: '-3.033e-01' shape: - 512 - 2048 - 1 - 1 - sum: '-6.363e+01' + sum: '6.198e+01' network.layer4.1.conv2.weight: - device: cpu - max: '9.902e-02' - mean: '1.140e-06' - min: '-1.08e-01' + device: cuda:0 + max: '1.104e-01' + mean: '2.457e-05' + min: '-1.031e-01' shape: - 512 - 512 - 3 - 3 - sum: '2.690e+00' + sum: '5.797e+01' network.layer4.1.conv3.weight: - device: cpu - max: '1.517e-01' - mean: '-3.666e-05' - min: '-1.526e-01' + device: cuda:0 + max: '1.483e-01' + mean: '-6.445e-06' + min: '-1.555e-01' shape: - 2048 - 512 - 1 - 1 - sum: '-3.844e+01' + sum: '-6.758e+00' network.layer4.2.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2524,14 +2524,14 @@ network.layer4.2.bn1.bias: - 512 sum: '0.e+00' network.layer4.2.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.2.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2539,7 +2539,7 @@ network.layer4.2.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.2.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2547,7 +2547,7 @@ network.layer4.2.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2555,7 +2555,7 @@ network.layer4.2.bn1.weight: - 512 sum: 
'5.12e+02' network.layer4.2.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2563,14 +2563,14 @@ network.layer4.2.bn2.bias: - 512 sum: '0.e+00' network.layer4.2.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.2.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2578,7 +2578,7 @@ network.layer4.2.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.2.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2586,7 +2586,7 @@ network.layer4.2.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2594,7 +2594,7 @@ network.layer4.2.bn2.weight: - 512 sum: '5.12e+02' network.layer4.2.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2602,14 +2602,14 @@ network.layer4.2.bn3.bias: - 2048 sum: '0.e+00' network.layer4.2.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.2.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2617,7 +2617,7 @@ network.layer4.2.bn3.running_mean: - 2048 sum: '0.e+00' network.layer4.2.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2625,7 +2625,7 @@ network.layer4.2.bn3.running_var: - 2048 sum: '2.048e+03' network.layer4.2.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2633,35 +2633,35 @@ network.layer4.2.bn3.weight: - 2048 sum: '2.048e+03' network.layer4.2.conv1.weight: - device: cpu - max: '2.82e-01' - mean: '-9.716e-05' - min: '-2.873e-01' + device: cuda:0 + max: '2.960e-01' + mean: '-1.275e-04' + min: '-3.368e-01' shape: - 512 - 2048 - 1 - 1 - sum: '-1.019e+02' + sum: '-1.337e+02' network.layer4.2.conv2.weight: - device: cpu - max: '1.111e-01' - mean: '-2.905e-06' - min: '-1.051e-01' + device: cuda:0 + max: '9.885e-02' + mean: '-6.874e-06' + min: '-9.988e-02' shape: - 512 - 512 - 3 - 3 - sum: '-6.853e+00' + sum: '-1.622e+01' network.layer4.2.conv3.weight: - device: cpu - max: '1.576e-01' - mean: '5.136e-06' - min: '-1.479e-01' + device: cuda:0 + max: '1.45e-01' + mean: '1.976e-05' + min: '-1.578e-01' shape: - 2048 - 512 - 1 - 1 - sum: '5.386e+00' + sum: '2.073e+01' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml new file mode 100644 index 00000000..929934db --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml @@ -0,0 +1,2667 @@ +network.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' 
+network.conv1.weight: + device: cuda:0 + max: '1.019e-01' + mean: '2.309e-04' + min: '-8.332e-02' + shape: + - 64 + - 3 + - 7 + - 7 + sum: '2.172e+00' +network.fc.bias: + device: cuda:0 + max: '2.203e-02' + mean: '4.486e-04' + min: '-2.206e-02' + shape: + - 1000 + sum: '4.486e-01' +network.fc.weight: + device: cuda:0 + max: '2.21e-02' + mean: '6.154e-06' + min: '-2.21e-02' + shape: + - 1000 + - 2048 + sum: '1.260e+01' +network.layer1.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.0.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.0.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.0.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.0.conv1.weight: + device: cuda:0 + max: '6.509e-01' + mean: '1.445e-03' + min: '-6.027e-01' + shape: + - 64 + - 64 + - 1 + - 1 + sum: '5.919e+00' +network.layer1.0.conv2.weight: + device: cuda:0 + max: '2.359e-01' + mean: '1.355e-04' + min: '-2.49e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '4.995e+00' +network.layer1.0.conv3.weight: + device: cuda:0 + max: '3.852e-01' + mean: '3.642e-04' + min: '-3.478e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '5.966e+00' +network.layer1.0.downsample.0.weight: + device: cuda:0 + max: '3.423e-01' + mean: '-6.033e-04' + min: '-3.476e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '-9.884e+00' +network.layer1.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' 
+network.layer1.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.1.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.1.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.1.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.1.conv1.weight: + device: cuda:0 + max: '7.347e-01' + mean: '1.03e-03' + min: '-6.643e-01' + shape: + - 64 + - 256 + - 1 + - 1 + sum: '1.687e+01' +network.layer1.1.conv2.weight: + device: cuda:0 + max: '2.614e-01' + mean: '3.465e-04' + min: '-2.217e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '1.277e+01' +network.layer1.1.conv3.weight: + device: cuda:0 + max: '3.091e-01' + mean: '4.206e-05' + min: '-3.557e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '6.892e-01' +network.layer1.2.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.2.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.2.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.2.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.2.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.2.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.2.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.2.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + 
sum: '0.e+00' +network.layer1.2.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.2.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.2.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.2.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.2.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.2.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.2.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.2.conv1.weight: + device: cuda:0 + max: '6.524e-01' + mean: '-1.441e-03' + min: '-6.990e-01' + shape: + - 64 + - 256 + - 1 + - 1 + sum: '-2.362e+01' +network.layer1.2.conv2.weight: + device: cuda:0 + max: '2.666e-01' + mean: '-3.895e-05' + min: '-2.347e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-1.436e+00' +network.layer1.2.conv3.weight: + device: cuda:0 + max: '3.408e-01' + mean: '5.479e-04' + min: '-3.091e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '8.977e+00' +network.layer2.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.0.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.0.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.0.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.0.conv1.weight: + device: cuda:0 + max: '5.176e-01' + mean: '-5.491e-04' + min: '-4.999e-01' + shape: + - 128 + - 256 + - 1 + - 1 + sum: '-1.799e+01' +network.layer2.0.conv2.weight: + 
device: cuda:0 + max: '1.808e-01' + mean: '-1.218e-04' + min: '-1.887e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-1.796e+01' +network.layer2.0.conv3.weight: + device: cuda:0 + max: '2.875e-01' + mean: '-1.799e-04' + min: '-2.593e-01' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '-1.179e+01' +network.layer2.0.downsample.0.weight: + device: cuda:0 + max: '3.018e-01' + mean: '-5.660e-05' + min: '-2.697e-01' + shape: + - 512 + - 256 + - 1 + - 1 + sum: '-7.419e+00' +network.layer2.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.1.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.1.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.1.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.1.conv1.weight: + device: cuda:0 + max: '5.314e-01' + mean: '-3.536e-04' + min: '-5.475e-01' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '-2.318e+01' +network.layer2.1.conv2.weight: + device: cuda:0 + max: '1.754e-01' + mean: '7.783e-05' + min: '-1.808e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '1.148e+01' +network.layer2.1.conv3.weight: + device: cuda:0 + max: '2.382e-01' + mean: '-1.054e-05' + min: '-2.517e-01' + shape: + - 512 + 
- 128 + - 1 + - 1 + sum: '-6.906e-01' +network.layer2.2.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.2.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.2.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.2.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.2.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.2.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.2.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.2.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.2.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.2.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.2.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.2.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.2.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.2.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.2.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.2.conv1.weight: + device: cuda:0 + max: '4.971e-01' + mean: '-3.09e-04' + min: '-5.291e-01' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '-2.025e+01' +network.layer2.2.conv2.weight: + device: cuda:0 + max: '2.107e-01' + mean: '-7.661e-06' + min: '-1.779e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-1.13e+00' +network.layer2.2.conv3.weight: + device: cuda:0 + max: '3.236e-01' + mean: '2.725e-05' + min: '-3.006e-01' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '1.786e+00' +network.layer2.3.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.3.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.3.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.3.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.3.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.3.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.3.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.3.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.3.bn2.running_var: + device: cuda:0 + max: 
'1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.3.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.3.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.3.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.3.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.3.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.3.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.3.conv1.weight: + device: cuda:0 + max: '5.317e-01' + mean: '9.857e-05' + min: '-5.177e-01' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '6.460e+00' +network.layer2.3.conv2.weight: + device: cuda:0 + max: '1.874e-01' + mean: '6.223e-05' + min: '-1.855e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '9.176e+00' +network.layer2.3.conv3.weight: + device: cuda:0 + max: '2.559e-01' + mean: '-2.673e-04' + min: '-2.529e-01' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '-1.752e+01' +network.layer3.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.0.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.0.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.0.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.0.conv1.weight: + device: cuda:0 + max: '3.843e-01' + mean: '3.586e-04' + min: '-3.99e-01' + shape: + - 256 + - 512 + - 1 + - 1 + sum: '4.701e+01' +network.layer3.0.conv2.weight: + device: cuda:0 + max: '1.38e-01' + mean: '-3.53e-06' + min: 
'-1.294e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-2.082e+00' +network.layer3.0.conv3.weight: + device: cuda:0 + max: '2.052e-01' + mean: '-7.496e-06' + min: '-1.973e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-1.965e+00' +network.layer3.0.downsample.0.weight: + device: cuda:0 + max: '2.020e-01' + mean: '1.340e-05' + min: '-2.257e-01' + shape: + - 1024 + - 512 + - 1 + - 1 + sum: '7.027e+00' +network.layer3.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.1.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.1.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.1.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.1.conv1.weight: + device: cuda:0 + max: '4.143e-01' + mean: '1.499e-05' + min: '-3.709e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '3.93e+00' +network.layer3.1.conv2.weight: + device: cuda:0 + max: '1.309e-01' + mean: '1.100e-05' + min: '-1.368e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '6.490e+00' +network.layer3.1.conv3.weight: + device: cuda:0 + max: '2.051e-01' + mean: '-1.367e-04' + min: '-1.971e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-3.584e+01' 
+network.layer3.2.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.2.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.2.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.2.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.2.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.2.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.2.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.2.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.2.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.2.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.2.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.2.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.2.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.2.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.2.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.2.conv1.weight: + device: cuda:0 + max: '3.993e-01' + mean: '-1.212e-04' + min: '-4.269e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-3.178e+01' +network.layer3.2.conv2.weight: + device: cuda:0 + max: '1.517e-01' + mean: '1.648e-05' + min: '-1.378e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '9.721e+00' +network.layer3.2.conv3.weight: + device: cuda:0 + max: '1.958e-01' + mean: '-6.993e-06' + min: '-1.987e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-1.833e+00' +network.layer3.3.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.3.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.3.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.3.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.3.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.3.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.3.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.3.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.3.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: 
'1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.3.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.3.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.3.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.3.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.3.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.3.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.3.conv1.weight: + device: cuda:0 + max: '4.290e-01' + mean: '-2.493e-04' + min: '-3.916e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-6.535e+01' +network.layer3.3.conv2.weight: + device: cuda:0 + max: '1.365e-01' + mean: '1.203e-05' + min: '-1.364e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '7.097e+00' +network.layer3.3.conv3.weight: + device: cuda:0 + max: '2.011e-01' + mean: '9.821e-05' + min: '-2.042e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '2.575e+01' +network.layer3.4.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.4.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.4.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.4.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.4.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.4.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.4.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.4.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.4.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.4.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.4.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.4.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.4.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.4.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.4.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.4.conv1.weight: + device: cuda:0 + max: '3.968e-01' + mean: '-2.179e-04' + min: '-3.871e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-5.712e+01' +network.layer3.4.conv2.weight: + device: cuda:0 + max: '1.392e-01' + mean: '-2.276e-05' + min: '-1.360e-01' + shape: + - 
256 + - 256 + - 3 + - 3 + sum: '-1.342e+01' +network.layer3.4.conv3.weight: + device: cuda:0 + max: '2.100e-01' + mean: '9.087e-05' + min: '-2.052e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '2.382e+01' +network.layer3.5.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.5.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.5.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.5.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.5.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.5.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.5.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.5.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.5.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.5.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.5.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.5.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.5.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.5.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.5.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.5.conv1.weight: + device: cuda:0 + max: '3.732e-01' + mean: '4.573e-05' + min: '-4.036e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '1.199e+01' +network.layer3.5.conv2.weight: + device: cuda:0 + max: '1.382e-01' + mean: '3.509e-05' + min: '-1.344e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '2.07e+01' +network.layer3.5.conv3.weight: + device: cuda:0 + max: '2.12e-01' + mean: '-2.857e-05' + min: '-2.015e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-7.489e+00' +network.layer4.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 
+network.layer4.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.0.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.0.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.0.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.0.conv1.weight: + device: cuda:0 + max: '2.853e-01' + mean: '2.027e-04' + min: '-2.964e-01' + shape: + - 512 + - 1024 + - 1 + - 1 + sum: '1.063e+02' +network.layer4.0.conv2.weight: + device: cuda:0 + max: '1.022e-01' + mean: '-7.219e-06' + min: '-1.115e-01' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-1.703e+01' +network.layer4.0.conv3.weight: + device: cuda:0 + max: '1.469e-01' + mean: '1.062e-05' + min: '-1.472e-01' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '1.113e+01' +network.layer4.0.downsample.0.weight: + device: cuda:0 + max: '1.643e-01' + mean: '1.053e-05' + min: '-1.525e-01' + shape: + - 2048 + - 1024 + - 1 + - 1 + sum: '2.209e+01' +network.layer4.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 
512 + sum: '5.12e+02' +network.layer4.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.1.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.1.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.1.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.1.conv1.weight: + device: cuda:0 + max: '3.313e-01' + mean: '1.118e-04' + min: '-3.093e-01' + shape: + - 512 + - 2048 + - 1 + - 1 + sum: '1.172e+02' +network.layer4.1.conv2.weight: + device: cuda:0 + max: '1.056e-01' + mean: '-1.704e-05' + min: '-1.123e-01' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-4.019e+01' +network.layer4.1.conv3.weight: + device: cuda:0 + max: '1.447e-01' + mean: '3.966e-06' + min: '-1.413e-01' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '4.158e+00' +network.layer4.2.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.2.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.2.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.2.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.2.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.2.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.2.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.2.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.2.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.2.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.2.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.2.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.2.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.2.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.2.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.2.conv1.weight: + device: cuda:0 + max: '2.966e-01' + mean: '-2.162e-05' + min: '-2.997e-01' + shape: + - 512 + - 2048 + - 1 + - 1 + sum: '-2.267e+01' +network.layer4.2.conv2.weight: + device: cuda:0 + max: '9.663e-02' + mean: '-1.553e-06' + min: '-1.052e-01' + shape: + - 512 + - 512 + - 3 + - 3 
+ sum: '-3.664e+00' +network.layer4.2.conv3.weight: + device: cuda:0 + max: '1.522e-01' + mean: '-1.257e-05' + min: '-1.512e-01' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '-1.318e+01' diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/cifar10_jax_example.yaml deleted file mode 100644 index 7b9e8b58..00000000 --- a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/cifar10_jax_example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -input: - device: cpu - hash: -1373365636602041987 - max: 2.1 - mean: -0.0 - min: -2.0 - shape: - - 128 - - 3 - - 32 - - 32 - sum: -2429.8 -out: - device: cpu - hash: 7290015411165007734 - max: 1.0 - mean: 0.1 - min: -0.8 - shape: - - 128 - - 10 - sum: 151.9 diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml deleted file mode 100644 index 913c73b8..00000000 --- a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -input: - device: cpu - hash: 9223185275738543696 - max: 2.8 - mean: 0.5 - min: -0.4 - shape: - - 128 - - 1 - - 28 - - 28 - sum: 48391.2 -out: - device: cpu - hash: 8278441553463422914 - max: 1.0 - mean: -0.0 - min: -1.0 - shape: - - 128 - - 10 - sum: -14.1 diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/cifar10_jax_example.yaml deleted file mode 100644 index 7e5c8245..00000000 --- a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/cifar10_jax_example.yaml +++ /dev/null @@ -1,80 +0,0 @@ -network.params.0: - device: cpu - hash: -4218701300434786233 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 32 - sum: 0.0 -network.params.1: - device: cpu - hash: 6448973716641827056 - max: 0.4 - mean: -0.0 - min: -0.4 - shape: - - 3 - - 3 - - 3 - - 32 - sum: -7.1 -network.params.2: - device: cpu - hash: -5258163774450544391 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 64 - sum: 0.0 -network.params.3: - device: cpu - hash: -195626296360386472 - max: 0.1 - mean: 0.0 - min: -0.1 - shape: - - 3 - - 3 - - 32 - - 64 - sum: 8.3 -network.params.4: - device: cpu - hash: 3505480816438514598 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 256 - sum: 0.0 -network.params.5: - device: cpu - hash: 7328344990793555668 - max: 0.0 - mean: 0.0 - min: -0.0 - shape: - - 4096 - - 256 - sum: 17.4 -network.params.6: - device: cpu - hash: -7222447081605638768 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 10 - sum: 0.0 -network.params.7: - device: cpu - hash: -2983191316776450796 - max: 0.1 - mean: 0.0 - min: -0.1 - shape: - - 256 - - 10 - sum: 1.8 diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/fashion_mnist_jax_example.yaml deleted file mode 100644 index deba293a..00000000 --- 
a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/fashion_mnist_jax_example.yaml +++ /dev/null @@ -1,80 +0,0 @@ -network.params.0: - device: cpu - hash: -4218701300434786233 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 32 - sum: 0.0 -network.params.1: - device: cpu - hash: -2168085942084572394 - max: 0.7 - mean: -0.0 - min: -0.7 - shape: - - 3 - - 3 - - 1 - - 32 - sum: -0.3 -network.params.2: - device: cpu - hash: -5258163774450544391 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 64 - sum: 0.0 -network.params.3: - device: cpu - hash: -195626296360386472 - max: 0.1 - mean: 0.0 - min: -0.1 - shape: - - 3 - - 3 - - 32 - - 64 - sum: 8.3 -network.params.4: - device: cpu - hash: 3505480816438514598 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 256 - sum: 0.0 -network.params.5: - device: cpu - hash: 8975080659470718874 - max: 0.0 - mean: 0.0 - min: -0.0 - shape: - - 3136 - - 256 - sum: 15.7 -network.params.6: - device: cpu - hash: -7222447081605638768 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 10 - sum: 0.0 -network.params.7: - device: cpu - hash: -2983191316776450796 - max: 0.1 - mean: 0.0 - min: -0.1 - shape: - - 256 - - 10 - sum: 1.8 diff --git a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_cnn_jax_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/cifar10_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_cnn_jax_image_classifier.yaml index abb5c072..ff422c2a 100644 --- a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/cifar10_jax_example.yaml +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_cnn_jax_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.126e+00' mean: '-6.179e-03' min: '-1.989e+00' @@ -10,7 +10,7 @@ batch.0: - 32 sum: '-2.43e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.params.0: - device: cpu + device: cuda:0 max: '9.654e-03' mean: '1.276e-03' min: '-1.148e-02' @@ -26,7 +26,7 @@ grads.network.params.0: - 32 sum: '4.083e-02' grads.network.params.1: - device: cpu + device: cuda:0 max: '1.149e-02' mean: '5.030e-04' min: '-1.473e-02' @@ -37,7 +37,7 @@ grads.network.params.1: - 32 sum: '4.346e-01' grads.network.params.2: - device: cpu + device: cuda:0 max: '1.680e-02' mean: '1.566e-03' min: '-7.296e-03' @@ -45,7 +45,7 @@ grads.network.params.2: - 64 sum: '1.002e-01' grads.network.params.3: - device: cpu + device: cuda:0 max: '2.507e-02' mean: '4.631e-04' min: '-2.280e-02' @@ -56,7 +56,7 @@ grads.network.params.3: - 64 sum: '8.536e+00' grads.network.params.4: - device: cpu + device: cuda:0 max: '1.025e-02' mean: '1.384e-04' min: '-1.082e-02' @@ -64,7 +64,7 @@ grads.network.params.4: - 256 sum: '3.542e-02' grads.network.params.5: - device: cpu + device: cuda:0 max: '3.064e-02' mean: '3.315e-05' min: '-2.379e-02' @@ -73,7 +73,7 @@ grads.network.params.5: - 256 sum: '3.476e+01' grads.network.params.6: - device: cpu + device: cuda:0 max: '2.984e-02' mean: '-5.588e-10' min: '-2.597e-02' @@ -81,16 +81,16 @@ grads.network.params.6: - 10 sum: 
'-5.588e-09' grads.network.params.7: - device: cpu + device: cuda:0 max: '4.361e-02' - mean: '-1.63e-10' + mean: '-2.154e-10' min: '-4.662e-02' shape: - 256 - 10 - sum: '-4.172e-07' + sum: '-5.513e-07' outputs.logits: - device: cpu + device: cuda:0 max: '9.608e-01' mean: '1.186e-01' min: '-7.613e-01' @@ -99,14 +99,14 @@ outputs.logits: - 10 sum: '1.519e+02' outputs.loss: - device: cpu + device: cuda:0 max: '2.341e+00' mean: '2.341e+00' min: '2.341e+00' shape: [] sum: '2.341e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..2fe6e1fa --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,77 @@ +batch.0: + device: cuda:0 + max: '2.126e+00' + mean: '-6.179e-03' + min: '-1.989e+00' + shape: + - 128 + - 3 + - 32 + - 32 + sum: '-2.43e+03' +batch.1: + device: cuda:0 + max: 9 + mean: '4.555e+00' + min: 0 + shape: + - 128 + sum: 583 +grads.network.params.0: + device: cuda:0 + max: '1.552e-02' + mean: '8.602e-04' + min: '-9.862e-03' + shape: + - 256 + sum: '2.202e-01' +grads.network.params.1: + device: cuda:0 + max: '2.677e-02' + mean: '1.968e-05' + min: '-2.576e-02' + shape: + - 3072 + - 256 + sum: '1.548e+01' +grads.network.params.2: + device: cuda:0 + max: '6.868e-02' + mean: '0.e+00' + min: '-3.458e-02' + shape: + - 10 + sum: '0.e+00' +grads.network.params.3: + device: cuda:0 + max: '1.497e-01' + mean: '-2.445e-10' + min: '-1.415e-01' + shape: + - 256 + - 10 + sum: '-6.258e-07' +outputs.logits: + device: cuda:0 + max: '2.380e+00' + mean: '5.809e-02' + min: '-3.135e+00' + shape: + - 128 + - 10 + sum: '7.436e+01' +outputs.loss: + device: cuda:0 + max: '2.466e+00' + mean: '2.466e+00' + min: '2.466e+00' + shape: [] + sum: '2.466e+00' +outputs.y: + device: cuda:0 + max: 9 + mean: '4.555e+00' + min: 0 + shape: + - 128 + sum: 583 diff --git a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_cnn_jax_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_cnn_jax_image_classifier.yaml index bdc2a02f..7b7a7623 100644 --- a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_cnn_jax_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.821e+00' mean: '4.822e-01' min: '-4.242e-01' @@ -10,7 +10,7 @@ batch.0: - 28 sum: '4.839e+04' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.params.0: - device: cpu + device: cuda:0 max: '1.949e-02' mean: '4.526e-03' min: '-1.615e-02' @@ -26,7 +26,7 @@ 
grads.network.params.0: - 32 sum: '1.448e-01' grads.network.params.1: - device: cpu + device: cuda:0 max: '4.36e-02' mean: '5.924e-03' min: '-3.013e-02' @@ -37,7 +37,7 @@ grads.network.params.1: - 32 sum: '1.706e+00' grads.network.params.2: - device: cpu + device: cuda:0 max: '2.734e-02' mean: '1.847e-03' min: '-1.76e-02' @@ -45,7 +45,7 @@ grads.network.params.2: - 64 sum: '1.182e-01' grads.network.params.3: - device: cpu + device: cuda:0 max: '6.099e-02' mean: '1.127e-03' min: '-5.833e-02' @@ -56,7 +56,7 @@ grads.network.params.3: - 64 sum: '2.077e+01' grads.network.params.4: - device: cpu + device: cuda:0 max: '2.451e-02' mean: '1.065e-03' min: '-1.999e-02' @@ -64,7 +64,7 @@ grads.network.params.4: - 256 sum: '2.727e-01' grads.network.params.5: - device: cpu + device: cuda:0 max: '7.691e-02' mean: '3.075e-04' min: '-6.106e-02' @@ -73,7 +73,7 @@ grads.network.params.5: - 256 sum: '2.469e+02' grads.network.params.6: - device: cpu + device: cuda:0 max: '5.898e-02' mean: '-1.863e-09' min: '-7.022e-02' @@ -81,16 +81,16 @@ grads.network.params.6: - 10 sum: '-1.863e-08' grads.network.params.7: - device: cpu + device: cuda:0 max: '1.382e-01' - mean: '-5.821e-11' + mean: '-1.775e-10' min: '-1.376e-01' shape: - 256 - 10 - sum: '-1.490e-07' + sum: '-4.545e-07' outputs.logits: - device: cpu + device: cuda:0 max: '1.032e+00' mean: '-1.1e-02' min: '-9.602e-01' @@ -99,14 +99,14 @@ outputs.logits: - 10 sum: '-1.408e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.385e+00' mean: '2.385e+00' min: '2.385e+00' shape: [] sum: '2.385e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..7a36defc --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,77 @@ +batch.0: + device: cuda:0 + max: '2.821e+00' + mean: '4.822e-01' + min: '-4.242e-01' + shape: + - 128 + - 1 + - 28 + - 28 + sum: '4.839e+04' +batch.1: + device: cuda:0 + max: 9 + mean: '4.555e+00' + min: 0 + shape: + - 128 + sum: 583 +grads.network.params.0: + device: cuda:0 + max: '2.188e-02' + mean: '8.325e-04' + min: '-2.096e-02' + shape: + - 256 + sum: '2.131e-01' +grads.network.params.1: + device: cuda:0 + max: '5.304e-02' + mean: '4.879e-04' + min: '-4.886e-02' + shape: + - 784 + - 256 + sum: '9.792e+01' +grads.network.params.2: + device: cuda:0 + max: '1.375e-01' + mean: '0.e+00' + min: '-9.162e-02' + shape: + - 10 + sum: '0.e+00' +grads.network.params.3: + device: cuda:0 + max: '3.990e-01' + mean: '-1.106e-10' + min: '-2.054e-01' + shape: + - 256 + - 10 + sum: '-2.831e-07' +outputs.logits: + device: cuda:0 + max: '2.656e+00' + mean: '2.355e-02' + min: '-2.715e+00' + shape: + - 128 + - 10 + sum: '3.015e+01' +outputs.loss: + device: cuda:0 + max: '2.554e+00' + mean: '2.554e+00' + min: '2.554e+00' + shape: [] + sum: '2.554e+00' +outputs.y: + device: cuda:0 + max: 9 + mean: '4.555e+00' + min: 0 + shape: + - 128 + sum: 583 diff --git a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/mnist_jax_example.yaml 
b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_cnn_jax_image_classifier.yaml similarity index 82% rename from .regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_cnn_jax_image_classifier.yaml index f4c17e52..d41f869b 100644 --- a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/mnist_jax_example.yaml +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_cnn_jax_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.821e+00' mean: '1.432e-02' min: '-4.242e-01' @@ -10,7 +10,7 @@ batch.0: - 28 sum: '1.437e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.242e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 543 grads.network.params.0: - device: cpu + device: cuda:0 max: '1.65e-02' mean: '2.109e-03' min: '-8.628e-03' @@ -26,7 +26,7 @@ grads.network.params.0: - 32 sum: '6.748e-02' grads.network.params.1: - device: cpu + device: cuda:0 max: '1.893e-02' mean: '-1.55e-05' min: '-1.627e-02' @@ -37,7 +37,7 @@ grads.network.params.1: - 32 sum: '-4.463e-03' grads.network.params.2: - device: cpu + device: cuda:0 max: '2.053e-02' mean: '1.196e-03' min: '-1.783e-02' @@ -45,7 +45,7 @@ grads.network.params.2: - 64 sum: '7.653e-02' grads.network.params.3: - device: cpu + device: cuda:0 max: '2.25e-02' mean: '3.613e-04' min: '-2.352e-02' @@ -56,7 +56,7 @@ grads.network.params.3: - 64 sum: '6.659e+00' grads.network.params.4: - device: cpu + device: cuda:0 max: '2.231e-02' mean: '2.332e-04' min: '-2.018e-02' @@ -64,7 +64,7 @@ grads.network.params.4: - 256 sum: '5.970e-02' grads.network.params.5: - device: cpu + device: cuda:0 max: '5.356e-02' mean: '3.131e-05' min: '-4.563e-02' @@ -73,24 +73,24 @@ grads.network.params.5: - 256 sum: '2.514e+01' grads.network.params.6: - device: cpu + device: cuda:0 max: '6.484e-02' - mean: '-1.397e-09' + mean: '-1.490e-09' min: '-8.046e-02' shape: - 10 - sum: '-1.397e-08' + sum: '-1.490e-08' grads.network.params.7: - device: cpu + device: cuda:0 max: '7.496e-02' - mean: '-3.376e-10' + mean: '-3.361e-10' min: '-8.565e-02' shape: - 256 - 10 - sum: '-8.643e-07' + sum: '-8.605e-07' outputs.logits: - device: cpu + device: cuda:0 max: '8.092e-01' mean: '-2.764e-02' min: '-1.135e+00' @@ -99,14 +99,14 @@ outputs.logits: - 10 sum: '-3.538e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.303e+00' mean: '2.303e+00' min: '2.303e+00' shape: [] sum: '2.303e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.242e+00' min: 0 diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..b1219522 --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,77 @@ +batch.0: + device: cuda:0 + max: '2.821e+00' + mean: '1.432e-02' + min: '-4.242e-01' + shape: + - 128 + - 1 + - 28 + - 28 + sum: '1.437e+03' +batch.1: + device: cuda:0 + max: 9 + mean: '4.242e+00' + min: 0 + shape: + - 128 + sum: 543 +grads.network.params.0: + 
device: cuda:0 + max: '1.386e-02' + mean: '8.019e-04' + min: '-1.326e-02' + shape: + - 256 + sum: '2.053e-01' +grads.network.params.1: + device: cuda:0 + max: '3.122e-02' + mean: '-1.002e-04' + min: '-3.579e-02' + shape: + - 784 + - 256 + sum: '-2.012e+01' +grads.network.params.2: + device: cuda:0 + max: '4.549e-02' + mean: '0.e+00' + min: '-7.537e-02' + shape: + - 10 + sum: '0.e+00' +grads.network.params.3: + device: cuda:0 + max: '7.07e-02' + mean: '-5.821e-11' + min: '-1.064e-01' + shape: + - 256 + - 10 + sum: '-1.490e-07' +outputs.logits: + device: cuda:0 + max: '1.85e+00' + mean: '6.708e-02' + min: '-1.919e+00' + shape: + - 128 + - 10 + sum: '8.586e+01' +outputs.loss: + device: cuda:0 + max: '2.398e+00' + mean: '2.398e+00' + min: '2.398e+00' + shape: [] + sum: '2.398e+00' +outputs.y: + device: cuda:0 + max: 9 + mean: '4.242e+00' + min: 0 + shape: + - 128 + sum: 543 diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..c73fe9ab --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.126e+00' + mean: '-6.179e-03' + min: '-1.989e+00' + shape: + - 128 + - 3 + - 32 + - 32 + sum: '-2.43e+03' +out: + device: cuda:0 + max: '2.380e+00' + mean: '5.809e-02' + min: '-3.135e+00' + shape: + - 128 + - 10 + sum: '7.436e+01' diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..7e489df5 --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.821e+00' + mean: '4.822e-01' + min: '-4.242e-01' + shape: + - 128 + - 1 + - 28 + - 28 + 
sum: '4.839e+04' +out: + device: cuda:0 + max: '2.656e+00' + mean: '2.355e-02' + min: '-2.715e+00' + shape: + - 128 + - 10 + sum: '3.015e+01' diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..5659f1e9 --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.821e+00' + mean: '1.432e-02' + min: '-4.242e-01' + shape: + - 128 + - 1 + - 28 + - 28 + sum: '1.437e+03' +out: + device: cuda:0 + max: '1.85e+00' + mean: '6.708e-02' + min: '-1.919e+00' + shape: + - 128 + - 10 + sum: '8.586e+01' diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/cifar10_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..178d3b7e --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,34 @@ +network.params.0: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.params.1: + device: cuda:0 + max: '4.102e-02' + mean: '2.969e-05' + min: '-4.102e-02' + shape: + - 3072 + - 256 + sum: '2.335e+01' +network.params.2: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 10 + sum: '0.e+00' +network.params.3: + device: cuda:0 + max: '1.421e-01' + mean: '7.197e-04' + min: '-1.416e-01' + shape: + - 256 + - 10 + sum: '1.842e+00' diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from 
.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..b29367ad --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,34 @@ +network.params.0: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.params.1: + device: cuda:0 + max: '8.120e-02' + mean: '-2.572e-05' + min: '-8.120e-02' + shape: + - 784 + - 256 + sum: '-5.162e+00' +network.params.2: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 10 + sum: '0.e+00' +network.params.3: + device: cuda:0 + max: '1.421e-01' + mean: '7.197e-04' + min: '-1.416e-01' + shape: + - 256 + - 10 + sum: '1.842e+00' diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..b29367ad --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,34 @@ +network.params.0: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.params.1: + device: cuda:0 + max: '8.120e-02' + mean: '-2.572e-05' + min: '-8.120e-02' + shape: + - 784 + - 256 + sum: '-5.162e+00' +network.params.2: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 10 + sum: '0.e+00' +network.params.3: + device: cuda:0 + max: '1.421e-01' + mean: '7.197e-04' + min: '-1.416e-01' + shape: + - 256 + - 10 + sum: '1.842e+00' diff --git a/.regression_files/project/algorithms/jax_rl_example_test/test_lightning/123_Pendulum_v1_15.yaml b/.regression_files/project/algorithms/jax_ppo_test/test_lightning/Pendulum_v1_42_15.yaml similarity index 60% rename from .regression_files/project/algorithms/jax_rl_example_test/test_lightning/123_Pendulum_v1_15.yaml rename to .regression_files/project/algorithms/jax_ppo_test/test_lightning/Pendulum_v1_42_15.yaml index e70ed343..a47898ea 100644 --- a/.regression_files/project/algorithms/jax_rl_example_test/test_lightning/123_Pendulum_v1_15.yaml 
+++ b/.regression_files/project/algorithms/jax_ppo_test/test_lightning/Pendulum_v1_42_15.yaml @@ -5,8 +5,8 @@ val/episode_lengths: shape: [] sum: '2.e+02' val/rewards: - max: '-1.222e+03' - mean: '-1.222e+03' - min: '-1.222e+03' + max: '-9.099e+02' + mean: '-9.099e+02' + min: '-9.099e+02' shape: [] - sum: '-1.222e+03' + sum: '-9.099e+02' diff --git a/.regression_files/project/algorithms/jax_rl_example_test/test_ours_with_trainer/123_Pendulum_v1.yaml b/.regression_files/project/algorithms/jax_ppo_test/test_ours/42__123__Pendulum_v1.yaml similarity index 61% rename from .regression_files/project/algorithms/jax_rl_example_test/test_ours_with_trainer/123_Pendulum_v1.yaml rename to .regression_files/project/algorithms/jax_ppo_test/test_ours/42__123__Pendulum_v1.yaml index d83973a5..113d223f 100644 --- a/.regression_files/project/algorithms/jax_rl_example_test/test_ours_with_trainer/123_Pendulum_v1.yaml +++ b/.regression_files/project/algorithms/jax_ppo_test/test_ours/42__123__Pendulum_v1.yaml @@ -1,16 +1,18 @@ cumulative_reward: - max: '-6.495e+02' - mean: '-1.229e+03' + max: '-7.835e-01' + mean: '-9.323e+02' min: '-1.878e+03' shape: + - 2 - 76 - 128 - sum: '-1.196e+07' + sum: '-1.814e+07' episode_length: max: 200 mean: '2.e+02' min: 200 shape: + - 2 - 76 - 128 - sum: 1945600 + sum: 3891200 diff --git a/.regression_files/project/algorithms/jax_rl_example_test/test_ours/123_Pendulum_v1.yaml b/.regression_files/project/algorithms/jax_ppo_test/test_ours_with_trainer/42__123__Pendulum_v1.yaml similarity index 61% rename from .regression_files/project/algorithms/jax_rl_example_test/test_ours/123_Pendulum_v1.yaml rename to .regression_files/project/algorithms/jax_ppo_test/test_ours_with_trainer/42__123__Pendulum_v1.yaml index d83973a5..113d223f 100644 --- a/.regression_files/project/algorithms/jax_rl_example_test/test_ours/123_Pendulum_v1.yaml +++ b/.regression_files/project/algorithms/jax_ppo_test/test_ours_with_trainer/42__123__Pendulum_v1.yaml @@ -1,16 +1,18 @@ cumulative_reward: - max: '-6.495e+02' - mean: '-1.229e+03' + max: '-7.835e-01' + mean: '-9.323e+02' min: '-1.878e+03' shape: + - 2 - 76 - 128 - sum: '-1.196e+07' + sum: '-1.814e+07' episode_length: max: 200 mean: '2.e+02' min: 200 shape: + - 2 - 76 - 128 - sum: 1945600 + sum: 3891200 diff --git a/.regression_files/project/algorithms/jax_rl_example_test/test_rejax/123_Pendulum_v1.yaml b/.regression_files/project/algorithms/jax_ppo_test/test_rejax/42__123__Pendulum_v1.yaml similarity index 61% rename from .regression_files/project/algorithms/jax_rl_example_test/test_rejax/123_Pendulum_v1.yaml rename to .regression_files/project/algorithms/jax_ppo_test/test_rejax/42__123__Pendulum_v1.yaml index 8b29ccb9..bf24f361 100644 --- a/.regression_files/project/algorithms/jax_rl_example_test/test_rejax/123_Pendulum_v1.yaml +++ b/.regression_files/project/algorithms/jax_ppo_test/test_rejax/42__123__Pendulum_v1.yaml @@ -1,16 +1,18 @@ cumulative_reward: - max: '-4.319e-01' - mean: '-5.755e+02' + max: '-3.978e-01' + mean: '-5.231e+02' min: '-1.872e+03' shape: + - 2 - 76 - 128 - sum: '-5.599e+06' + sum: '-1.018e+07' episode_length: max: 200 mean: '2.e+02' min: 200 shape: + - 2 - 76 - 128 - sum: 1945600 + sum: 3891200 diff --git a/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml new file mode 100644 index 00000000..e1932620 --- /dev/null +++ 
b/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml @@ -0,0 +1,3286 @@ +batch.attention_mask: + device: cuda:0 + max: 1 + mean: '1.e+00' + min: 1 + shape: + - 8 + - 256 + sum: 2048 +batch.input_ids: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +batch.labels: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +grads.network.model.decoder.embed_positions.weight: + device: cuda:0 + max: '2.549e-02' + mean: '2.795e-07' + min: '-2.530e-02' + shape: + - 2050 + - 1024 + sum: '5.867e-01' +grads.network.model.decoder.embed_tokens.weight: + device: cuda:0 + max: '7.65e-01' + mean: '-2.928e-07' + min: '-9.832e-01' + shape: + - 50272 + - 512 + sum: '-7.537e+00' +grads.network.model.decoder.layers.0.fc1.bias: + device: cuda:0 + max: '2.624e-03' + mean: '-2.445e-06' + min: '-8.882e-03' + shape: + - 4096 + sum: '-1.001e-02' +grads.network.model.decoder.layers.0.fc1.weight: + device: cuda:0 + max: '8.724e-02' + mean: '4.963e-09' + min: '-1.222e-01' + shape: + - 4096 + - 1024 + sum: '2.082e-02' +grads.network.model.decoder.layers.0.fc2.bias: + device: cuda:0 + max: '1.031e-02' + mean: '7.276e-12' + min: '-1.265e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.0.fc2.weight: + device: cuda:0 + max: '1.836e-02' + mean: '0.e+00' + min: '-1.480e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.0.final_layer_norm.bias: + device: cuda:0 + max: '1.124e-02' + mean: '2.244e-06' + min: '-1.343e-02' + shape: + - 1024 + sum: '2.298e-03' +grads.network.model.decoder.layers.0.final_layer_norm.weight: + device: cuda:0 + max: '9.238e-03' + mean: '-1.765e-05' + min: '-5.406e-02' + shape: + - 1024 + sum: '-1.807e-02' +grads.network.model.decoder.layers.0.self_attn.k_proj.bias: + device: cuda:0 + max: '1.455e-10' + mean: '1.036e-12' + min: '-1.673e-10' + shape: + - 1024 + sum: '1.061e-09' +grads.network.model.decoder.layers.0.self_attn.k_proj.weight: + device: cuda:0 + max: '1.895e-04' + mean: '6.07e-11' + min: '-1.679e-04' + shape: + - 1024 + - 1024 + sum: '6.365e-05' +grads.network.model.decoder.layers.0.self_attn.out_proj.bias: + device: cuda:0 + max: '2.459e-01' + mean: '-8.149e-10' + min: '-2.594e-01' + shape: + - 1024 + sum: '-8.345e-07' +grads.network.model.decoder.layers.0.self_attn.out_proj.weight: + device: cuda:0 + max: '7.433e-03' + mean: '1.705e-13' + min: '-7.011e-03' + shape: + - 1024 + - 1024 + sum: '1.788e-07' +grads.network.model.decoder.layers.0.self_attn.q_proj.bias: + device: cuda:0 + max: '4.872e-04' + mean: '3.458e-07' + min: '-5.13e-04' + shape: + - 1024 + sum: '3.541e-04' +grads.network.model.decoder.layers.0.self_attn.q_proj.weight: + device: cuda:0 + max: '3.873e-04' + mean: '3.472e-09' + min: '-4.093e-04' + shape: + - 1024 + - 1024 + sum: '3.641e-03' +grads.network.model.decoder.layers.0.self_attn.v_proj.bias: + device: cuda:0 + max: '1.222e-01' + mean: '5.112e-04' + min: '-1.374e-01' + shape: + - 1024 + sum: '5.235e-01' +grads.network.model.decoder.layers.0.self_attn.v_proj.weight: + device: cuda:0 + max: '7.942e-02' + mean: '3.069e-07' + min: '-7.008e-02' + shape: + - 1024 + - 1024 + sum: '3.218e-01' +grads.network.model.decoder.layers.0.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.182e-02' + mean: '-1.809e-05' + min: '-1.26e-02' + shape: + - 1024 + sum: '-1.852e-02' +grads.network.model.decoder.layers.0.self_attn_layer_norm.weight: + device: cuda:0 + max: 
'9.642e-03' + mean: '-9.916e-07' + min: '-4.965e-02' + shape: + - 1024 + sum: '-1.015e-03' +grads.network.model.decoder.layers.1.fc1.bias: + device: cuda:0 + max: '5.562e-03' + mean: '-1.470e-06' + min: '-7.369e-03' + shape: + - 4096 + sum: '-6.023e-03' +grads.network.model.decoder.layers.1.fc1.weight: + device: cuda:0 + max: '6.877e-02' + mean: '2.984e-09' + min: '-9.409e-02' + shape: + - 4096 + - 1024 + sum: '1.251e-02' +grads.network.model.decoder.layers.1.fc2.bias: + device: cuda:0 + max: '1.038e-02' + mean: '1.819e-11' + min: '-1.155e-02' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.1.fc2.weight: + device: cuda:0 + max: '1.431e-02' + mean: '2.558e-13' + min: '-1.138e-02' + shape: + - 1024 + - 4096 + sum: '1.073e-06' +grads.network.model.decoder.layers.1.final_layer_norm.bias: + device: cuda:0 + max: '1.17e-02' + mean: '-9.708e-05' + min: '-1.293e-02' + shape: + - 1024 + sum: '-9.941e-02' +grads.network.model.decoder.layers.1.final_layer_norm.weight: + device: cuda:0 + max: '1.304e-02' + mean: '1.814e-05' + min: '-3.518e-02' + shape: + - 1024 + sum: '1.858e-02' +grads.network.model.decoder.layers.1.self_attn.k_proj.bias: + device: cuda:0 + max: '6.403e-10' + mean: '6.279e-13' + min: '-1.397e-09' + shape: + - 1024 + sum: '6.430e-10' +grads.network.model.decoder.layers.1.self_attn.k_proj.weight: + device: cuda:0 + max: '3.312e-02' + mean: '3.22e-15' + min: '-3.174e-02' + shape: + - 1024 + - 1024 + sum: '3.376e-09' +grads.network.model.decoder.layers.1.self_attn.out_proj.bias: + device: cuda:0 + max: '9.799e-03' + mean: '2.183e-11' + min: '-1.048e-02' + shape: + - 1024 + sum: '2.235e-08' +grads.network.model.decoder.layers.1.self_attn.out_proj.weight: + device: cuda:0 + max: '1.020e-02' + mean: '-1.705e-13' + min: '-1.033e-02' + shape: + - 1024 + - 1024 + sum: '-1.788e-07' +grads.network.model.decoder.layers.1.self_attn.q_proj.bias: + device: cuda:0 + max: '1.236e-03' + mean: '-3.821e-06' + min: '-2.06e-03' + shape: + - 1024 + sum: '-3.913e-03' +grads.network.model.decoder.layers.1.self_attn.q_proj.weight: + device: cuda:0 + max: '1.833e-02' + mean: '-2.680e-08' + min: '-1.194e-02' + shape: + - 1024 + - 1024 + sum: '-2.811e-02' +grads.network.model.decoder.layers.1.self_attn.v_proj.bias: + device: cuda:0 + max: '1.296e-02' + mean: '1.047e-04' + min: '-9.251e-03' + shape: + - 1024 + sum: '1.072e-01' +grads.network.model.decoder.layers.1.self_attn.v_proj.weight: + device: cuda:0 + max: '2.234e-01' + mean: '7.347e-07' + min: '-1.650e-01' + shape: + - 1024 + - 1024 + sum: '7.704e-01' +grads.network.model.decoder.layers.1.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.000e-02' + mean: '-4.235e-05' + min: '-1.078e-02' + shape: + - 1024 + sum: '-4.337e-02' +grads.network.model.decoder.layers.1.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.163e-02' + mean: '5.549e-06' + min: '-3.955e-02' + shape: + - 1024 + sum: '5.682e-03' +grads.network.model.decoder.layers.10.fc1.bias: + device: cuda:0 + max: '1.167e-02' + mean: '-1.093e-05' + min: '-4.407e-03' + shape: + - 4096 + sum: '-4.475e-02' +grads.network.model.decoder.layers.10.fc1.weight: + device: cuda:0 + max: '1.255e-01' + mean: '-1.298e-08' + min: '-2.335e-01' + shape: + - 4096 + - 1024 + sum: '-5.445e-02' +grads.network.model.decoder.layers.10.fc2.bias: + device: cuda:0 + max: '9.324e-03' + mean: '3.638e-12' + min: '-9.376e-03' + shape: + - 1024 + sum: '3.725e-09' +grads.network.model.decoder.layers.10.fc2.weight: + device: cuda:0 + max: '1.888e-02' + mean: '1.137e-13' + min: '-1.95e-02' + shape: + - 
1024 + - 4096 + sum: '4.768e-07' +grads.network.model.decoder.layers.10.final_layer_norm.bias: + device: cuda:0 + max: '1.063e-02' + mean: '1.763e-04' + min: '-1.049e-02' + shape: + - 1024 + sum: '1.805e-01' +grads.network.model.decoder.layers.10.final_layer_norm.weight: + device: cuda:0 + max: '1.245e-02' + mean: '1.566e-05' + min: '-1.95e-02' + shape: + - 1024 + sum: '1.604e-02' +grads.network.model.decoder.layers.10.self_attn.k_proj.bias: + device: cuda:0 + max: '1.863e-09' + mean: '-8.787e-12' + min: '-1.164e-09' + shape: + - 1024 + sum: '-8.998e-09' +grads.network.model.decoder.layers.10.self_attn.k_proj.weight: + device: cuda:0 + max: '1.065e-01' + mean: '1.164e-13' + min: '-1.330e-01' + shape: + - 1024 + - 1024 + sum: '1.220e-07' +grads.network.model.decoder.layers.10.self_attn.out_proj.bias: + device: cuda:0 + max: '8.365e-03' + mean: '1.819e-11' + min: '-8.918e-03' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.10.self_attn.out_proj.weight: + device: cuda:0 + max: '7.876e-03' + mean: '3.126e-13' + min: '-7.644e-03' + shape: + - 1024 + - 1024 + sum: '3.278e-07' +grads.network.model.decoder.layers.10.self_attn.q_proj.bias: + device: cuda:0 + max: '3.907e-03' + mean: '-1.607e-05' + min: '-4.692e-03' + shape: + - 1024 + sum: '-1.645e-02' +grads.network.model.decoder.layers.10.self_attn.q_proj.weight: + device: cuda:0 + max: '3.358e-02' + mean: '1.291e-07' + min: '-4.45e-02' + shape: + - 1024 + - 1024 + sum: '1.354e-01' +grads.network.model.decoder.layers.10.self_attn.v_proj.bias: + device: cuda:0 + max: '9.312e-03' + mean: '-8.616e-05' + min: '-9.148e-03' + shape: + - 1024 + sum: '-8.822e-02' +grads.network.model.decoder.layers.10.self_attn.v_proj.weight: + device: cuda:0 + max: '2.466e-01' + mean: '6.922e-07' + min: '-2.438e-01' + shape: + - 1024 + - 1024 + sum: '7.259e-01' +grads.network.model.decoder.layers.10.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.563e-03' + mean: '-2.205e-05' + min: '-9.231e-03' + shape: + - 1024 + sum: '-2.258e-02' +grads.network.model.decoder.layers.10.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.004e-02' + mean: '8.82e-06' + min: '-2.064e-02' + shape: + - 1024 + sum: '9.032e-03' +grads.network.model.decoder.layers.11.fc1.bias: + device: cuda:0 + max: '4.537e-03' + mean: '-1.97e-05' + min: '-1.077e-02' + shape: + - 4096 + sum: '-8.069e-02' +grads.network.model.decoder.layers.11.fc1.weight: + device: cuda:0 + max: '1.921e-01' + mean: '-8.097e-08' + min: '-1.258e-01' + shape: + - 4096 + - 1024 + sum: '-3.396e-01' +grads.network.model.decoder.layers.11.fc2.bias: + device: cuda:0 + max: '9.747e-03' + mean: '0.e+00' + min: '-1.146e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.11.fc2.weight: + device: cuda:0 + max: '2.297e-02' + mean: '-2.274e-13' + min: '-2.611e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.11.final_layer_norm.bias: + device: cuda:0 + max: '1.074e-02' + mean: '-1.697e-04' + min: '-1.309e-02' + shape: + - 1024 + sum: '-1.738e-01' +grads.network.model.decoder.layers.11.final_layer_norm.weight: + device: cuda:0 + max: '4.611e-02' + mean: '-1.405e-05' + min: '-1.679e-02' + shape: + - 1024 + sum: '-1.439e-02' +grads.network.model.decoder.layers.11.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '3.897e-12' + min: '-5.239e-10' + shape: + - 1024 + sum: '3.990e-09' +grads.network.model.decoder.layers.11.self_attn.k_proj.weight: + device: cuda:0 + max: '3.695e-02' + mean: '-2.855e-13' + min: '-3.176e-02' + shape: + - 
1024 + - 1024 + sum: '-2.994e-07' +grads.network.model.decoder.layers.11.self_attn.out_proj.bias: + device: cuda:0 + max: '1.050e-02' + mean: '1.819e-12' + min: '-1.04e-02' + shape: + - 1024 + sum: '1.863e-09' +grads.network.model.decoder.layers.11.self_attn.out_proj.weight: + device: cuda:0 + max: '4.005e-03' + mean: '-4.619e-14' + min: '-3.44e-03' + shape: + - 1024 + - 1024 + sum: '-4.843e-08' +grads.network.model.decoder.layers.11.self_attn.q_proj.bias: + device: cuda:0 + max: '1.21e-03' + mean: '-1.349e-05' + min: '-2.133e-03' + shape: + - 1024 + sum: '-1.382e-02' +grads.network.model.decoder.layers.11.self_attn.q_proj.weight: + device: cuda:0 + max: '2.495e-02' + mean: '1.265e-07' + min: '-2.483e-02' + shape: + - 1024 + - 1024 + sum: '1.326e-01' +grads.network.model.decoder.layers.11.self_attn.v_proj.bias: + device: cuda:0 + max: '9.094e-03' + mean: '-1.657e-05' + min: '-1.120e-02' + shape: + - 1024 + sum: '-1.697e-02' +grads.network.model.decoder.layers.11.self_attn.v_proj.weight: + device: cuda:0 + max: '2.806e-01' + mean: '1.554e-07' + min: '-2.307e-01' + shape: + - 1024 + - 1024 + sum: '1.629e-01' +grads.network.model.decoder.layers.11.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.090e-02' + mean: '4.103e-05' + min: '-1.074e-02' + shape: + - 1024 + sum: '4.202e-02' +grads.network.model.decoder.layers.11.self_attn_layer_norm.weight: + device: cuda:0 + max: '9.913e-03' + mean: '8.734e-06' + min: '-2.563e-02' + shape: + - 1024 + sum: '8.943e-03' +grads.network.model.decoder.layers.12.fc1.bias: + device: cuda:0 + max: '4.174e-03' + mean: '-9.494e-06' + min: '-5.266e-03' + shape: + - 4096 + sum: '-3.889e-02' +grads.network.model.decoder.layers.12.fc1.weight: + device: cuda:0 + max: '1.308e-01' + mean: '-4.169e-08' + min: '-1.225e-01' + shape: + - 4096 + - 1024 + sum: '-1.749e-01' +grads.network.model.decoder.layers.12.fc2.bias: + device: cuda:0 + max: '9.381e-03' + mean: '0.e+00' + min: '-9.925e-03' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.12.fc2.weight: + device: cuda:0 + max: '1.477e-02' + mean: '-1.137e-13' + min: '-1.799e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.12.final_layer_norm.bias: + device: cuda:0 + max: '1.085e-02' + mean: '-6.289e-05' + min: '-1.164e-02' + shape: + - 1024 + sum: '-6.440e-02' +grads.network.model.decoder.layers.12.final_layer_norm.weight: + device: cuda:0 + max: '2.347e-02' + mean: '1.717e-05' + min: '-3.135e-02' + shape: + - 1024 + sum: '1.758e-02' +grads.network.model.decoder.layers.12.self_attn.k_proj.bias: + device: cuda:0 + max: '6.694e-10' + mean: '8.309e-13' + min: '-4.948e-10' + shape: + - 1024 + sum: '8.508e-10' +grads.network.model.decoder.layers.12.self_attn.k_proj.weight: + device: cuda:0 + max: '7.397e-02' + mean: '-2.175e-13' + min: '-9.768e-02' + shape: + - 1024 + - 1024 + sum: '-2.281e-07' +grads.network.model.decoder.layers.12.self_attn.out_proj.bias: + device: cuda:0 + max: '9.249e-03' + mean: '-7.276e-12' + min: '-9.731e-03' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.12.self_attn.out_proj.weight: + device: cuda:0 + max: '4.412e-03' + mean: '1.421e-13' + min: '-4.588e-03' + shape: + - 1024 + - 1024 + sum: '1.490e-07' +grads.network.model.decoder.layers.12.self_attn.q_proj.bias: + device: cuda:0 + max: '3.407e-03' + mean: '2.445e-05' + min: '-1.779e-03' + shape: + - 1024 + sum: '2.504e-02' +grads.network.model.decoder.layers.12.self_attn.q_proj.weight: + device: cuda:0 + max: '4.225e-02' + mean: '-3.557e-07' + min: '-4.189e-02' 
+ shape: + - 1024 + - 1024 + sum: '-3.729e-01' +grads.network.model.decoder.layers.12.self_attn.v_proj.bias: + device: cuda:0 + max: '8.426e-03' + mean: '2.616e-05' + min: '-1.041e-02' + shape: + - 1024 + sum: '2.679e-02' +grads.network.model.decoder.layers.12.self_attn.v_proj.weight: + device: cuda:0 + max: '2.573e-01' + mean: '-3.806e-07' + min: '-2.223e-01' + shape: + - 1024 + - 1024 + sum: '-3.990e-01' +grads.network.model.decoder.layers.12.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.540e-03' + mean: '1.539e-05' + min: '-1.009e-02' + shape: + - 1024 + sum: '1.576e-02' +grads.network.model.decoder.layers.12.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.112e-02' + mean: '6.956e-06' + min: '-3.292e-02' + shape: + - 1024 + sum: '7.123e-03' +grads.network.model.decoder.layers.13.fc1.bias: + device: cuda:0 + max: '4.255e-03' + mean: '-6.284e-06' + min: '-3.659e-03' + shape: + - 4096 + sum: '-2.574e-02' +grads.network.model.decoder.layers.13.fc1.weight: + device: cuda:0 + max: '9.864e-02' + mean: '-1.925e-08' + min: '-8.668e-02' + shape: + - 4096 + - 1024 + sum: '-8.074e-02' +grads.network.model.decoder.layers.13.fc2.bias: + device: cuda:0 + max: '8.901e-03' + mean: '-9.095e-12' + min: '-9.272e-03' + shape: + - 1024 + sum: '-9.313e-09' +grads.network.model.decoder.layers.13.fc2.weight: + device: cuda:0 + max: '9.958e-03' + mean: '-1.137e-13' + min: '-1.159e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.13.final_layer_norm.bias: + device: cuda:0 + max: '1.098e-02' + mean: '1.136e-04' + min: '-1.088e-02' + shape: + - 1024 + sum: '1.163e-01' +grads.network.model.decoder.layers.13.final_layer_norm.weight: + device: cuda:0 + max: '3.056e-02' + mean: '2.505e-06' + min: '-2.49e-02' + shape: + - 1024 + sum: '2.565e-03' +grads.network.model.decoder.layers.13.self_attn.k_proj.bias: + device: cuda:0 + max: '3.056e-10' + mean: '-3.326e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '-3.406e-09' +grads.network.model.decoder.layers.13.self_attn.k_proj.weight: + device: cuda:0 + max: '3.654e-02' + mean: '2.432e-13' + min: '-4.357e-02' + shape: + - 1024 + - 1024 + sum: '2.551e-07' +grads.network.model.decoder.layers.13.self_attn.out_proj.bias: + device: cuda:0 + max: '7.424e-03' + mean: '-3.638e-12' + min: '-9.317e-03' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.13.self_attn.out_proj.weight: + device: cuda:0 + max: '3.228e-03' + mean: '7.105e-14' + min: '-2.774e-03' + shape: + - 1024 + - 1024 + sum: '7.451e-08' +grads.network.model.decoder.layers.13.self_attn.q_proj.bias: + device: cuda:0 + max: '2.412e-03' + mean: '1.546e-05' + min: '-1.678e-03' + shape: + - 1024 + sum: '1.583e-02' +grads.network.model.decoder.layers.13.self_attn.q_proj.weight: + device: cuda:0 + max: '1.646e-02' + mean: '-2.364e-07' + min: '-1.986e-02' + shape: + - 1024 + - 1024 + sum: '-2.479e-01' +grads.network.model.decoder.layers.13.self_attn.v_proj.bias: + device: cuda:0 + max: '9.358e-03' + mean: '-2.785e-05' + min: '-8.192e-03' + shape: + - 1024 + sum: '-2.851e-02' +grads.network.model.decoder.layers.13.self_attn.v_proj.weight: + device: cuda:0 + max: '2.093e-01' + mean: '4.26e-07' + min: '-2.454e-01' + shape: + - 1024 + - 1024 + sum: '4.467e-01' +grads.network.model.decoder.layers.13.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.755e-03' + mean: '4.027e-05' + min: '-9.616e-03' + shape: + - 1024 + sum: '4.124e-02' +grads.network.model.decoder.layers.13.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.237e-02' + mean: 
'2.634e-06' + min: '-3.056e-02' + shape: + - 1024 + sum: '2.697e-03' +grads.network.model.decoder.layers.14.fc1.bias: + device: cuda:0 + max: '3.368e-03' + mean: '-4.94e-06' + min: '-4.024e-03' + shape: + - 4096 + sum: '-2.023e-02' +grads.network.model.decoder.layers.14.fc1.weight: + device: cuda:0 + max: '1.023e-01' + mean: '-4.683e-09' + min: '-8.753e-02' + shape: + - 4096 + - 1024 + sum: '-1.964e-02' +grads.network.model.decoder.layers.14.fc2.bias: + device: cuda:0 + max: '9.881e-03' + mean: '-2.183e-11' + min: '-9.016e-03' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.14.fc2.weight: + device: cuda:0 + max: '1.668e-02' + mean: '-1.592e-12' + min: '-1.498e-02' + shape: + - 1024 + - 4096 + sum: '-6.676e-06' +grads.network.model.decoder.layers.14.final_layer_norm.bias: + device: cuda:0 + max: '1.219e-02' + mean: '2.743e-05' + min: '-1.083e-02' + shape: + - 1024 + sum: '2.809e-02' +grads.network.model.decoder.layers.14.final_layer_norm.weight: + device: cuda:0 + max: '1.590e-02' + mean: '-4.36e-06' + min: '-3.127e-02' + shape: + - 1024 + sum: '-4.464e-03' +grads.network.model.decoder.layers.14.self_attn.k_proj.bias: + device: cuda:0 + max: '3.929e-10' + mean: '-2.173e-12' + min: '-3.056e-10' + shape: + - 1024 + sum: '-2.226e-09' +grads.network.model.decoder.layers.14.self_attn.k_proj.weight: + device: cuda:0 + max: '5.135e-02' + mean: '-5.795e-14' + min: '-4.326e-02' + shape: + - 1024 + - 1024 + sum: '-6.077e-08' +grads.network.model.decoder.layers.14.self_attn.out_proj.bias: + device: cuda:0 + max: '9.779e-03' + mean: '9.095e-12' + min: '-8.985e-03' + shape: + - 1024 + sum: '9.313e-09' +grads.network.model.decoder.layers.14.self_attn.out_proj.weight: + device: cuda:0 + max: '2.521e-03' + mean: '-2.842e-14' + min: '-2.492e-03' + shape: + - 1024 + - 1024 + sum: '-2.980e-08' +grads.network.model.decoder.layers.14.self_attn.q_proj.bias: + device: cuda:0 + max: '2.483e-03' + mean: '-2.104e-05' + min: '-4.766e-03' + shape: + - 1024 + sum: '-2.155e-02' +grads.network.model.decoder.layers.14.self_attn.q_proj.weight: + device: cuda:0 + max: '3.591e-02' + mean: '4.924e-07' + min: '-2.957e-02' + shape: + - 1024 + - 1024 + sum: '5.163e-01' +grads.network.model.decoder.layers.14.self_attn.v_proj.bias: + device: cuda:0 + max: '8.477e-03' + mean: '1.055e-04' + min: '-8.184e-03' + shape: + - 1024 + sum: '1.081e-01' +grads.network.model.decoder.layers.14.self_attn.v_proj.weight: + device: cuda:0 + max: '2.027e-01' + mean: '-2.47e-06' + min: '-2.218e-01' + shape: + - 1024 + - 1024 + sum: '-2.59e+00' +grads.network.model.decoder.layers.14.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.029e-02' + mean: '4.850e-05' + min: '-9.323e-03' + shape: + - 1024 + sum: '4.967e-02' +grads.network.model.decoder.layers.14.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.910e-02' + mean: '5.651e-06' + min: '-3.208e-02' + shape: + - 1024 + sum: '5.786e-03' +grads.network.model.decoder.layers.15.fc1.bias: + device: cuda:0 + max: '5.394e-03' + mean: '-1.012e-05' + min: '-6.176e-03' + shape: + - 4096 + sum: '-4.146e-02' +grads.network.model.decoder.layers.15.fc1.weight: + device: cuda:0 + max: '8.324e-02' + mean: '-1.046e-08' + min: '-1.047e-01' + shape: + - 4096 + - 1024 + sum: '-4.386e-02' +grads.network.model.decoder.layers.15.fc2.bias: + device: cuda:0 + max: '9.866e-03' + mean: '-7.276e-12' + min: '-1.172e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.15.fc2.weight: + device: cuda:0 + max: '1.37e-02' + mean: '-5.684e-13' + min: '-1.439e-02' + shape: 
+ - 1024 + - 4096 + sum: '-2.384e-06' +grads.network.model.decoder.layers.15.final_layer_norm.bias: + device: cuda:0 + max: '1.231e-02' + mean: '-1.332e-04' + min: '-1.468e-02' + shape: + - 1024 + sum: '-1.364e-01' +grads.network.model.decoder.layers.15.final_layer_norm.weight: + device: cuda:0 + max: '3.634e-02' + mean: '1.128e-05' + min: '-3.444e-02' + shape: + - 1024 + sum: '1.155e-02' +grads.network.model.decoder.layers.15.self_attn.k_proj.bias: + device: cuda:0 + max: '1.164e-09' + mean: '3.457e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '3.54e-09' +grads.network.model.decoder.layers.15.self_attn.k_proj.weight: + device: cuda:0 + max: '3.154e-02' + mean: '4.652e-14' + min: '-2.124e-02' + shape: + - 1024 + - 1024 + sum: '4.878e-08' +grads.network.model.decoder.layers.15.self_attn.out_proj.bias: + device: cuda:0 + max: '9.871e-03' + mean: '-1.455e-11' + min: '-9.811e-03' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.15.self_attn.out_proj.weight: + device: cuda:0 + max: '4.353e-03' + mean: '1.421e-14' + min: '-4.717e-03' + shape: + - 1024 + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.15.self_attn.q_proj.bias: + device: cuda:0 + max: '1.886e-03' + mean: '2.190e-05' + min: '-2.335e-03' + shape: + - 1024 + sum: '2.243e-02' +grads.network.model.decoder.layers.15.self_attn.q_proj.weight: + device: cuda:0 + max: '2.037e-02' + mean: '-4.754e-07' + min: '-2.289e-02' + shape: + - 1024 + - 1024 + sum: '-4.985e-01' +grads.network.model.decoder.layers.15.self_attn.v_proj.bias: + device: cuda:0 + max: '7.805e-03' + mean: '-4.434e-05' + min: '-9.824e-03' + shape: + - 1024 + sum: '-4.541e-02' +grads.network.model.decoder.layers.15.self_attn.v_proj.weight: + device: cuda:0 + max: '1.984e-01' + mean: '9.627e-07' + min: '-1.703e-01' + shape: + - 1024 + - 1024 + sum: '1.009e+00' +grads.network.model.decoder.layers.15.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.079e-02' + mean: '1.138e-04' + min: '-1.047e-02' + shape: + - 1024 + sum: '1.165e-01' +grads.network.model.decoder.layers.15.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.985e-02' + mean: '-3.775e-06' + min: '-3.666e-02' + shape: + - 1024 + sum: '-3.866e-03' +grads.network.model.decoder.layers.16.fc1.bias: + device: cuda:0 + max: '4.077e-03' + mean: '2.515e-06' + min: '-4.591e-03' + shape: + - 4096 + sum: '1.030e-02' +grads.network.model.decoder.layers.16.fc1.weight: + device: cuda:0 + max: '1.095e-01' + mean: '2.903e-09' + min: '-1.061e-01' + shape: + - 4096 + - 1024 + sum: '1.218e-02' +grads.network.model.decoder.layers.16.fc2.bias: + device: cuda:0 + max: '1.072e-02' + mean: '0.e+00' + min: '-1.028e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.16.fc2.weight: + device: cuda:0 + max: '2.759e-02' + mean: '0.e+00' + min: '-2.188e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.16.final_layer_norm.bias: + device: cuda:0 + max: '1.385e-02' + mean: '3.693e-04' + min: '-1.169e-02' + shape: + - 1024 + sum: '3.781e-01' +grads.network.model.decoder.layers.16.final_layer_norm.weight: + device: cuda:0 + max: '2.044e-02' + mean: '-2.249e-06' + min: '-2.405e-02' + shape: + - 1024 + sum: '-2.303e-03' +grads.network.model.decoder.layers.16.self_attn.k_proj.bias: + device: cuda:0 + max: '4.657e-10' + mean: '-1.148e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '-1.176e-09' +grads.network.model.decoder.layers.16.self_attn.k_proj.weight: + device: cuda:0 + max: '2.442e-02' + mean: '7.527e-14' + min: '-2.925e-02' + shape: + - 1024 
+ - 1024 + sum: '7.893e-08' +grads.network.model.decoder.layers.16.self_attn.out_proj.bias: + device: cuda:0 + max: '8.875e-03' + mean: '0.e+00' + min: '-9.845e-03' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.16.self_attn.out_proj.weight: + device: cuda:0 + max: '2.749e-03' + mean: '-1.563e-13' + min: '-2.783e-03' + shape: + - 1024 + - 1024 + sum: '-1.639e-07' +grads.network.model.decoder.layers.16.self_attn.q_proj.bias: + device: cuda:0 + max: '1.541e-03' + mean: '-7.89e-06' + min: '-2.125e-03' + shape: + - 1024 + sum: '-8.079e-03' +grads.network.model.decoder.layers.16.self_attn.q_proj.weight: + device: cuda:0 + max: '2.979e-02' + mean: '1.649e-07' + min: '-3.029e-02' + shape: + - 1024 + - 1024 + sum: '1.729e-01' +grads.network.model.decoder.layers.16.self_attn.v_proj.bias: + device: cuda:0 + max: '9.657e-03' + mean: '-1.308e-04' + min: '-9.640e-03' + shape: + - 1024 + sum: '-1.339e-01' +grads.network.model.decoder.layers.16.self_attn.v_proj.weight: + device: cuda:0 + max: '2.179e-01' + mean: '2.732e-06' + min: '-2.213e-01' + shape: + - 1024 + - 1024 + sum: '2.865e+00' +grads.network.model.decoder.layers.16.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.162e-03' + mean: '-9.535e-05' + min: '-1.059e-02' + shape: + - 1024 + sum: '-9.764e-02' +grads.network.model.decoder.layers.16.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.578e-02' + mean: '9.235e-06' + min: '-2.987e-02' + shape: + - 1024 + sum: '9.457e-03' +grads.network.model.decoder.layers.17.fc1.bias: + device: cuda:0 + max: '6.044e-03' + mean: '2.890e-06' + min: '-6.564e-03' + shape: + - 4096 + sum: '1.184e-02' +grads.network.model.decoder.layers.17.fc1.weight: + device: cuda:0 + max: '1.345e-01' + mean: '5.029e-10' + min: '-1.541e-01' + shape: + - 4096 + - 1024 + sum: '2.109e-03' +grads.network.model.decoder.layers.17.fc2.bias: + device: cuda:0 + max: '1.305e-02' + mean: '0.e+00' + min: '-1.607e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.17.fc2.weight: + device: cuda:0 + max: '2.616e-02' + mean: '0.e+00' + min: '-3.049e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.17.final_layer_norm.bias: + device: cuda:0 + max: '1.535e-02' + mean: '-2.257e-04' + min: '-1.923e-02' + shape: + - 1024 + sum: '-2.311e-01' +grads.network.model.decoder.layers.17.final_layer_norm.weight: + device: cuda:0 + max: '3.850e-02' + mean: '2.985e-05' + min: '-2.193e-02' + shape: + - 1024 + sum: '3.056e-02' +grads.network.model.decoder.layers.17.self_attn.k_proj.bias: + device: cuda:0 + max: '3.201e-10' + mean: '1.170e-12' + min: '-2.183e-10' + shape: + - 1024 + sum: '1.198e-09' +grads.network.model.decoder.layers.17.self_attn.k_proj.weight: + device: cuda:0 + max: '1.88e-02' + mean: '1.493e-13' + min: '-1.416e-02' + shape: + - 1024 + - 1024 + sum: '1.566e-07' +grads.network.model.decoder.layers.17.self_attn.out_proj.bias: + device: cuda:0 + max: '1.277e-02' + mean: '-1.455e-11' + min: '-1.398e-02' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.17.self_attn.out_proj.weight: + device: cuda:0 + max: '3.332e-03' + mean: '9.592e-14' + min: '-4.020e-03' + shape: + - 1024 + - 1024 + sum: '1.006e-07' +grads.network.model.decoder.layers.17.self_attn.q_proj.bias: + device: cuda:0 + max: '8.169e-04' + mean: '1.575e-07' + min: '-1.763e-03' + shape: + - 1024 + sum: '1.613e-04' +grads.network.model.decoder.layers.17.self_attn.q_proj.weight: + device: cuda:0 + max: '2.347e-02' + mean: '-2.684e-09' + min: '-1.066e-02' + shape: + - 1024 + - 
1024 + sum: '-2.815e-03' +grads.network.model.decoder.layers.17.self_attn.v_proj.bias: + device: cuda:0 + max: '1.098e-02' + mean: '-1.444e-05' + min: '-1.304e-02' + shape: + - 1024 + sum: '-1.479e-02' +grads.network.model.decoder.layers.17.self_attn.v_proj.weight: + device: cuda:0 + max: '3.683e-01' + mean: '2.462e-07' + min: '-3.150e-01' + shape: + - 1024 + - 1024 + sum: '2.581e-01' +grads.network.model.decoder.layers.17.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.358e-02' + mean: '-5.711e-06' + min: '-1.483e-02' + shape: + - 1024 + sum: '-5.848e-03' +grads.network.model.decoder.layers.17.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.098e-02' + mean: '3.371e-06' + min: '-1.99e-02' + shape: + - 1024 + sum: '3.452e-03' +grads.network.model.decoder.layers.18.fc1.bias: + device: cuda:0 + max: '1.147e-02' + mean: '-5.311e-06' + min: '-7.232e-03' + shape: + - 4096 + sum: '-2.175e-02' +grads.network.model.decoder.layers.18.fc1.weight: + device: cuda:0 + max: '1.619e-01' + mean: '-9.185e-09' + min: '-3.223e-01' + shape: + - 4096 + - 1024 + sum: '-3.853e-02' +grads.network.model.decoder.layers.18.fc2.bias: + device: cuda:0 + max: '1.429e-02' + mean: '0.e+00' + min: '-1.499e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.18.fc2.weight: + device: cuda:0 + max: '2.821e-02' + mean: '-2.274e-13' + min: '-2.067e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.18.final_layer_norm.bias: + device: cuda:0 + max: '1.670e-02' + mean: '2.067e-04' + min: '-1.701e-02' + shape: + - 1024 + sum: '2.117e-01' +grads.network.model.decoder.layers.18.final_layer_norm.weight: + device: cuda:0 + max: '1.673e-02' + mean: '-3.888e-05' + min: '-1.522e-02' + shape: + - 1024 + sum: '-3.981e-02' +grads.network.model.decoder.layers.18.self_attn.k_proj.bias: + device: cuda:0 + max: '8.731e-10' + mean: '2.129e-12' + min: '-4.075e-10' + shape: + - 1024 + sum: '2.18e-09' +grads.network.model.decoder.layers.18.self_attn.k_proj.weight: + device: cuda:0 + max: '4.180e-02' + mean: '1.821e-14' + min: '-5.685e-02' + shape: + - 1024 + - 1024 + sum: '1.909e-08' +grads.network.model.decoder.layers.18.self_attn.out_proj.bias: + device: cuda:0 + max: '1.283e-02' + mean: '7.276e-12' + min: '-1.266e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.18.self_attn.out_proj.weight: + device: cuda:0 + max: '2.322e-03' + mean: '2.842e-14' + min: '-2.526e-03' + shape: + - 1024 + - 1024 + sum: '2.980e-08' +grads.network.model.decoder.layers.18.self_attn.q_proj.bias: + device: cuda:0 + max: '5.705e-03' + mean: '-1.891e-05' + min: '-5.284e-03' + shape: + - 1024 + sum: '-1.937e-02' +grads.network.model.decoder.layers.18.self_attn.q_proj.weight: + device: cuda:0 + max: '7.843e-02' + mean: '2.579e-07' + min: '-8.680e-02' + shape: + - 1024 + - 1024 + sum: '2.704e-01' +grads.network.model.decoder.layers.18.self_attn.v_proj.bias: + device: cuda:0 + max: '1.423e-02' + mean: '1.193e-04' + min: '-1.538e-02' + shape: + - 1024 + sum: '1.222e-01' +grads.network.model.decoder.layers.18.self_attn.v_proj.weight: + device: cuda:0 + max: '4.271e-01' + mean: '-1.627e-06' + min: '-3.934e-01' + shape: + - 1024 + - 1024 + sum: '-1.706e+00' +grads.network.model.decoder.layers.18.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.349e-02' + mean: '1.753e-06' + min: '-1.332e-02' + shape: + - 1024 + sum: '1.795e-03' +grads.network.model.decoder.layers.18.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.638e-02' + mean: '1.578e-06' + min: '-1.96e-02' + shape: 
+ - 1024 + sum: '1.616e-03' +grads.network.model.decoder.layers.19.fc1.bias: + device: cuda:0 + max: '1.043e-02' + mean: '3.285e-06' + min: '-8.926e-03' + shape: + - 4096 + sum: '1.346e-02' +grads.network.model.decoder.layers.19.fc1.weight: + device: cuda:0 + max: '2.514e-01' + mean: '1.092e-08' + min: '-2.619e-01' + shape: + - 4096 + - 1024 + sum: '4.581e-02' +grads.network.model.decoder.layers.19.fc2.bias: + device: cuda:0 + max: '1.579e-02' + mean: '7.276e-12' + min: '-1.67e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.19.fc2.weight: + device: cuda:0 + max: '2.852e-02' + mean: '0.e+00' + min: '-2.674e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.19.final_layer_norm.bias: + device: cuda:0 + max: '1.804e-02' + mean: '8.083e-05' + min: '-1.924e-02' + shape: + - 1024 + sum: '8.276e-02' +grads.network.model.decoder.layers.19.final_layer_norm.weight: + device: cuda:0 + max: '2.331e-02' + mean: '-1.504e-05' + min: '-1.230e-02' + shape: + - 1024 + sum: '-1.54e-02' +grads.network.model.decoder.layers.19.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '-1.247e-12' + min: '-4.948e-10' + shape: + - 1024 + sum: '-1.277e-09' +grads.network.model.decoder.layers.19.self_attn.k_proj.weight: + device: cuda:0 + max: '4.950e-02' + mean: '1.668e-13' + min: '-3.336e-02' + shape: + - 1024 + - 1024 + sum: '1.749e-07' +grads.network.model.decoder.layers.19.self_attn.out_proj.bias: + device: cuda:0 + max: '1.443e-02' + mean: '4.366e-11' + min: '-1.464e-02' + shape: + - 1024 + sum: '4.470e-08' +grads.network.model.decoder.layers.19.self_attn.out_proj.weight: + device: cuda:0 + max: '5.047e-03' + mean: '1.137e-13' + min: '-4.323e-03' + shape: + - 1024 + - 1024 + sum: '1.192e-07' +grads.network.model.decoder.layers.19.self_attn.q_proj.bias: + device: cuda:0 + max: '2.846e-03' + mean: '-5.669e-06' + min: '-2.716e-03' + shape: + - 1024 + sum: '-5.805e-03' +grads.network.model.decoder.layers.19.self_attn.q_proj.weight: + device: cuda:0 + max: '5.232e-02' + mean: '7.022e-08' + min: '-5.666e-02' + shape: + - 1024 + - 1024 + sum: '7.363e-02' +grads.network.model.decoder.layers.19.self_attn.v_proj.bias: + device: cuda:0 + max: '1.353e-02' + mean: '-1.046e-04' + min: '-1.307e-02' + shape: + - 1024 + sum: '-1.071e-01' +grads.network.model.decoder.layers.19.self_attn.v_proj.weight: + device: cuda:0 + max: '3.506e-01' + mean: '1.296e-06' + min: '-3.869e-01' + shape: + - 1024 + - 1024 + sum: '1.359e+00' +grads.network.model.decoder.layers.19.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.543e-02' + mean: '1.895e-05' + min: '-1.569e-02' + shape: + - 1024 + sum: '1.941e-02' +grads.network.model.decoder.layers.19.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.44e-02' + mean: '5.186e-07' + min: '-1.104e-02' + shape: + - 1024 + sum: '5.310e-04' +grads.network.model.decoder.layers.2.fc1.bias: + device: cuda:0 + max: '5.921e-03' + mean: '8.856e-06' + min: '-9.619e-03' + shape: + - 4096 + sum: '3.627e-02' +grads.network.model.decoder.layers.2.fc1.weight: + device: cuda:0 + max: '1.109e-01' + mean: '-1.692e-08' + min: '-1.033e-01' + shape: + - 4096 + - 1024 + sum: '-7.098e-02' +grads.network.model.decoder.layers.2.fc2.bias: + device: cuda:0 + max: '8.814e-03' + mean: '1.455e-11' + min: '-9.890e-03' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.2.fc2.weight: + device: cuda:0 + max: '8.03e-03' + mean: '1.705e-13' + min: '-7.305e-03' + shape: + - 1024 + - 4096 + sum: '7.153e-07' 
+grads.network.model.decoder.layers.2.final_layer_norm.bias: + device: cuda:0 + max: '1.062e-02' + mean: '2.142e-05' + min: '-9.885e-03' + shape: + - 1024 + sum: '2.193e-02' +grads.network.model.decoder.layers.2.final_layer_norm.weight: + device: cuda:0 + max: '1.06e-02' + mean: '1.349e-05' + min: '-3.724e-02' + shape: + - 1024 + sum: '1.382e-02' +grads.network.model.decoder.layers.2.self_attn.k_proj.bias: + device: cuda:0 + max: '6.985e-10' + mean: '3.819e-13' + min: '-3.492e-10' + shape: + - 1024 + sum: '3.911e-10' +grads.network.model.decoder.layers.2.self_attn.k_proj.weight: + device: cuda:0 + max: '1.658e-02' + mean: '-6.373e-14' + min: '-1.493e-02' + shape: + - 1024 + - 1024 + sum: '-6.682e-08' +grads.network.model.decoder.layers.2.self_attn.out_proj.bias: + device: cuda:0 + max: '9.061e-03' + mean: '1.455e-11' + min: '-9.315e-03' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.2.self_attn.out_proj.weight: + device: cuda:0 + max: '9.092e-03' + mean: '-1.421e-14' + min: '-8.389e-03' + shape: + - 1024 + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.2.self_attn.q_proj.bias: + device: cuda:0 + max: '1.064e-03' + mean: '4.480e-06' + min: '-1.057e-03' + shape: + - 1024 + sum: '4.588e-03' +grads.network.model.decoder.layers.2.self_attn.q_proj.weight: + device: cuda:0 + max: '9.205e-03' + mean: '3.874e-08' + min: '-1.268e-02' + shape: + - 1024 + - 1024 + sum: '4.063e-02' +grads.network.model.decoder.layers.2.self_attn.v_proj.bias: + device: cuda:0 + max: '8.063e-03' + mean: '3.71e-05' + min: '-6.821e-03' + shape: + - 1024 + sum: '3.799e-02' +grads.network.model.decoder.layers.2.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '3.208e-07' + min: '-1.047e-01' + shape: + - 1024 + - 1024 + sum: '3.364e-01' +grads.network.model.decoder.layers.2.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.170e-03' + mean: '-3.405e-05' + min: '-9.528e-03' + shape: + - 1024 + sum: '-3.486e-02' +grads.network.model.decoder.layers.2.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.376e-02' + mean: '3.953e-06' + min: '-3.395e-02' + shape: + - 1024 + sum: '4.048e-03' +grads.network.model.decoder.layers.20.fc1.bias: + device: cuda:0 + max: '7.671e-03' + mean: '-3.533e-07' + min: '-1.159e-02' + shape: + - 4096 + sum: '-1.447e-03' +grads.network.model.decoder.layers.20.fc1.weight: + device: cuda:0 + max: '3.498e-01' + mean: '-1.061e-09' + min: '-2.271e-01' + shape: + - 4096 + - 1024 + sum: '-4.449e-03' +grads.network.model.decoder.layers.20.fc2.bias: + device: cuda:0 + max: '1.901e-02' + mean: '-1.455e-11' + min: '-1.83e-02' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.20.fc2.weight: + device: cuda:0 + max: '8.356e-02' + mean: '5.684e-14' + min: '-8.36e-02' + shape: + - 1024 + - 4096 + sum: '2.384e-07' +grads.network.model.decoder.layers.20.final_layer_norm.bias: + device: cuda:0 + max: '2.215e-02' + mean: '2.282e-04' + min: '-2.103e-02' + shape: + - 1024 + sum: '2.337e-01' +grads.network.model.decoder.layers.20.final_layer_norm.weight: + device: cuda:0 + max: '2.260e-02' + mean: '-2.262e-05' + min: '-1.660e-02' + shape: + - 1024 + sum: '-2.316e-02' +grads.network.model.decoder.layers.20.self_attn.k_proj.bias: + device: cuda:0 + max: '3.492e-10' + mean: '1.942e-12' + min: '-3.347e-10' + shape: + - 1024 + sum: '1.989e-09' +grads.network.model.decoder.layers.20.self_attn.k_proj.weight: + device: cuda:0 + max: '3.529e-02' + mean: '-4.73e-14' + min: '-3.390e-02' + shape: + - 1024 + - 1024 + sum: '-4.959e-08' 
+grads.network.model.decoder.layers.20.self_attn.out_proj.bias: + device: cuda:0 + max: '1.786e-02' + mean: '1.455e-11' + min: '-1.611e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.20.self_attn.out_proj.weight: + device: cuda:0 + max: '8.450e-03' + mean: '-1.243e-14' + min: '-9.957e-03' + shape: + - 1024 + - 1024 + sum: '-1.304e-08' +grads.network.model.decoder.layers.20.self_attn.q_proj.bias: + device: cuda:0 + max: '1.168e-03' + mean: '1.373e-05' + min: '-1.461e-03' + shape: + - 1024 + sum: '1.406e-02' +grads.network.model.decoder.layers.20.self_attn.q_proj.weight: + device: cuda:0 + max: '3.718e-02' + mean: '-1.270e-07' + min: '-3.829e-02' + shape: + - 1024 + - 1024 + sum: '-1.332e-01' +grads.network.model.decoder.layers.20.self_attn.v_proj.bias: + device: cuda:0 + max: '1.316e-02' + mean: '1.595e-04' + min: '-1.22e-02' + shape: + - 1024 + sum: '1.634e-01' +grads.network.model.decoder.layers.20.self_attn.v_proj.weight: + device: cuda:0 + max: '3.578e-01' + mean: '-1.476e-06' + min: '-3.892e-01' + shape: + - 1024 + - 1024 + sum: '-1.548e+00' +grads.network.model.decoder.layers.20.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.886e-02' + mean: '-2.963e-04' + min: '-1.759e-02' + shape: + - 1024 + sum: '-3.034e-01' +grads.network.model.decoder.layers.20.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.024e-02' + mean: '9.812e-07' + min: '-1.449e-02' + shape: + - 1024 + sum: '1.005e-03' +grads.network.model.decoder.layers.21.fc1.bias: + device: cuda:0 + max: '1.159e-02' + mean: '-7.116e-06' + min: '-1.195e-02' + shape: + - 4096 + sum: '-2.915e-02' +grads.network.model.decoder.layers.21.fc1.weight: + device: cuda:0 + max: '3.364e-01' + mean: '-2.245e-08' + min: '-3.275e-01' + shape: + - 4096 + - 1024 + sum: '-9.418e-02' +grads.network.model.decoder.layers.21.fc2.bias: + device: cuda:0 + max: '2.210e-02' + mean: '1.455e-11' + min: '-2.116e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.21.fc2.weight: + device: cuda:0 + max: '1.082e-01' + mean: '-5.684e-14' + min: '-9.473e-02' + shape: + - 1024 + - 4096 + sum: '-2.384e-07' +grads.network.model.decoder.layers.21.final_layer_norm.bias: + device: cuda:0 + max: '2.494e-02' + mean: '2.162e-05' + min: '-2.386e-02' + shape: + - 1024 + sum: '2.214e-02' +grads.network.model.decoder.layers.21.final_layer_norm.weight: + device: cuda:0 + max: '2.376e-02' + mean: '7.015e-06' + min: '-1.133e-02' + shape: + - 1024 + sum: '7.184e-03' +grads.network.model.decoder.layers.21.self_attn.k_proj.bias: + device: cuda:0 + max: '4.002e-10' + mean: '-1.572e-12' + min: '-3.638e-10' + shape: + - 1024 + sum: '-1.61e-09' +grads.network.model.decoder.layers.21.self_attn.k_proj.weight: + device: cuda:0 + max: '2.533e-02' + mean: '2.293e-13' + min: '-3.203e-02' + shape: + - 1024 + - 1024 + sum: '2.405e-07' +grads.network.model.decoder.layers.21.self_attn.out_proj.bias: + device: cuda:0 + max: '1.854e-02' + mean: '0.e+00' + min: '-1.843e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.21.self_attn.out_proj.weight: + device: cuda:0 + max: '1.236e-02' + mean: '1.137e-13' + min: '-1.02e-02' + shape: + - 1024 + - 1024 + sum: '1.192e-07' +grads.network.model.decoder.layers.21.self_attn.q_proj.bias: + device: cuda:0 + max: '1.768e-03' + mean: '1.468e-05' + min: '-1.166e-03' + shape: + - 1024 + sum: '1.503e-02' +grads.network.model.decoder.layers.21.self_attn.q_proj.weight: + device: cuda:0 + max: '1.766e-02' + mean: '-1.343e-07' + min: '-2.628e-02' + shape: + - 1024 + - 1024 + sum: 
'-1.408e-01' +grads.network.model.decoder.layers.21.self_attn.v_proj.bias: + device: cuda:0 + max: '1.447e-02' + mean: '1.302e-05' + min: '-1.778e-02' + shape: + - 1024 + sum: '1.333e-02' +grads.network.model.decoder.layers.21.self_attn.v_proj.weight: + device: cuda:0 + max: '4.942e-01' + mean: '-1.191e-07' + min: '-4.252e-01' + shape: + - 1024 + - 1024 + sum: '-1.249e-01' +grads.network.model.decoder.layers.21.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.995e-02' + mean: '1.246e-05' + min: '-1.996e-02' + shape: + - 1024 + sum: '1.276e-02' +grads.network.model.decoder.layers.21.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.301e-02' + mean: '1.724e-06' + min: '-1.395e-02' + shape: + - 1024 + sum: '1.766e-03' +grads.network.model.decoder.layers.22.fc1.bias: + device: cuda:0 + max: '1.418e-02' + mean: '1.925e-05' + min: '-3.796e-02' + shape: + - 4096 + sum: '7.886e-02' +grads.network.model.decoder.layers.22.fc1.weight: + device: cuda:0 + max: '4.455e-01' + mean: '1.533e-08' + min: '-3.281e-01' + shape: + - 4096 + - 1024 + sum: '6.429e-02' +grads.network.model.decoder.layers.22.fc2.bias: + device: cuda:0 + max: '2.107e-02' + mean: '-2.183e-11' + min: '-1.798e-02' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.22.fc2.weight: + device: cuda:0 + max: '3.631e-02' + mean: '-1.137e-13' + min: '-5.145e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.22.final_layer_norm.bias: + device: cuda:0 + max: '2.261e-02' + mean: '-3.098e-04' + min: '-1.996e-02' + shape: + - 1024 + sum: '-3.173e-01' +grads.network.model.decoder.layers.22.final_layer_norm.weight: + device: cuda:0 + max: '1.112e-01' + mean: '1.792e-05' + min: '-7.273e-03' + shape: + - 1024 + sum: '1.835e-02' +grads.network.model.decoder.layers.22.self_attn.k_proj.bias: + device: cuda:0 + max: '2.838e-10' + mean: '1.338e-12' + min: '-2.328e-10' + shape: + - 1024 + sum: '1.37e-09' +grads.network.model.decoder.layers.22.self_attn.k_proj.weight: + device: cuda:0 + max: '1.521e-02' + mean: '-6.001e-14' + min: '-1.506e-02' + shape: + - 1024 + - 1024 + sum: '-6.292e-08' +grads.network.model.decoder.layers.22.self_attn.out_proj.bias: + device: cuda:0 + max: '1.797e-02' + mean: '2.910e-11' + min: '-1.645e-02' + shape: + - 1024 + sum: '2.980e-08' +grads.network.model.decoder.layers.22.self_attn.out_proj.weight: + device: cuda:0 + max: '1.489e-02' + mean: '-2.132e-13' + min: '-1.383e-02' + shape: + - 1024 + - 1024 + sum: '-2.235e-07' +grads.network.model.decoder.layers.22.self_attn.q_proj.bias: + device: cuda:0 + max: '1.432e-03' + mean: '-1.077e-05' + min: '-1.380e-03' + shape: + - 1024 + sum: '-1.103e-02' +grads.network.model.decoder.layers.22.self_attn.q_proj.weight: + device: cuda:0 + max: '1.757e-02' + mean: '6.216e-08' + min: '-1.876e-02' + shape: + - 1024 + - 1024 + sum: '6.518e-02' +grads.network.model.decoder.layers.22.self_attn.v_proj.bias: + device: cuda:0 + max: '1.04e-02' + mean: '9.040e-05' + min: '-1.207e-02' + shape: + - 1024 + sum: '9.257e-02' +grads.network.model.decoder.layers.22.self_attn.v_proj.weight: + device: cuda:0 + max: '3.492e-01' + mean: '-5.219e-07' + min: '-2.943e-01' + shape: + - 1024 + - 1024 + sum: '-5.472e-01' +grads.network.model.decoder.layers.22.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.879e-02' + mean: '-5.430e-05' + min: '-1.734e-02' + shape: + - 1024 + sum: '-5.561e-02' +grads.network.model.decoder.layers.22.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.860e-02' + mean: '-1.348e-05' + min: '-3.154e-02' + shape: + 
- 1024 + sum: '-1.380e-02' +grads.network.model.decoder.layers.23.fc1.bias: + device: cuda:0 + max: '1.947e-02' + mean: '2.517e-05' + min: '-1.008e-02' + shape: + - 4096 + sum: '1.031e-01' +grads.network.model.decoder.layers.23.fc1.weight: + device: cuda:0 + max: '1.458e-01' + mean: '4.279e-08' + min: '-2.653e-01' + shape: + - 4096 + - 1024 + sum: '1.795e-01' +grads.network.model.decoder.layers.23.fc2.bias: + device: cuda:0 + max: '9.512e-03' + mean: '1.819e-12' + min: '-9.348e-03' + shape: + - 1024 + sum: '1.863e-09' +grads.network.model.decoder.layers.23.fc2.weight: + device: cuda:0 + max: '2.092e-02' + mean: '-4.547e-13' + min: '-1.892e-02' + shape: + - 1024 + - 4096 + sum: '-1.907e-06' +grads.network.model.decoder.layers.23.final_layer_norm.bias: + device: cuda:0 + max: '1.005e-02' + mean: '-9.368e-05' + min: '-9.654e-03' + shape: + - 1024 + sum: '-9.593e-02' +grads.network.model.decoder.layers.23.final_layer_norm.weight: + device: cuda:0 + max: '9.125e-03' + mean: '2.809e-04' + min: '-8.498e-03' + shape: + - 1024 + sum: '2.876e-01' +grads.network.model.decoder.layers.23.self_attn.k_proj.bias: + device: cuda:0 + max: '1.048e-09' + mean: '-2.047e-13' + min: '-1.513e-09' + shape: + - 1024 + sum: '-2.096e-10' +grads.network.model.decoder.layers.23.self_attn.k_proj.weight: + device: cuda:0 + max: '7.757e-02' + mean: '-1.006e-13' + min: '-1.167e-01' + shape: + - 1024 + - 1024 + sum: '-1.055e-07' +grads.network.model.decoder.layers.23.self_attn.out_proj.bias: + device: cuda:0 + max: '9.025e-03' + mean: '-5.457e-12' + min: '-8.085e-03' + shape: + - 1024 + sum: '-5.588e-09' +grads.network.model.decoder.layers.23.self_attn.out_proj.weight: + device: cuda:0 + max: '4.444e-03' + mean: '-6.395e-14' + min: '-4.31e-03' + shape: + - 1024 + - 1024 + sum: '-6.706e-08' +grads.network.model.decoder.layers.23.self_attn.q_proj.bias: + device: cuda:0 + max: '6.065e-03' + mean: '3.442e-05' + min: '-5.142e-03' + shape: + - 1024 + sum: '3.525e-02' +grads.network.model.decoder.layers.23.self_attn.q_proj.weight: + device: cuda:0 + max: '7.615e-02' + mean: '-1.647e-07' + min: '-8.673e-02' + shape: + - 1024 + - 1024 + sum: '-1.727e-01' +grads.network.model.decoder.layers.23.self_attn.v_proj.bias: + device: cuda:0 + max: '1.326e-02' + mean: '-5.18e-05' + min: '-1.957e-02' + shape: + - 1024 + sum: '-5.304e-02' +grads.network.model.decoder.layers.23.self_attn.v_proj.weight: + device: cuda:0 + max: '5.156e-01' + mean: '2.478e-07' + min: '-3.333e-01' + shape: + - 1024 + - 1024 + sum: '2.599e-01' +grads.network.model.decoder.layers.23.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.140e-03' + mean: '1.168e-04' + min: '-7.772e-03' + shape: + - 1024 + sum: '1.196e-01' +grads.network.model.decoder.layers.23.self_attn_layer_norm.weight: + device: cuda:0 + max: '5.779e-03' + mean: '4.173e-06' + min: '-1.385e-02' + shape: + - 1024 + sum: '4.273e-03' +grads.network.model.decoder.layers.3.fc1.bias: + device: cuda:0 + max: '5.954e-03' + mean: '1.316e-05' + min: '-8.344e-03' + shape: + - 4096 + sum: '5.389e-02' +grads.network.model.decoder.layers.3.fc1.weight: + device: cuda:0 + max: '1.064e-01' + mean: '-6.116e-09' + min: '-9.593e-02' + shape: + - 4096 + - 1024 + sum: '-2.565e-02' +grads.network.model.decoder.layers.3.fc2.bias: + device: cuda:0 + max: '8.140e-03' + mean: '-3.638e-12' + min: '-1.140e-02' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.3.fc2.weight: + device: cuda:0 + max: '1.384e-02' + mean: '4.547e-13' + min: '-1.706e-02' + shape: + - 1024 + - 4096 + sum: '1.907e-06' 
+grads.network.model.decoder.layers.3.final_layer_norm.bias: + device: cuda:0 + max: '9.449e-03' + mean: '2.546e-05' + min: '-1.205e-02' + shape: + - 1024 + sum: '2.607e-02' +grads.network.model.decoder.layers.3.final_layer_norm.weight: + device: cuda:0 + max: '2.066e-02' + mean: '-4.079e-05' + min: '-3.198e-02' + shape: + - 1024 + sum: '-4.177e-02' +grads.network.model.decoder.layers.3.self_attn.k_proj.bias: + device: cuda:0 + max: '3.056e-10' + mean: '-1.023e-12' + min: '-2.983e-10' + shape: + - 1024 + sum: '-1.047e-09' +grads.network.model.decoder.layers.3.self_attn.k_proj.weight: + device: cuda:0 + max: '1.167e-02' + mean: '-1.421e-14' + min: '-1.363e-02' + shape: + - 1024 + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.3.self_attn.out_proj.bias: + device: cuda:0 + max: '7.554e-03' + mean: '1.819e-11' + min: '-1.130e-02' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.3.self_attn.out_proj.weight: + device: cuda:0 + max: '1.395e-02' + mean: '7.105e-14' + min: '-9.944e-03' + shape: + - 1024 + - 1024 + sum: '7.451e-08' +grads.network.model.decoder.layers.3.self_attn.q_proj.bias: + device: cuda:0 + max: '1.262e-03' + mean: '1.523e-05' + min: '-1.661e-03' + shape: + - 1024 + sum: '1.560e-02' +grads.network.model.decoder.layers.3.self_attn.q_proj.weight: + device: cuda:0 + max: '1.264e-02' + mean: '1.393e-07' + min: '-1.569e-02' + shape: + - 1024 + - 1024 + sum: '1.461e-01' +grads.network.model.decoder.layers.3.self_attn.v_proj.bias: + device: cuda:0 + max: '6.315e-03' + mean: '3.350e-05' + min: '-1.044e-02' + shape: + - 1024 + sum: '3.431e-02' +grads.network.model.decoder.layers.3.self_attn.v_proj.weight: + device: cuda:0 + max: '1.511e-01' + mean: '3.064e-07' + min: '-1.489e-01' + shape: + - 1024 + - 1024 + sum: '3.212e-01' +grads.network.model.decoder.layers.3.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.629e-03' + mean: '2.019e-05' + min: '-1.149e-02' + shape: + - 1024 + sum: '2.068e-02' +grads.network.model.decoder.layers.3.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.384e-02' + mean: '1.535e-06' + min: '-3.271e-02' + shape: + - 1024 + sum: '1.572e-03' +grads.network.model.decoder.layers.4.fc1.bias: + device: cuda:0 + max: '8.716e-03' + mean: '-6.134e-06' + min: '-3.885e-03' + shape: + - 4096 + sum: '-2.513e-02' +grads.network.model.decoder.layers.4.fc1.weight: + device: cuda:0 + max: '9.354e-02' + mean: '-1.18e-09' + min: '-1.037e-01' + shape: + - 4096 + - 1024 + sum: '-4.948e-03' +grads.network.model.decoder.layers.4.fc2.bias: + device: cuda:0 + max: '7.127e-03' + mean: '-1.455e-11' + min: '-8.873e-03' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.4.fc2.weight: + device: cuda:0 + max: '1.011e-02' + mean: '-2.274e-13' + min: '-1.157e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.4.final_layer_norm.bias: + device: cuda:0 + max: '7.855e-03' + mean: '-2.88e-05' + min: '-9.680e-03' + shape: + - 1024 + sum: '-2.949e-02' +grads.network.model.decoder.layers.4.final_layer_norm.weight: + device: cuda:0 + max: '1.503e-02' + mean: '1.502e-06' + min: '-1.015e-02' + shape: + - 1024 + sum: '1.538e-03' +grads.network.model.decoder.layers.4.self_attn.k_proj.bias: + device: cuda:0 + max: '4.511e-10' + mean: '-4.124e-12' + min: '-2.838e-10' + shape: + - 1024 + sum: '-4.223e-09' +grads.network.model.decoder.layers.4.self_attn.k_proj.weight: + device: cuda:0 + max: '2.309e-02' + mean: '-2.882e-13' + min: '-2.746e-02' + shape: + - 1024 + - 1024 + sum: '-3.022e-07' 
+grads.network.model.decoder.layers.4.self_attn.out_proj.bias: + device: cuda:0 + max: '7.763e-03' + mean: '-7.276e-12' + min: '-1.027e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.4.self_attn.out_proj.weight: + device: cuda:0 + max: '1.258e-02' + mean: '-5.684e-14' + min: '-8.443e-03' + shape: + - 1024 + - 1024 + sum: '-5.960e-08' +grads.network.model.decoder.layers.4.self_attn.q_proj.bias: + device: cuda:0 + max: '1.406e-03' + mean: '8.718e-06' + min: '-1.263e-03' + shape: + - 1024 + sum: '8.927e-03' +grads.network.model.decoder.layers.4.self_attn.q_proj.weight: + device: cuda:0 + max: '1.614e-02' + mean: '5.714e-08' + min: '-1.253e-02' + shape: + - 1024 + - 1024 + sum: '5.992e-02' +grads.network.model.decoder.layers.4.self_attn.v_proj.bias: + device: cuda:0 + max: '7.103e-03' + mean: '4.113e-05' + min: '-7.943e-03' + shape: + - 1024 + sum: '4.212e-02' +grads.network.model.decoder.layers.4.self_attn.v_proj.weight: + device: cuda:0 + max: '1.551e-01' + mean: '2.696e-07' + min: '-1.392e-01' + shape: + - 1024 + - 1024 + sum: '2.827e-01' +grads.network.model.decoder.layers.4.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.028e-03' + mean: '7.166e-06' + min: '-1.046e-02' + shape: + - 1024 + sum: '7.338e-03' +grads.network.model.decoder.layers.4.self_attn_layer_norm.weight: + device: cuda:0 + max: '8.643e-03' + mean: '-1.091e-05' + min: '-2.483e-02' + shape: + - 1024 + sum: '-1.117e-02' +grads.network.model.decoder.layers.5.fc1.bias: + device: cuda:0 + max: '4.748e-03' + mean: '4.587e-06' + min: '-5.883e-03' + shape: + - 4096 + sum: '1.879e-02' +grads.network.model.decoder.layers.5.fc1.weight: + device: cuda:0 + max: '9.723e-02' + mean: '-2.199e-09' + min: '-1.125e-01' + shape: + - 4096 + - 1024 + sum: '-9.221e-03' +grads.network.model.decoder.layers.5.fc2.bias: + device: cuda:0 + max: '7.651e-03' + mean: '2.183e-11' + min: '-1.023e-02' + shape: + - 1024 + sum: '2.235e-08' +grads.network.model.decoder.layers.5.fc2.weight: + device: cuda:0 + max: '1.427e-02' + mean: '4.547e-13' + min: '-1.743e-02' + shape: + - 1024 + - 4096 + sum: '1.907e-06' +grads.network.model.decoder.layers.5.final_layer_norm.bias: + device: cuda:0 + max: '8.459e-03' + mean: '-6.824e-05' + min: '-1.104e-02' + shape: + - 1024 + sum: '-6.988e-02' +grads.network.model.decoder.layers.5.final_layer_norm.weight: + device: cuda:0 + max: '2.276e-02' + mean: '1.546e-05' + min: '-1.198e-02' + shape: + - 1024 + sum: '1.583e-02' +grads.network.model.decoder.layers.5.self_attn.k_proj.bias: + device: cuda:0 + max: '4.366e-10' + mean: '2.527e-12' + min: '-3.929e-10' + shape: + - 1024 + sum: '2.588e-09' +grads.network.model.decoder.layers.5.self_attn.k_proj.weight: + device: cuda:0 + max: '2.063e-02' + mean: '6.717e-14' + min: '-1.871e-02' + shape: + - 1024 + - 1024 + sum: '7.043e-08' +grads.network.model.decoder.layers.5.self_attn.out_proj.bias: + device: cuda:0 + max: '7.647e-03' + mean: '1.455e-11' + min: '-1.1e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.5.self_attn.out_proj.weight: + device: cuda:0 + max: '1.146e-02' + mean: '-1.137e-13' + min: '-7.558e-03' + shape: + - 1024 + - 1024 + sum: '-1.192e-07' +grads.network.model.decoder.layers.5.self_attn.q_proj.bias: + device: cuda:0 + max: '1.232e-03' + mean: '5.46e-06' + min: '-1.171e-03' + shape: + - 1024 + sum: '5.591e-03' +grads.network.model.decoder.layers.5.self_attn.q_proj.weight: + device: cuda:0 + max: '1.892e-02' + mean: '1.393e-08' + min: '-1.640e-02' + shape: + - 1024 + - 1024 + sum: '1.461e-02' 
+grads.network.model.decoder.layers.5.self_attn.v_proj.bias: + device: cuda:0 + max: '7.63e-03' + mean: '2.826e-05' + min: '-6.905e-03' + shape: + - 1024 + sum: '2.894e-02' +grads.network.model.decoder.layers.5.self_attn.v_proj.weight: + device: cuda:0 + max: '1.549e-01' + mean: '7.210e-08' + min: '-1.564e-01' + shape: + - 1024 + - 1024 + sum: '7.561e-02' +grads.network.model.decoder.layers.5.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.75e-03' + mean: '-6.064e-05' + min: '-1.140e-02' + shape: + - 1024 + sum: '-6.21e-02' +grads.network.model.decoder.layers.5.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.310e-02' + mean: '-7.533e-06' + min: '-1.207e-02' + shape: + - 1024 + sum: '-7.714e-03' +grads.network.model.decoder.layers.6.fc1.bias: + device: cuda:0 + max: '8.689e-03' + mean: '-1.853e-05' + min: '-5.812e-03' + shape: + - 4096 + sum: '-7.588e-02' +grads.network.model.decoder.layers.6.fc1.weight: + device: cuda:0 + max: '1.247e-01' + mean: '2.587e-11' + min: '-1.671e-01' + shape: + - 4096 + - 1024 + sum: '1.085e-04' +grads.network.model.decoder.layers.6.fc2.bias: + device: cuda:0 + max: '8.694e-03' + mean: '-3.638e-12' + min: '-8.964e-03' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.6.fc2.weight: + device: cuda:0 + max: '2.818e-02' + mean: '-1.99e-13' + min: '-2.423e-02' + shape: + - 1024 + - 4096 + sum: '-8.345e-07' +grads.network.model.decoder.layers.6.final_layer_norm.bias: + device: cuda:0 + max: '9.466e-03' + mean: '1.768e-05' + min: '-9.583e-03' + shape: + - 1024 + sum: '1.811e-02' +grads.network.model.decoder.layers.6.final_layer_norm.weight: + device: cuda:0 + max: '3.202e-02' + mean: '1.739e-05' + min: '-1.373e-02' + shape: + - 1024 + sum: '1.780e-02' +grads.network.model.decoder.layers.6.self_attn.k_proj.bias: + device: cuda:0 + max: '1.048e-09' + mean: '2.847e-12' + min: '-5.821e-10' + shape: + - 1024 + sum: '2.915e-09' +grads.network.model.decoder.layers.6.self_attn.k_proj.weight: + device: cuda:0 + max: '7.468e-02' + mean: '3.264e-14' + min: '-7.459e-02' + shape: + - 1024 + - 1024 + sum: '3.423e-08' +grads.network.model.decoder.layers.6.self_attn.out_proj.bias: + device: cuda:0 + max: '9.673e-03' + mean: '-7.276e-12' + min: '-9.632e-03' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.6.self_attn.out_proj.weight: + device: cuda:0 + max: '1.069e-02' + mean: '-2.558e-13' + min: '-1.237e-02' + shape: + - 1024 + - 1024 + sum: '-2.682e-07' +grads.network.model.decoder.layers.6.self_attn.q_proj.bias: + device: cuda:0 + max: '1.893e-03' + mean: '-1.271e-05' + min: '-3.243e-03' + shape: + - 1024 + sum: '-1.302e-02' +grads.network.model.decoder.layers.6.self_attn.q_proj.weight: + device: cuda:0 + max: '4.317e-02' + mean: '-5.287e-09' + min: '-5.174e-02' + shape: + - 1024 + - 1024 + sum: '-5.543e-03' +grads.network.model.decoder.layers.6.self_attn.v_proj.bias: + device: cuda:0 + max: '6.756e-03' + mean: '8.55e-05' + min: '-5.219e-03' + shape: + - 1024 + sum: '8.755e-02' +grads.network.model.decoder.layers.6.self_attn.v_proj.weight: + device: cuda:0 + max: '1.221e-01' + mean: '3.555e-08' + min: '-1.883e-01' + shape: + - 1024 + - 1024 + sum: '3.728e-02' +grads.network.model.decoder.layers.6.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.004e-02' + mean: '2.542e-06' + min: '-9.872e-03' + shape: + - 1024 + sum: '2.603e-03' +grads.network.model.decoder.layers.6.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.376e-02' + mean: '-1.475e-05' + min: '-1.311e-02' + shape: + - 1024 + sum: '-1.511e-02' 
+grads.network.model.decoder.layers.7.fc1.bias: + device: cuda:0 + max: '1.040e-02' + mean: '-1.111e-05' + min: '-5.846e-03' + shape: + - 4096 + sum: '-4.551e-02' +grads.network.model.decoder.layers.7.fc1.weight: + device: cuda:0 + max: '1.282e-01' + mean: '-2.034e-09' + min: '-2.541e-01' + shape: + - 4096 + - 1024 + sum: '-8.530e-03' +grads.network.model.decoder.layers.7.fc2.bias: + device: cuda:0 + max: '8.647e-03' + mean: '-1.819e-12' + min: '-1.108e-02' + shape: + - 1024 + sum: '-1.863e-09' +grads.network.model.decoder.layers.7.fc2.weight: + device: cuda:0 + max: '2.036e-02' + mean: '-2.274e-13' + min: '-2.125e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.7.final_layer_norm.bias: + device: cuda:0 + max: '9.436e-03' + mean: '1.051e-04' + min: '-1.201e-02' + shape: + - 1024 + sum: '1.076e-01' +grads.network.model.decoder.layers.7.final_layer_norm.weight: + device: cuda:0 + max: '2.502e-02' + mean: '-2.608e-06' + min: '-1.341e-02' + shape: + - 1024 + sum: '-2.670e-03' +grads.network.model.decoder.layers.7.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '1.863e-13' + min: '-3.492e-10' + shape: + - 1024 + sum: '1.908e-10' +grads.network.model.decoder.layers.7.self_attn.k_proj.weight: + device: cuda:0 + max: '3.309e-02' + mean: '6.817e-14' + min: '-4.19e-02' + shape: + - 1024 + - 1024 + sum: '7.148e-08' +grads.network.model.decoder.layers.7.self_attn.out_proj.bias: + device: cuda:0 + max: '7.477e-03' + mean: '-5.457e-12' + min: '-9.228e-03' + shape: + - 1024 + sum: '-5.588e-09' +grads.network.model.decoder.layers.7.self_attn.out_proj.weight: + device: cuda:0 + max: '1.003e-02' + mean: '-1.563e-13' + min: '-7.771e-03' + shape: + - 1024 + - 1024 + sum: '-1.639e-07' +grads.network.model.decoder.layers.7.self_attn.q_proj.bias: + device: cuda:0 + max: '2.209e-03' + mean: '-4.411e-06' + min: '-1.604e-03' + shape: + - 1024 + sum: '-4.517e-03' +grads.network.model.decoder.layers.7.self_attn.q_proj.weight: + device: cuda:0 + max: '3.379e-02' + mean: '5.986e-10' + min: '-2.946e-02' + shape: + - 1024 + - 1024 + sum: '6.277e-04' +grads.network.model.decoder.layers.7.self_attn.v_proj.bias: + device: cuda:0 + max: '6.926e-03' + mean: '5.966e-05' + min: '-6.282e-03' + shape: + - 1024 + sum: '6.109e-02' +grads.network.model.decoder.layers.7.self_attn.v_proj.weight: + device: cuda:0 + max: '1.424e-01' + mean: '-8.094e-09' + min: '-1.385e-01' + shape: + - 1024 + - 1024 + sum: '-8.487e-03' +grads.network.model.decoder.layers.7.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.795e-03' + mean: '8.083e-05' + min: '-9.428e-03' + shape: + - 1024 + sum: '8.277e-02' +grads.network.model.decoder.layers.7.self_attn_layer_norm.weight: + device: cuda:0 + max: '3.435e-02' + mean: '-2.633e-06' + min: '-1.194e-02' + shape: + - 1024 + sum: '-2.696e-03' +grads.network.model.decoder.layers.8.fc1.bias: + device: cuda:0 + max: '9.447e-03' + mean: '-1.000e-05' + min: '-1.029e-02' + shape: + - 4096 + sum: '-4.096e-02' +grads.network.model.decoder.layers.8.fc1.weight: + device: cuda:0 + max: '1.788e-01' + mean: '-1.028e-08' + min: '-1.565e-01' + shape: + - 4096 + - 1024 + sum: '-4.31e-02' +grads.network.model.decoder.layers.8.fc2.bias: + device: cuda:0 + max: '9.312e-03' + mean: '1.819e-11' + min: '-9.654e-03' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.8.fc2.weight: + device: cuda:0 + max: '2.393e-02' + mean: '6.821e-13' + min: '-1.897e-02' + shape: + - 1024 + - 4096 + sum: '2.861e-06' 
+grads.network.model.decoder.layers.8.final_layer_norm.bias: + device: cuda:0 + max: '1.033e-02' + mean: '-9.404e-05' + min: '-1.074e-02' + shape: + - 1024 + sum: '-9.63e-02' +grads.network.model.decoder.layers.8.final_layer_norm.weight: + device: cuda:0 + max: '8.312e-03' + mean: '-3.398e-05' + min: '-2.52e-02' + shape: + - 1024 + sum: '-3.479e-02' +grads.network.model.decoder.layers.8.self_attn.k_proj.bias: + device: cuda:0 + max: '4.657e-10' + mean: '1.157e-12' + min: '-7.567e-10' + shape: + - 1024 + sum: '1.185e-09' +grads.network.model.decoder.layers.8.self_attn.k_proj.weight: + device: cuda:0 + max: '2.660e-02' + mean: '-1.255e-14' + min: '-2.215e-02' + shape: + - 1024 + - 1024 + sum: '-1.315e-08' +grads.network.model.decoder.layers.8.self_attn.out_proj.bias: + device: cuda:0 + max: '8.574e-03' + mean: '-1.091e-11' + min: '-1.133e-02' + shape: + - 1024 + sum: '-1.118e-08' +grads.network.model.decoder.layers.8.self_attn.out_proj.weight: + device: cuda:0 + max: '5.791e-03' + mean: '1.776e-13' + min: '-7.842e-03' + shape: + - 1024 + - 1024 + sum: '1.863e-07' +grads.network.model.decoder.layers.8.self_attn.q_proj.bias: + device: cuda:0 + max: '2.176e-03' + mean: '1.136e-05' + min: '-1.464e-03' + shape: + - 1024 + sum: '1.164e-02' +grads.network.model.decoder.layers.8.self_attn.q_proj.weight: + device: cuda:0 + max: '2.919e-02' + mean: '-1.766e-08' + min: '-3.662e-02' + shape: + - 1024 + - 1024 + sum: '-1.852e-02' +grads.network.model.decoder.layers.8.self_attn.v_proj.bias: + device: cuda:0 + max: '7.759e-03' + mean: '5.574e-05' + min: '-1.002e-02' + shape: + - 1024 + sum: '5.708e-02' +grads.network.model.decoder.layers.8.self_attn.v_proj.weight: + device: cuda:0 + max: '2.583e-01' + mean: '-8.663e-08' + min: '-1.763e-01' + shape: + - 1024 + - 1024 + sum: '-9.083e-02' +grads.network.model.decoder.layers.8.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.934e-03' + mean: '3.720e-05' + min: '-1.170e-02' + shape: + - 1024 + sum: '3.81e-02' +grads.network.model.decoder.layers.8.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.159e-02' + mean: '-3.363e-06' + min: '-1.334e-02' + shape: + - 1024 + sum: '-3.444e-03' +grads.network.model.decoder.layers.9.fc1.bias: + device: cuda:0 + max: '1.084e-02' + mean: '-1.724e-05' + min: '-8.211e-03' + shape: + - 4096 + sum: '-7.062e-02' +grads.network.model.decoder.layers.9.fc1.weight: + device: cuda:0 + max: '1.987e-01' + mean: '-1.661e-08' + min: '-2.721e-01' + shape: + - 4096 + - 1024 + sum: '-6.966e-02' +grads.network.model.decoder.layers.9.fc2.bias: + device: cuda:0 + max: '1.032e-02' + mean: '-7.276e-12' + min: '-1.013e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.9.fc2.weight: + device: cuda:0 + max: '2.487e-02' + mean: '-5.684e-13' + min: '-2.754e-02' + shape: + - 1024 + - 4096 + sum: '-2.384e-06' +grads.network.model.decoder.layers.9.final_layer_norm.bias: + device: cuda:0 + max: '1.148e-02' + mean: '-7.486e-05' + min: '-1.105e-02' + shape: + - 1024 + sum: '-7.665e-02' +grads.network.model.decoder.layers.9.final_layer_norm.weight: + device: cuda:0 + max: '5.081e-02' + mean: '3.829e-06' + min: '-1.181e-02' + shape: + - 1024 + sum: '3.921e-03' +grads.network.model.decoder.layers.9.self_attn.k_proj.bias: + device: cuda:0 + max: '1.397e-09' + mean: '-3.783e-12' + min: '-2.095e-09' + shape: + - 1024 + sum: '-3.874e-09' +grads.network.model.decoder.layers.9.self_attn.k_proj.weight: + device: cuda:0 + max: '1.288e-01' + mean: '2.314e-13' + min: '-1.159e-01' + shape: + - 1024 + - 1024 + sum: '2.427e-07' 
+grads.network.model.decoder.layers.9.self_attn.out_proj.bias: + device: cuda:0 + max: '9.677e-03' + mean: '-2.183e-11' + min: '-9.679e-03' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.9.self_attn.out_proj.weight: + device: cuda:0 + max: '8.051e-03' + mean: '2.558e-13' + min: '-8.809e-03' + shape: + - 1024 + - 1024 + sum: '2.682e-07' +grads.network.model.decoder.layers.9.self_attn.q_proj.bias: + device: cuda:0 + max: '3.228e-03' + mean: '-6.335e-06' + min: '-4.683e-03' + shape: + - 1024 + sum: '-6.487e-03' +grads.network.model.decoder.layers.9.self_attn.q_proj.weight: + device: cuda:0 + max: '8.449e-02' + mean: '2.055e-08' + min: '-6.571e-02' + shape: + - 1024 + - 1024 + sum: '2.155e-02' +grads.network.model.decoder.layers.9.self_attn.v_proj.bias: + device: cuda:0 + max: '1.115e-02' + mean: '-3.493e-05' + min: '-9.448e-03' + shape: + - 1024 + sum: '-3.577e-02' +grads.network.model.decoder.layers.9.self_attn.v_proj.weight: + device: cuda:0 + max: '2.284e-01' + mean: '1.133e-07' + min: '-2.614e-01' + shape: + - 1024 + - 1024 + sum: '1.188e-01' +grads.network.model.decoder.layers.9.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.015e-02' + mean: '4.447e-05' + min: '-1.010e-02' + shape: + - 1024 + sum: '4.553e-02' +grads.network.model.decoder.layers.9.self_attn_layer_norm.weight: + device: cuda:0 + max: '9.655e-03' + mean: '2.292e-06' + min: '-2.027e-02' + shape: + - 1024 + sum: '2.347e-03' +grads.network.model.decoder.project_in.weight: + device: cuda:0 + max: '2.645e-02' + mean: '-3.396e-07' + min: '-2.839e-02' + shape: + - 1024 + - 512 + sum: '-1.780e-01' +grads.network.model.decoder.project_out.weight: + device: cuda:0 + max: '9.968e-02' + mean: '-3.139e-07' + min: '-1.016e-01' + shape: + - 512 + - 1024 + sum: '-1.646e-01' +outputs.loss: + device: cuda:0 + max: '4.05e+00' + mean: '4.05e+00' + min: '4.05e+00' + shape: [] + sum: '4.05e+00' diff --git a/.regression_files/project/algorithms/llm_finetuning_test/test_forward_pass_is_reproducible/cuda/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_forward_pass_is_reproducible/cuda/llm_finetuning.yaml new file mode 100644 index 00000000..41f33102 --- /dev/null +++ b/.regression_files/project/algorithms/llm_finetuning_test/test_forward_pass_is_reproducible/cuda/llm_finetuning.yaml @@ -0,0 +1,572 @@ +input.attention_mask: + device: cuda:0 + max: 1 + mean: '1.e+00' + min: 1 + shape: + - 8 + - 256 + sum: 2048 +input.input_ids: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +input.labels: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +out.logits: + device: cuda:0 + max: '3.537e+01' + mean: '-4.715e+00' + min: '-3.336e+01' + shape: + - 8 + - 256 + - 50272 + sum: '-4.855e+08' +out.loss: + device: cuda:0 + max: '4.05e+00' + mean: '4.05e+00' + min: '4.05e+00' + shape: [] + sum: '4.05e+00' +out.past_key_values.0.0: + device: cuda:0 + max: '1.824e+00' + mean: '-3.677e-03' + min: '-2.004e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-7.711e+03' +out.past_key_values.0.1: + device: cuda:0 + max: '1.91e-01' + mean: '6.668e-05' + min: '-1.719e-01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.398e+02' +out.past_key_values.1.0: + device: cuda:0 + max: '1.150e+01' + mean: '5.521e-03' + min: '-1.144e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.158e+04' +out.past_key_values.1.1: + device: cuda:0 + max: '4.35e+00' + mean: '2.593e-03' + min: '-4.527e+00' + shape: + - 8 + - 16 + - 256 + 
- 64 + sum: '5.439e+03' +out.past_key_values.10.0: + device: cuda:0 + max: '9.741e+00' + mean: '5.765e-02' + min: '-1.030e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.209e+05' +out.past_key_values.10.1: + device: cuda:0 + max: '5.526e+00' + mean: '1.023e-02' + min: '-5.248e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.145e+04' +out.past_key_values.11.0: + device: cuda:0 + max: '9.2e+00' + mean: '4.524e-02' + min: '-8.32e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '9.488e+04' +out.past_key_values.11.1: + device: cuda:0 + max: '4.676e+00' + mean: '7.994e-03' + min: '-4.337e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.676e+04' +out.past_key_values.12.0: + device: cuda:0 + max: '8.099e+00' + mean: '-4.339e-03' + min: '-8.358e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-9.101e+03' +out.past_key_values.12.1: + device: cuda:0 + max: '5.357e+00' + mean: '7.804e-03' + min: '-5.152e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.637e+04' +out.past_key_values.13.0: + device: cuda:0 + max: '8.449e+00' + mean: '-9.491e-03' + min: '-8.29e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-1.990e+04' +out.past_key_values.13.1: + device: cuda:0 + max: '4.555e+00' + mean: '3.872e-03' + min: '-5.178e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '8.120e+03' +out.past_key_values.14.0: + device: cuda:0 + max: '7.696e+00' + mean: '-4.042e-02' + min: '-8.394e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-8.477e+04' +out.past_key_values.14.1: + device: cuda:0 + max: '5.031e+00' + mean: '3.803e-03' + min: '-5.123e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '7.976e+03' +out.past_key_values.15.0: + device: cuda:0 + max: '8.108e+00' + mean: '2.572e-02' + min: '-1.000e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '5.394e+04' +out.past_key_values.15.1: + device: cuda:0 + max: '4.85e+00' + mean: '-8.774e-03' + min: '-4.855e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-1.840e+04' +out.past_key_values.16.0: + device: cuda:0 + max: '8.927e+00' + mean: '-1.676e-02' + min: '-8.144e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-3.515e+04' +out.past_key_values.16.1: + device: cuda:0 + max: '4.793e+00' + mean: '-1.081e-02' + min: '-5.854e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-2.268e+04' +out.past_key_values.17.0: + device: cuda:0 + max: '1.004e+01' + mean: '2.810e-02' + min: '-9.726e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '5.893e+04' +out.past_key_values.17.1: + device: cuda:0 + max: '5.284e+00' + mean: '5.285e-03' + min: '-5.681e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.108e+04' +out.past_key_values.18.0: + device: cuda:0 + max: '8.982e+00' + mean: '5.052e-02' + min: '-8.762e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.059e+05' +out.past_key_values.18.1: + device: cuda:0 + max: '4.748e+00' + mean: '-1.694e-03' + min: '-4.891e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-3.554e+03' +out.past_key_values.19.0: + device: cuda:0 + max: '9.813e+00' + mean: '1.273e-02' + min: '-9.707e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.670e+04' +out.past_key_values.19.1: + device: cuda:0 + max: '4.619e+00' + mean: '-1.924e-02' + min: '-4.700e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-4.036e+04' +out.past_key_values.2.0: + device: cuda:0 + max: '1.074e+01' + mean: '6.862e-02' + min: '-1.063e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.439e+05' +out.past_key_values.2.1: + device: cuda:0 + max: '4.396e+00' + mean: '2.223e-03' + min: '-4.462e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '4.662e+03' 
+out.past_key_values.20.0: + device: cuda:0 + max: '1.106e+01' + mean: '5.73e-02' + min: '-1.099e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.202e+05' +out.past_key_values.20.1: + device: cuda:0 + max: '4.813e+00' + mean: '6.246e-03' + min: '-5.477e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.31e+04' +out.past_key_values.21.0: + device: cuda:0 + max: '1.079e+01' + mean: '4.522e-02' + min: '-1.039e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '9.484e+04' +out.past_key_values.21.1: + device: cuda:0 + max: '4.631e+00' + mean: '1.379e-02' + min: '-4.818e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.891e+04' +out.past_key_values.22.0: + device: cuda:0 + max: '1.065e+01' + mean: '4.017e-02' + min: '-1.125e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '8.425e+04' +out.past_key_values.22.1: + device: cuda:0 + max: '5.105e+00' + mean: '5.328e-03' + min: '-4.445e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.117e+04' +out.past_key_values.23.0: + device: cuda:0 + max: '9.464e+00' + mean: '1.056e-02' + min: '-8.453e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.214e+04' +out.past_key_values.23.1: + device: cuda:0 + max: '4.379e+00' + mean: '-1.464e-03' + min: '-4.951e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-3.069e+03' +out.past_key_values.3.0: + device: cuda:0 + max: '1.142e+01' + mean: '4.512e-02' + min: '-1.147e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '9.462e+04' +out.past_key_values.3.1: + device: cuda:0 + max: '4.416e+00' + mean: '-3.978e-04' + min: '-4.476e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-8.342e+02' +out.past_key_values.4.0: + device: cuda:0 + max: '1.193e+01' + mean: '-3.041e-02' + min: '-1.091e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-6.377e+04' +out.past_key_values.4.1: + device: cuda:0 + max: '4.839e+00' + mean: '-4.185e-04' + min: '-5.120e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-8.776e+02' +out.past_key_values.5.0: + device: cuda:0 + max: '1.230e+01' + mean: '4.608e-02' + min: '-1.164e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '9.664e+04' +out.past_key_values.5.1: + device: cuda:0 + max: '5.191e+00' + mean: '1.398e-03' + min: '-4.402e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.932e+03' +out.past_key_values.6.0: + device: cuda:0 + max: '1.248e+01' + mean: '6.588e-03' + min: '-1.322e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.382e+04' +out.past_key_values.6.1: + device: cuda:0 + max: '4.148e+00' + mean: '5.169e-03' + min: '-4.295e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.084e+04' +out.past_key_values.7.0: + device: cuda:0 + max: '1.326e+01' + mean: '-1.400e-02' + min: '-1.272e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-2.936e+04' +out.past_key_values.7.1: + device: cuda:0 + max: '4.043e+00' + mean: '5.246e-03' + min: '-3.823e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.100e+04' +out.past_key_values.8.0: + device: cuda:0 + max: '1.329e+01' + mean: '1.543e-02' + min: '-1.222e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '3.235e+04' +out.past_key_values.8.1: + device: cuda:0 + max: '4.179e+00' + mean: '-1.275e-03' + min: '-4.191e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-2.674e+03' +out.past_key_values.9.0: + device: cuda:0 + max: '1.514e+01' + mean: '-1.051e-01' + min: '-1.701e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-2.204e+05' +out.past_key_values.9.1: + device: cuda:0 + max: '4.456e+00' + mean: '3.825e-04' + min: '-4.440e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '8.022e+02' diff --git 
a/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml new file mode 100644 index 00000000..9e7c6ffb --- /dev/null +++ b/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml @@ -0,0 +1,3261 @@ +network.lm_head.weight: + device: cuda:0 + max: '2.372e-01' + mean: '-1.208e-03' + min: '-2.5e-01' + shape: + - 50272 + - 512 + sum: '-3.109e+04' +network.model.decoder.embed_positions.weight: + device: cuda:0 + max: '1.327e-01' + mean: '1.768e-05' + min: '-1.379e-01' + shape: + - 2050 + - 1024 + sum: '3.711e+01' +network.model.decoder.embed_tokens.weight: + device: cuda:0 + max: '2.372e-01' + mean: '-1.208e-03' + min: '-2.5e-01' + shape: + - 50272 + - 512 + sum: '-3.109e+04' +network.model.decoder.layers.0.fc1.bias: + device: cuda:0 + max: '1.249e-01' + mean: '-2.961e-02' + min: '-1.085e-01' + shape: + - 4096 + sum: '-1.213e+02' +network.model.decoder.layers.0.fc1.weight: + device: cuda:0 + max: '1.25e-01' + mean: '1.667e-04' + min: '-1.251e-01' + shape: + - 4096 + - 1024 + sum: '6.992e+02' +network.model.decoder.layers.0.fc2.bias: + device: cuda:0 + max: '7.88e-02' + mean: '-8.293e-05' + min: '-9.351e-02' + shape: + - 1024 + sum: '-8.492e-02' +network.model.decoder.layers.0.fc2.weight: + device: cuda:0 + max: '1.331e-01' + mean: '5.357e-06' + min: '-1.448e-01' + shape: + - 1024 + - 4096 + sum: '2.247e+01' +network.model.decoder.layers.0.final_layer_norm.bias: + device: cuda:0 + max: '1.256e-01' + mean: '7.015e-03' + min: '-1.204e-01' + shape: + - 1024 + sum: '7.183e+00' +network.model.decoder.layers.0.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.0.self_attn.k_proj.bias: + device: cuda:0 + max: '3.125e-02' + mean: '3.414e-04' + min: '-3.123e-02' + shape: + - 1024 + sum: '3.496e-01' +network.model.decoder.layers.0.self_attn.k_proj.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-4.626e-05' + min: '-1.256e-01' + shape: + - 1024 + - 1024 + sum: '-4.850e+01' +network.model.decoder.layers.0.self_attn.out_proj.bias: + device: cuda:0 + max: '1.579e-02' + mean: '-2.766e-05' + min: '-1.138e-02' + shape: + - 1024 + sum: '-2.833e-02' +network.model.decoder.layers.0.self_attn.out_proj.weight: + device: cuda:0 + max: '1.283e-01' + mean: '-6.181e-06' + min: '-1.295e-01' + shape: + - 1024 + - 1024 + sum: '-6.481e+00' +network.model.decoder.layers.0.self_attn.q_proj.bias: + device: cuda:0 + max: '1.282e-01' + mean: '1.180e-03' + min: '-1.271e-01' + shape: + - 1024 + sum: '1.208e+00' +network.model.decoder.layers.0.self_attn.q_proj.weight: + device: cuda:0 + max: '1.267e-01' + mean: '-5.663e-05' + min: '-1.267e-01' + shape: + - 1024 + - 1024 + sum: '-5.938e+01' +network.model.decoder.layers.0.self_attn.v_proj.bias: + device: cuda:0 + max: '2.769e-02' + mean: '-2.715e-05' + min: '-2.669e-02' + shape: + - 1024 + sum: '-2.780e-02' +network.model.decoder.layers.0.self_attn.v_proj.weight: + device: cuda:0 + max: '8.795e-02' + mean: '1.917e-06' + min: '-8.508e-02' + shape: + - 1024 + - 1024 + sum: '2.011e+00' +network.model.decoder.layers.0.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.271e-01' + mean: '-2.03e-03' + min: '-1.248e-01' + shape: + - 1024 + sum: '-2.079e+00' +network.model.decoder.layers.0.self_attn_layer_norm.weight: + device: cuda:0 + 
max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.1.fc1.bias: + device: cuda:0 + max: '1.236e-01' + mean: '-2.428e-02' + min: '-8.075e-02' + shape: + - 4096 + sum: '-9.946e+01' +network.model.decoder.layers.1.fc1.weight: + device: cuda:0 + max: '1.254e-01' + mean: '1.85e-04' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '7.759e+02' +network.model.decoder.layers.1.fc2.bias: + device: cuda:0 + max: '8.911e-02' + mean: '2.946e-04' + min: '-8.362e-02' + shape: + - 1024 + sum: '3.017e-01' +network.model.decoder.layers.1.fc2.weight: + device: cuda:0 + max: '1.321e-01' + mean: '-2.468e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.035e+01' +network.model.decoder.layers.1.final_layer_norm.bias: + device: cuda:0 + max: '1.256e-01' + mean: '8.647e-03' + min: '-1.198e-01' + shape: + - 1024 + sum: '8.855e+00' +network.model.decoder.layers.1.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.1.self_attn.k_proj.bias: + device: cuda:0 + max: '7.153e-02' + mean: '7.902e-03' + min: '-7.874e-02' + shape: + - 1024 + sum: '8.092e+00' +network.model.decoder.layers.1.self_attn.k_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.284e-05' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: '-1.346e+01' +network.model.decoder.layers.1.self_attn.out_proj.bias: + device: cuda:0 + max: '8.606e-02' + mean: '-1.118e-04' + min: '-7.031e-02' + shape: + - 1024 + sum: '-1.144e-01' +network.model.decoder.layers.1.self_attn.out_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '1.676e-06' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: '1.758e+00' +network.model.decoder.layers.1.self_attn.q_proj.bias: + device: cuda:0 + max: '1.254e-01' + mean: '-1.557e-03' + min: '-1.252e-01' + shape: + - 1024 + sum: '-1.595e+00' +network.model.decoder.layers.1.self_attn.q_proj.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-3.561e-05' + min: '-1.26e-01' + shape: + - 1024 + - 1024 + sum: '-3.734e+01' +network.model.decoder.layers.1.self_attn.v_proj.bias: + device: cuda:0 + max: '5.002e-02' + mean: '3.967e-04' + min: '-4.831e-02' + shape: + - 1024 + sum: '4.062e-01' +network.model.decoder.layers.1.self_attn.v_proj.weight: + device: cuda:0 + max: '1.092e-01' + mean: '1.417e-05' + min: '-1.07e-01' + shape: + - 1024 + - 1024 + sum: '1.486e+01' +network.model.decoder.layers.1.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.304e-01' + mean: '-2.029e-03' + min: '-1.248e-01' + shape: + - 1024 + sum: '-2.078e+00' +network.model.decoder.layers.1.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.10.fc1.bias: + device: cuda:0 + max: '5.505e-02' + mean: '-2.099e-02' + min: '-8.49e-02' + shape: + - 4096 + sum: '-8.599e+01' +network.model.decoder.layers.10.fc1.weight: + device: cuda:0 + max: '1.27e-01' + mean: '1.603e-05' + min: '-1.296e-01' + shape: + - 4096 + - 1024 + sum: '6.723e+01' +network.model.decoder.layers.10.fc2.bias: + device: cuda:0 + max: '6.293e-02' + mean: '-1.937e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.983e-01' +network.model.decoder.layers.10.fc2.weight: + device: cuda:0 + max: '1.281e-01' + mean: '-1.624e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-6.81e+00' +network.model.decoder.layers.10.final_layer_norm.bias: + device: cuda:0 + max: '8.020e-02' + mean: '-9.374e-03' + 
min: '-1.25e-01' + shape: + - 1024 + sum: '-9.599e+00' +network.model.decoder.layers.10.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.10.self_attn.k_proj.bias: + device: cuda:0 + max: '7.422e-02' + mean: '7.871e-03' + min: '-7.428e-02' + shape: + - 1024 + sum: '8.06e+00' +network.model.decoder.layers.10.self_attn.k_proj.weight: + device: cuda:0 + max: '1.318e-01' + mean: '-1.478e-05' + min: '-1.285e-01' + shape: + - 1024 + - 1024 + sum: '-1.55e+01' +network.model.decoder.layers.10.self_attn.out_proj.bias: + device: cuda:0 + max: '7.031e-02' + mean: '-2.308e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.363e-02' +network.model.decoder.layers.10.self_attn.out_proj.weight: + device: cuda:0 + max: '1.321e-01' + mean: '1.384e-06' + min: '-1.316e-01' + shape: + - 1024 + - 1024 + sum: '1.452e+00' +network.model.decoder.layers.10.self_attn.q_proj.bias: + device: cuda:0 + max: '1.089e-01' + mean: '-1.708e-03' + min: '-1.009e-01' + shape: + - 1024 + sum: '-1.749e+00' +network.model.decoder.layers.10.self_attn.q_proj.weight: + device: cuda:0 + max: '1.300e-01' + mean: '5.200e-06' + min: '-1.311e-01' + shape: + - 1024 + - 1024 + sum: '5.453e+00' +network.model.decoder.layers.10.self_attn.v_proj.bias: + device: cuda:0 + max: '5.096e-02' + mean: '3.204e-04' + min: '-5.444e-02' + shape: + - 1024 + sum: '3.281e-01' +network.model.decoder.layers.10.self_attn.v_proj.weight: + device: cuda:0 + max: '1.241e-01' + mean: '1.173e-05' + min: '-1.152e-01' + shape: + - 1024 + - 1024 + sum: '1.229e+01' +network.model.decoder.layers.10.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.594e-02' + mean: '1.188e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.217e+00' +network.model.decoder.layers.10.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.11.fc1.bias: + device: cuda:0 + max: '6.107e-02' + mean: '-2.344e-02' + min: '-8.850e-02' + shape: + - 4096 + sum: '-9.601e+01' +network.model.decoder.layers.11.fc1.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-1.888e-04' + min: '-1.263e-01' + shape: + - 4096 + - 1024 + sum: '-7.920e+02' +network.model.decoder.layers.11.fc2.bias: + device: cuda:0 + max: '6.47e-02' + mean: '1.148e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.176e-01' +network.model.decoder.layers.11.fc2.weight: + device: cuda:0 + max: '1.26e-01' + mean: '3.113e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '1.306e+00' +network.model.decoder.layers.11.final_layer_norm.bias: + device: cuda:0 + max: '7.886e-02' + mean: '-1.455e-02' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.489e+01' +network.model.decoder.layers.11.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.11.self_attn.k_proj.bias: + device: cuda:0 + max: '7.074e-02' + mean: '5.886e-03' + min: '-6.482e-02' + shape: + - 1024 + sum: '6.027e+00' +network.model.decoder.layers.11.self_attn.k_proj.weight: + device: cuda:0 + max: '1.331e-01' + mean: '1.017e-05' + min: '-1.31e-01' + shape: + - 1024 + - 1024 + sum: '1.066e+01' +network.model.decoder.layers.11.self_attn.out_proj.bias: + device: cuda:0 + max: '6.311e-02' + mean: '-3.316e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-3.396e-01' +network.model.decoder.layers.11.self_attn.out_proj.weight: + device: cuda:0 + max: '1.514e-01' + 
mean: '1.601e-05' + min: '-1.647e-01' + shape: + - 1024 + - 1024 + sum: '1.679e+01' +network.model.decoder.layers.11.self_attn.q_proj.bias: + device: cuda:0 + max: '1.105e-01' + mean: '-2.709e-03' + min: '-1.172e-01' + shape: + - 1024 + sum: '-2.774e+00' +network.model.decoder.layers.11.self_attn.q_proj.weight: + device: cuda:0 + max: '1.287e-01' + mean: '5.092e-06' + min: '-1.26e-01' + shape: + - 1024 + - 1024 + sum: '5.339e+00' +network.model.decoder.layers.11.self_attn.v_proj.bias: + device: cuda:0 + max: '3.922e-02' + mean: '4.083e-04' + min: '-4.712e-02' + shape: + - 1024 + sum: '4.180e-01' +network.model.decoder.layers.11.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '-8.525e-05' + min: '-1.197e-01' + shape: + - 1024 + - 1024 + sum: '-8.939e+01' +network.model.decoder.layers.11.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.046e-01' + mean: '4.110e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.209e+00' +network.model.decoder.layers.11.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.12.fc1.bias: + device: cuda:0 + max: '7.367e-02' + mean: '-2.188e-02' + min: '-7.434e-02' + shape: + - 4096 + sum: '-8.961e+01' +network.model.decoder.layers.12.fc1.weight: + device: cuda:0 + max: '1.274e-01' + mean: '-2.221e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-9.314e+02' +network.model.decoder.layers.12.fc2.bias: + device: cuda:0 + max: '7.233e-02' + mean: '-3.044e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-3.118e-01' +network.model.decoder.layers.12.fc2.weight: + device: cuda:0 + max: '1.265e-01' + mean: '1.128e-07' + min: '-1.393e-01' + shape: + - 1024 + - 4096 + sum: '4.732e-01' +network.model.decoder.layers.12.final_layer_norm.bias: + device: cuda:0 + max: '1.241e-01' + mean: '-1.53e-02' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.566e+01' +network.model.decoder.layers.12.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.12.self_attn.k_proj.bias: + device: cuda:0 + max: '1.177e-01' + mean: '6.118e-03' + min: '-8.82e-02' + shape: + - 1024 + sum: '6.265e+00' +network.model.decoder.layers.12.self_attn.k_proj.weight: + device: cuda:0 + max: '1.274e-01' + mean: '2.051e-05' + min: '-1.263e-01' + shape: + - 1024 + - 1024 + sum: '2.151e+01' +network.model.decoder.layers.12.self_attn.out_proj.bias: + device: cuda:0 + max: '6.604e-02' + mean: '-4.053e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-4.151e-01' +network.model.decoder.layers.12.self_attn.out_proj.weight: + device: cuda:0 + max: '1.273e-01' + mean: '6.458e-06' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '6.772e+00' +network.model.decoder.layers.12.self_attn.q_proj.bias: + device: cuda:0 + max: '1.249e-01' + mean: '3.377e-04' + min: '-1.248e-01' + shape: + - 1024 + sum: '3.458e-01' +network.model.decoder.layers.12.self_attn.q_proj.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-4.44e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-4.655e+01' +network.model.decoder.layers.12.self_attn.v_proj.bias: + device: cuda:0 + max: '5.71e-02' + mean: '1.127e-04' + min: '-4.361e-02' + shape: + - 1024 + sum: '1.155e-01' +network.model.decoder.layers.12.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '5.265e-05' + min: '-1.251e-01' + shape: + - 1024 + - 1024 + sum: '5.521e+01' 
+network.model.decoder.layers.12.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.025e-01' + mean: '4.391e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.497e+00' +network.model.decoder.layers.12.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.13.fc1.bias: + device: cuda:0 + max: '9.039e-02' + mean: '-2.392e-02' + min: '-7.361e-02' + shape: + - 4096 + sum: '-9.798e+01' +network.model.decoder.layers.13.fc1.weight: + device: cuda:0 + max: '1.263e-01' + mean: '-2.766e-04' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '-1.160e+03' +network.model.decoder.layers.13.fc2.bias: + device: cuda:0 + max: '7.214e-02' + mean: '2.524e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.584e-01' +network.model.decoder.layers.13.fc2.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-2.636e-06' + min: '-1.754e-01' + shape: + - 1024 + - 4096 + sum: '-1.106e+01' +network.model.decoder.layers.13.final_layer_norm.bias: + device: cuda:0 + max: '1.246e-01' + mean: '-2.340e-02' + min: '-1.254e-01' + shape: + - 1024 + sum: '-2.396e+01' +network.model.decoder.layers.13.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.13.self_attn.k_proj.bias: + device: cuda:0 + max: '7.465e-02' + mean: '5.789e-03' + min: '-7.758e-02' + shape: + - 1024 + sum: '5.928e+00' +network.model.decoder.layers.13.self_attn.k_proj.weight: + device: cuda:0 + max: '1.281e-01' + mean: '3.542e-05' + min: '-1.283e-01' + shape: + - 1024 + - 1024 + sum: '3.714e+01' +network.model.decoder.layers.13.self_attn.out_proj.bias: + device: cuda:0 + max: '6.506e-02' + mean: '-2.055e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.104e-01' +network.model.decoder.layers.13.self_attn.out_proj.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.117e-05' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '-1.171e+01' +network.model.decoder.layers.13.self_attn.q_proj.bias: + device: cuda:0 + max: '1.247e-01' + mean: '-2.867e-03' + min: '-1.138e-01' + shape: + - 1024 + sum: '-2.936e+00' +network.model.decoder.layers.13.self_attn.q_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '3.923e-05' + min: '-1.273e-01' + shape: + - 1024 + - 1024 + sum: '4.114e+01' +network.model.decoder.layers.13.self_attn.v_proj.bias: + device: cuda:0 + max: '4.150e-02' + mean: '-2.426e-04' + min: '-4.178e-02' + shape: + - 1024 + sum: '-2.485e-01' +network.model.decoder.layers.13.self_attn.v_proj.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-6.461e-05' + min: '-1.251e-01' + shape: + - 1024 + - 1024 + sum: '-6.775e+01' +network.model.decoder.layers.13.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.247e-01' + mean: '3.063e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.137e+00' +network.model.decoder.layers.13.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.14.fc1.bias: + device: cuda:0 + max: '6.329e-02' + mean: '-2.279e-02' + min: '-6.866e-02' + shape: + - 4096 + sum: '-9.333e+01' +network.model.decoder.layers.14.fc1.weight: + device: cuda:0 + max: '1.261e-01' + mean: '-1.687e-04' + min: '-1.256e-01' + shape: + - 4096 + - 1024 + sum: '-7.075e+02' +network.model.decoder.layers.14.fc2.bias: + device: cuda:0 + max: '8.209e-02' + mean: '2.395e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.453e-01' 
+network.model.decoder.layers.14.fc2.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-1.073e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-4.501e+00' +network.model.decoder.layers.14.final_layer_norm.bias: + device: cuda:0 + max: '1.249e-01' + mean: '-2.171e-02' + min: '-1.277e-01' + shape: + - 1024 + sum: '-2.223e+01' +network.model.decoder.layers.14.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.14.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '4.583e-03' + min: '-1.03e-01' + shape: + - 1024 + sum: '4.693e+00' +network.model.decoder.layers.14.self_attn.k_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '3.023e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '3.170e+01' +network.model.decoder.layers.14.self_attn.out_proj.bias: + device: cuda:0 + max: '6.335e-02' + mean: '-2.293e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.348e-01' +network.model.decoder.layers.14.self_attn.out_proj.weight: + device: cuda:0 + max: '1.292e-01' + mean: '-1.601e-05' + min: '-1.316e-01' + shape: + - 1024 + - 1024 + sum: '-1.679e+01' +network.model.decoder.layers.14.self_attn.q_proj.bias: + device: cuda:0 + max: '1.237e-01' + mean: '-1.509e-03' + min: '-1.181e-01' + shape: + - 1024 + sum: '-1.546e+00' +network.model.decoder.layers.14.self_attn.q_proj.weight: + device: cuda:0 + max: '1.263e-01' + mean: '3.587e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '3.761e+01' +network.model.decoder.layers.14.self_attn.v_proj.bias: + device: cuda:0 + max: '4.108e-02' + mean: '4.279e-04' + min: '-3.915e-02' + shape: + - 1024 + sum: '4.381e-01' +network.model.decoder.layers.14.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '6.315e-06' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '6.622e+00' +network.model.decoder.layers.14.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '9.48e-04' + min: '-1.285e-01' + shape: + - 1024 + sum: '9.707e-01' +network.model.decoder.layers.14.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.15.fc1.bias: + device: cuda:0 + max: '6.256e-02' + mean: '-2.178e-02' + min: '-7.373e-02' + shape: + - 4096 + sum: '-8.921e+01' +network.model.decoder.layers.15.fc1.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-2.048e-04' + min: '-1.274e-01' + shape: + - 4096 + - 1024 + sum: '-8.590e+02' +network.model.decoder.layers.15.fc2.bias: + device: cuda:0 + max: '7.629e-02' + mean: '-2.647e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.711e-01' +network.model.decoder.layers.15.fc2.weight: + device: cuda:0 + max: '1.273e-01' + mean: '-1.300e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-5.454e+00' +network.model.decoder.layers.15.final_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '-2.09e-02' + min: '-1.271e-01' + shape: + - 1024 + sum: '-2.14e+01' +network.model.decoder.layers.15.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.15.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '5.291e-03' + min: '-8.069e-02' + shape: + - 1024 + sum: '5.418e+00' +network.model.decoder.layers.15.self_attn.k_proj.weight: + device: cuda:0 + max: '1.259e-01' + mean: '3.431e-05' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: 
'3.598e+01' +network.model.decoder.layers.15.self_attn.out_proj.bias: + device: cuda:0 + max: '6.873e-02' + mean: '2.003e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.051e-02' +network.model.decoder.layers.15.self_attn.out_proj.weight: + device: cuda:0 + max: '1.798e-01' + mean: '1.003e-06' + min: '-1.726e-01' + shape: + - 1024 + - 1024 + sum: '1.052e+00' +network.model.decoder.layers.15.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.456e-03' + min: '-1.242e-01' + shape: + - 1024 + sum: '1.491e+00' +network.model.decoder.layers.15.self_attn.q_proj.weight: + device: cuda:0 + max: '1.271e-01' + mean: '-2.108e-05' + min: '-1.259e-01' + shape: + - 1024 + - 1024 + sum: '-2.21e+01' +network.model.decoder.layers.15.self_attn.v_proj.bias: + device: cuda:0 + max: '4.312e-02' + mean: '-6.573e-04' + min: '-4.214e-02' + shape: + - 1024 + sum: '-6.731e-01' +network.model.decoder.layers.15.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '-1.231e-04' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-1.291e+02' +network.model.decoder.layers.15.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.033e-03' + min: '-1.627e-01' + shape: + - 1024 + sum: '1.058e+00' +network.model.decoder.layers.15.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.16.fc1.bias: + device: cuda:0 + max: '1.138e-01' + mean: '-2.057e-02' + min: '-8.105e-02' + shape: + - 4096 + sum: '-8.427e+01' +network.model.decoder.layers.16.fc1.weight: + device: cuda:0 + max: '1.261e-01' + mean: '-1.731e-04' + min: '-1.263e-01' + shape: + - 4096 + - 1024 + sum: '-7.259e+02' +network.model.decoder.layers.16.fc2.bias: + device: cuda:0 + max: '7.257e-02' + mean: '-1.059e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.085e-01' +network.model.decoder.layers.16.fc2.weight: + device: cuda:0 + max: '1.387e-01' + mean: '-4.515e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.894e+01' +network.model.decoder.layers.16.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.704e-02' + min: '-1.285e-01' + shape: + - 1024 + sum: '-1.745e+01' +network.model.decoder.layers.16.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.16.self_attn.k_proj.bias: + device: cuda:0 + max: '1.117e-01' + mean: '6.356e-03' + min: '-9.009e-02' + shape: + - 1024 + sum: '6.508e+00' +network.model.decoder.layers.16.self_attn.k_proj.weight: + device: cuda:0 + max: '1.27e-01' + mean: '-1.634e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '-1.713e+01' +network.model.decoder.layers.16.self_attn.out_proj.bias: + device: cuda:0 + max: '8.398e-02' + mean: '4.806e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.921e-02' +network.model.decoder.layers.16.self_attn.out_proj.weight: + device: cuda:0 + max: '1.553e-01' + mean: '-3.501e-06' + min: '-1.626e-01' + shape: + - 1024 + - 1024 + sum: '-3.671e+00' +network.model.decoder.layers.16.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.884e-04' + min: '-1.246e-01' + shape: + - 1024 + sum: '-1.929e-01' +network.model.decoder.layers.16.self_attn.q_proj.weight: + device: cuda:0 + max: '1.261e-01' + mean: '2.789e-06' + min: '-1.278e-01' + shape: + - 1024 + - 1024 + sum: '2.924e+00' +network.model.decoder.layers.16.self_attn.v_proj.bias: + device: cuda:0 + max: '4.462e-02' + mean: '-7.8e-04' + 
min: '-4.309e-02' + shape: + - 1024 + sum: '-7.987e-01' +network.model.decoder.layers.16.self_attn.v_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-9.28e-05' + min: '-1.259e-01' + shape: + - 1024 + - 1024 + sum: '-9.731e+01' +network.model.decoder.layers.16.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.252e-01' + mean: '1.154e-03' + min: '-2.112e-01' + shape: + - 1024 + sum: '1.182e+00' +network.model.decoder.layers.16.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.17.fc1.bias: + device: cuda:0 + max: '1.113e-01' + mean: '-2.007e-02' + min: '-7.483e-02' + shape: + - 4096 + sum: '-8.219e+01' +network.model.decoder.layers.17.fc1.weight: + device: cuda:0 + max: '1.27e-01' + mean: '-1.176e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-4.934e+02' +network.model.decoder.layers.17.fc2.bias: + device: cuda:0 + max: '6.415e-02' + mean: '2.448e-06' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.507e-03' +network.model.decoder.layers.17.fc2.weight: + device: cuda:0 + max: '1.431e-01' + mean: '-1.922e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-8.062e+00' +network.model.decoder.layers.17.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.363e-02' + min: '-1.307e-01' + shape: + - 1024 + sum: '-1.396e+01' +network.model.decoder.layers.17.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.17.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.524e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.609e+00' +network.model.decoder.layers.17.self_attn.k_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-6.266e-06' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '-6.571e+00' +network.model.decoder.layers.17.self_attn.out_proj.bias: + device: cuda:0 + max: '8.557e-02' + mean: '7.932e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.123e-02' +network.model.decoder.layers.17.self_attn.out_proj.weight: + device: cuda:0 + max: '1.682e-01' + mean: '1.080e-05' + min: '-1.591e-01' + shape: + - 1024 + - 1024 + sum: '1.133e+01' +network.model.decoder.layers.17.self_attn.q_proj.bias: + device: cuda:0 + max: '1.081e-01' + mean: '8.627e-04' + min: '-1.006e-01' + shape: + - 1024 + sum: '8.834e-01' +network.model.decoder.layers.17.self_attn.q_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-1.448e-05' + min: '-1.262e-01' + shape: + - 1024 + - 1024 + sum: '-1.518e+01' +network.model.decoder.layers.17.self_attn.v_proj.bias: + device: cuda:0 + max: '4.285e-02' + mean: '4.112e-04' + min: '-4.175e-02' + shape: + - 1024 + sum: '4.211e-01' +network.model.decoder.layers.17.self_attn.v_proj.weight: + device: cuda:0 + max: '1.254e-01' + mean: '-1.06e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-1.111e+01' +network.model.decoder.layers.17.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.74e-04' + min: '-1.978e-01' + shape: + - 1024 + sum: '1.781e-01' +network.model.decoder.layers.17.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.18.fc1.bias: + device: cuda:0 + max: '6.793e-02' + mean: '-1.838e-02' + min: '-8.258e-02' + shape: + - 4096 + sum: '-7.527e+01' +network.model.decoder.layers.18.fc1.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.719e-04' + 
min: '-1.256e-01' + shape: + - 4096 + - 1024 + sum: '-7.209e+02' +network.model.decoder.layers.18.fc2.bias: + device: cuda:0 + max: '6.201e-02' + mean: '-3.286e-06' + min: '-1.06e-01' + shape: + - 1024 + sum: '-3.364e-03' +network.model.decoder.layers.18.fc2.weight: + device: cuda:0 + max: '1.271e-01' + mean: '2.113e-06' + min: '-1.885e-01' + shape: + - 1024 + - 4096 + sum: '8.863e+00' +network.model.decoder.layers.18.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.239e-02' + min: '-1.262e-01' + shape: + - 1024 + sum: '-1.268e+01' +network.model.decoder.layers.18.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.18.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '5.307e-03' + min: '-1.218e-01' + shape: + - 1024 + sum: '5.434e+00' +network.model.decoder.layers.18.self_attn.k_proj.weight: + device: cuda:0 + max: '1.26e-01' + mean: '1.154e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '1.210e+01' +network.model.decoder.layers.18.self_attn.out_proj.bias: + device: cuda:0 + max: '7.617e-02' + mean: '-8.257e-06' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.455e-03' +network.model.decoder.layers.18.self_attn.out_proj.weight: + device: cuda:0 + max: '1.453e-01' + mean: '-6.184e-06' + min: '-1.554e-01' + shape: + - 1024 + - 1024 + sum: '-6.484e+00' +network.model.decoder.layers.18.self_attn.q_proj.bias: + device: cuda:0 + max: '1.002e-01' + mean: '-2.302e-03' + min: '-1.179e-01' + shape: + - 1024 + sum: '-2.357e+00' +network.model.decoder.layers.18.self_attn.q_proj.weight: + device: cuda:0 + max: '1.274e-01' + mean: '-2.129e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '-2.233e+01' +network.model.decoder.layers.18.self_attn.v_proj.bias: + device: cuda:0 + max: '4.874e-02' + mean: '-1.296e-04' + min: '-4.315e-02' + shape: + - 1024 + sum: '-1.327e-01' +network.model.decoder.layers.18.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '-5.472e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-5.738e+01' +network.model.decoder.layers.18.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.729e-03' + min: '-1.528e-01' + shape: + - 1024 + sum: '1.771e+00' +network.model.decoder.layers.18.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.19.fc1.bias: + device: cuda:0 + max: '9.674e-02' + mean: '-1.617e-02' + min: '-7.123e-02' + shape: + - 4096 + sum: '-6.623e+01' +network.model.decoder.layers.19.fc1.weight: + device: cuda:0 + max: '1.276e-01' + mean: '-1.816e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-7.616e+02' +network.model.decoder.layers.19.fc2.bias: + device: cuda:0 + max: '6.439e-02' + mean: '-2.292e-04' + min: '-7.587e-02' + shape: + - 1024 + sum: '-2.347e-01' +network.model.decoder.layers.19.fc2.weight: + device: cuda:0 + max: '1.273e-01' + mean: '6.639e-06' + min: '-1.782e-01' + shape: + - 1024 + - 4096 + sum: '2.785e+01' +network.model.decoder.layers.19.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-9.252e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-9.474e+00' +network.model.decoder.layers.19.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.19.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: 
'7.829e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.017e+00' +network.model.decoder.layers.19.self_attn.k_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-2.187e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '-2.294e+01' +network.model.decoder.layers.19.self_attn.out_proj.bias: + device: cuda:0 + max: '6.445e-02' + mean: '2.324e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.380e-01' +network.model.decoder.layers.19.self_attn.out_proj.weight: + device: cuda:0 + max: '1.454e-01' + mean: '-5.801e-08' + min: '-1.431e-01' + shape: + - 1024 + - 1024 + sum: '-6.083e-02' +network.model.decoder.layers.19.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '-2.284e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.338e+00' +network.model.decoder.layers.19.self_attn.q_proj.weight: + device: cuda:0 + max: '1.276e-01' + mean: '8.971e-05' + min: '-1.281e-01' + shape: + - 1024 + - 1024 + sum: '9.406e+01' +network.model.decoder.layers.19.self_attn.v_proj.bias: + device: cuda:0 + max: '4.413e-02' + mean: '-1.693e-04' + min: '-4.315e-02' + shape: + - 1024 + sum: '-1.733e-01' +network.model.decoder.layers.19.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '-6.37e-05' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-6.679e+01' +network.model.decoder.layers.19.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.325e-03' + min: '-1.936e-01' + shape: + - 1024 + sum: '3.405e+00' +network.model.decoder.layers.19.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.2.fc1.bias: + device: cuda:0 + max: '7.135e-02' + mean: '-2.341e-02' + min: '-6.665e-02' + shape: + - 4096 + sum: '-9.591e+01' +network.model.decoder.layers.2.fc1.weight: + device: cuda:0 + max: '1.25e-01' + mean: '2.334e-04' + min: '-1.255e-01' + shape: + - 4096 + - 1024 + sum: '9.791e+02' +network.model.decoder.layers.2.fc2.bias: + device: cuda:0 + max: '7.172e-02' + mean: '3.129e-04' + min: '-7.66e-02' + shape: + - 1024 + sum: '3.204e-01' +network.model.decoder.layers.2.fc2.weight: + device: cuda:0 + max: '1.294e-01' + mean: '-1.695e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-7.109e+00' +network.model.decoder.layers.2.final_layer_norm.bias: + device: cuda:0 + max: '1.257e-01' + mean: '9.144e-03' + min: '-1.251e-01' + shape: + - 1024 + sum: '9.364e+00' +network.model.decoder.layers.2.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.2.self_attn.k_proj.bias: + device: cuda:0 + max: '6.384e-02' + mean: '8.869e-03' + min: '-6.445e-02' + shape: + - 1024 + sum: '9.082e+00' +network.model.decoder.layers.2.self_attn.k_proj.weight: + device: cuda:0 + max: '1.292e-01' + mean: '2.489e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '2.61e+01' +network.model.decoder.layers.2.self_attn.out_proj.bias: + device: cuda:0 + max: '1.234e-01' + mean: '3.411e-04' + min: '-8.948e-02' + shape: + - 1024 + sum: '3.493e-01' +network.model.decoder.layers.2.self_attn.out_proj.weight: + device: cuda:0 + max: '1.317e-01' + mean: '-6.495e-06' + min: '-1.283e-01' + shape: + - 1024 + - 1024 + sum: '-6.811e+00' +network.model.decoder.layers.2.self_attn.q_proj.bias: + device: cuda:0 + max: '1.249e-01' + mean: '9.792e-04' + min: '-1.255e-01' + shape: + - 1024 + sum: '1.003e+00' +network.model.decoder.layers.2.self_attn.q_proj.weight: + device: cuda:0 
+ max: '1.257e-01' + mean: '1.202e-05' + min: '-1.271e-01' + shape: + - 1024 + - 1024 + sum: '1.260e+01' +network.model.decoder.layers.2.self_attn.v_proj.bias: + device: cuda:0 + max: '4.211e-02' + mean: '-9.478e-05' + min: '-3.799e-02' + shape: + - 1024 + sum: '-9.706e-02' +network.model.decoder.layers.2.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '3.971e-05' + min: '-1.171e-01' + shape: + - 1024 + - 1024 + sum: '4.164e+01' +network.model.decoder.layers.2.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.309e-01' + mean: '-1.911e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.957e+00' +network.model.decoder.layers.2.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.20.fc1.bias: + device: cuda:0 + max: '7.928e-02' + mean: '-1.524e-02' + min: '-7.220e-02' + shape: + - 4096 + sum: '-6.244e+01' +network.model.decoder.layers.20.fc1.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.853e-04' + min: '-1.271e-01' + shape: + - 4096 + - 1024 + sum: '-7.770e+02' +network.model.decoder.layers.20.fc2.bias: + device: cuda:0 + max: '6.787e-02' + mean: '-1.132e-04' + min: '-7.617e-02' + shape: + - 1024 + sum: '-1.159e-01' +network.model.decoder.layers.20.fc2.weight: + device: cuda:0 + max: '1.27e-01' + mean: '6.366e-06' + min: '-2.393e-01' + shape: + - 1024 + - 4096 + sum: '2.670e+01' +network.model.decoder.layers.20.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-9.149e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-9.369e+00' +network.model.decoder.layers.20.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.20.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.126e-02' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.153e+01' +network.model.decoder.layers.20.self_attn.k_proj.weight: + device: cuda:0 + max: '1.356e-01' + mean: '4.825e-05' + min: '-1.333e-01' + shape: + - 1024 + - 1024 + sum: '5.059e+01' +network.model.decoder.layers.20.self_attn.out_proj.bias: + device: cuda:0 + max: '6.512e-02' + mean: '-8.754e-05' + min: '-1.215e-01' + shape: + - 1024 + sum: '-8.964e-02' +network.model.decoder.layers.20.self_attn.out_proj.weight: + device: cuda:0 + max: '1.334e-01' + mean: '8.321e-06' + min: '-1.311e-01' + shape: + - 1024 + - 1024 + sum: '8.725e+00' +network.model.decoder.layers.20.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '-2.386e-03' + min: '-1.256e-01' + shape: + - 1024 + sum: '-2.444e+00' +network.model.decoder.layers.20.self_attn.q_proj.weight: + device: cuda:0 + max: '1.278e-01' + mean: '1.178e-07' + min: '-1.279e-01' + shape: + - 1024 + - 1024 + sum: '1.235e-01' +network.model.decoder.layers.20.self_attn.v_proj.bias: + device: cuda:0 + max: '4.395e-02' + mean: '-3.544e-04' + min: '-4.248e-02' + shape: + - 1024 + sum: '-3.629e-01' +network.model.decoder.layers.20.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '1.676e-06' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '1.757e+00' +network.model.decoder.layers.20.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.003e-03' + min: '-1.256e-01' + shape: + - 1024 + sum: '3.075e+00' +network.model.decoder.layers.20.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' 
+network.model.decoder.layers.21.fc1.bias: + device: cuda:0 + max: '8.362e-02' + mean: '-1.634e-02' + min: '-9.613e-02' + shape: + - 4096 + sum: '-6.693e+01' +network.model.decoder.layers.21.fc1.weight: + device: cuda:0 + max: '1.289e-01' + mean: '-1.814e-04' + min: '-1.299e-01' + shape: + - 4096 + - 1024 + sum: '-7.611e+02' +network.model.decoder.layers.21.fc2.bias: + device: cuda:0 + max: '9.045e-02' + mean: '5.474e-05' + min: '-7.306e-02' + shape: + - 1024 + sum: '5.605e-02' +network.model.decoder.layers.21.fc2.weight: + device: cuda:0 + max: '1.322e-01' + mean: '3.575e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '1.499e+00' +network.model.decoder.layers.21.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-5.773e-03' + min: '-1.249e-01' + shape: + - 1024 + sum: '-5.912e+00' +network.model.decoder.layers.21.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.21.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '9.81e-03' + min: '-1.318e-01' + shape: + - 1024 + sum: '1.005e+01' +network.model.decoder.layers.21.self_attn.k_proj.weight: + device: cuda:0 + max: '1.425e-01' + mean: '-2.337e-05' + min: '-1.454e-01' + shape: + - 1024 + - 1024 + sum: '-2.450e+01' +network.model.decoder.layers.21.self_attn.out_proj.bias: + device: cuda:0 + max: '7.263e-02' + mean: '-6.624e-05' + min: '-9.937e-02' + shape: + - 1024 + sum: '-6.783e-02' +network.model.decoder.layers.21.self_attn.out_proj.weight: + device: cuda:0 + max: '1.294e-01' + mean: '1.762e-06' + min: '-1.285e-01' + shape: + - 1024 + - 1024 + sum: '1.847e+00' +network.model.decoder.layers.21.self_attn.q_proj.bias: + device: cuda:0 + max: '1.257e-01' + mean: '-1.89e-03' + min: '-1.257e-01' + shape: + - 1024 + sum: '-1.935e+00' +network.model.decoder.layers.21.self_attn.q_proj.weight: + device: cuda:0 + max: '1.327e-01' + mean: '-1.882e-05' + min: '-1.31e-01' + shape: + - 1024 + - 1024 + sum: '-1.974e+01' +network.model.decoder.layers.21.self_attn.v_proj.bias: + device: cuda:0 + max: '4.669e-02' + mean: '-2.74e-04' + min: '-4.211e-02' + shape: + - 1024 + sum: '-2.806e-01' +network.model.decoder.layers.21.self_attn.v_proj.weight: + device: cuda:0 + max: '1.25e-01' + mean: '-7.892e-05' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-8.276e+01' +network.model.decoder.layers.21.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.155e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.231e+00' +network.model.decoder.layers.21.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.22.fc1.bias: + device: cuda:0 + max: '1.251e-01' + mean: '-1.548e-02' + min: '-1.254e-01' + shape: + - 4096 + sum: '-6.341e+01' +network.model.decoder.layers.22.fc1.weight: + device: cuda:0 + max: '1.278e-01' + mean: '-1.567e-04' + min: '-1.277e-01' + shape: + - 4096 + - 1024 + sum: '-6.574e+02' +network.model.decoder.layers.22.fc2.bias: + device: cuda:0 + max: '7.642e-02' + mean: '1.103e-04' + min: '-7.037e-02' + shape: + - 1024 + sum: '1.13e-01' +network.model.decoder.layers.22.fc2.weight: + device: cuda:0 + max: '1.279e-01' + mean: '1.737e-06' + min: '-1.288e-01' + shape: + - 1024 + - 4096 + sum: '7.287e+00' +network.model.decoder.layers.22.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-4.785e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-4.9e+00' 
+network.model.decoder.layers.22.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.22.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '6.801e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '6.964e+00' +network.model.decoder.layers.22.self_attn.k_proj.weight: + device: cuda:0 + max: '1.401e-01' + mean: '-8.573e-06' + min: '-1.409e-01' + shape: + - 1024 + - 1024 + sum: '-8.99e+00' +network.model.decoder.layers.22.self_attn.out_proj.bias: + device: cuda:0 + max: '7.709e-02' + mean: '-1.158e-05' + min: '-8.099e-02' + shape: + - 1024 + sum: '-1.186e-02' +network.model.decoder.layers.22.self_attn.out_proj.weight: + device: cuda:0 + max: '1.302e-01' + mean: '-1.088e-06' + min: '-1.293e-01' + shape: + - 1024 + - 1024 + sum: '-1.141e+00' +network.model.decoder.layers.22.self_attn.q_proj.bias: + device: cuda:0 + max: '1.013e-01' + mean: '-1.666e-03' + min: '-1.021e-01' + shape: + - 1024 + sum: '-1.706e+00' +network.model.decoder.layers.22.self_attn.q_proj.weight: + device: cuda:0 + max: '1.331e-01' + mean: '-2.958e-05' + min: '-1.338e-01' + shape: + - 1024 + - 1024 + sum: '-3.102e+01' +network.model.decoder.layers.22.self_attn.v_proj.bias: + device: cuda:0 + max: '4.211e-02' + mean: '5.506e-04' + min: '-4.501e-02' + shape: + - 1024 + sum: '5.638e-01' +network.model.decoder.layers.22.self_attn.v_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-2.981e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-3.125e+01' +network.model.decoder.layers.22.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '7.961e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.152e-01' +network.model.decoder.layers.22.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.23.fc1.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.694e-03' + min: '-1.278e-01' + shape: + - 4096 + sum: '1.103e+01' +network.model.decoder.layers.23.fc1.weight: + device: cuda:0 + max: '2.107e-01' + mean: '8.400e-05' + min: '-2.146e-01' + shape: + - 4096 + - 1024 + sum: '3.523e+02' +network.model.decoder.layers.23.fc2.bias: + device: cuda:0 + max: '6.299e-02' + mean: '1.316e-03' + min: '-6.311e-02' + shape: + - 1024 + sum: '1.348e+00' +network.model.decoder.layers.23.fc2.weight: + device: cuda:0 + max: '2.5e-01' + mean: '1.024e-05' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '4.294e+01' +network.model.decoder.layers.23.final_layer_norm.bias: + device: cuda:0 + max: '7.251e-02' + mean: '9.345e-03' + min: '-7.196e-02' + shape: + - 1024 + sum: '9.57e+00' +network.model.decoder.layers.23.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.23.self_attn.k_proj.bias: + device: cuda:0 + max: '2.219e-01' + mean: '3.647e-03' + min: '-1.824e-01' + shape: + - 1024 + sum: '3.734e+00' +network.model.decoder.layers.23.self_attn.k_proj.weight: + device: cuda:0 + max: '1.294e-01' + mean: '-1.63e-05' + min: '-1.304e-01' + shape: + - 1024 + - 1024 + sum: '-1.709e+01' +network.model.decoder.layers.23.self_attn.out_proj.bias: + device: cuda:0 + max: '7.605e-02' + mean: '-1.183e-04' + min: '-6.47e-02' + shape: + - 1024 + sum: '-1.212e-01' +network.model.decoder.layers.23.self_attn.out_proj.weight: + device: cuda:0 + max: '2.5e-01' + mean: '-1.078e-05' + min: '-2.5e-01' + shape: + - 1024 + - 
1024 + sum: '-1.130e+01' +network.model.decoder.layers.23.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-2.744e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.809e-01' +network.model.decoder.layers.23.self_attn.q_proj.weight: + device: cuda:0 + max: '1.338e-01' + mean: '2.096e-05' + min: '-1.337e-01' + shape: + - 1024 + - 1024 + sum: '2.197e+01' +network.model.decoder.layers.23.self_attn.v_proj.bias: + device: cuda:0 + max: '4.068e-02' + mean: '2.158e-05' + min: '-4.48e-02' + shape: + - 1024 + sum: '2.210e-02' +network.model.decoder.layers.23.self_attn.v_proj.weight: + device: cuda:0 + max: '1.267e-01' + mean: '6.273e-05' + min: '-1.256e-01' + shape: + - 1024 + - 1024 + sum: '6.577e+01' +network.model.decoder.layers.23.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.700e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.741e+00' +network.model.decoder.layers.23.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.3.fc1.bias: + device: cuda:0 + max: '8.453e-02' + mean: '-2.474e-02' + min: '-1.194e-01' + shape: + - 4096 + sum: '-1.013e+02' +network.model.decoder.layers.3.fc1.weight: + device: cuda:0 + max: '1.251e-01' + mean: '1.348e-04' + min: '-1.252e-01' + shape: + - 4096 + - 1024 + sum: '5.654e+02' +network.model.decoder.layers.3.fc2.bias: + device: cuda:0 + max: '7.086e-02' + mean: '1.769e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.811e-01' +network.model.decoder.layers.3.fc2.weight: + device: cuda:0 + max: '1.276e-01' + mean: '1.857e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '7.790e+00' +network.model.decoder.layers.3.final_layer_norm.bias: + device: cuda:0 + max: '1.254e-01' + mean: '6.555e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '6.712e+00' +network.model.decoder.layers.3.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.3.self_attn.k_proj.bias: + device: cuda:0 + max: '6.372e-02' + mean: '8.278e-03' + min: '-3.555e-02' + shape: + - 1024 + sum: '8.477e+00' +network.model.decoder.layers.3.self_attn.k_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.901e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-1.993e+01' +network.model.decoder.layers.3.self_attn.out_proj.bias: + device: cuda:0 + max: '1.240e-01' + mean: '1.084e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.11e-01' +network.model.decoder.layers.3.self_attn.out_proj.weight: + device: cuda:0 + max: '1.764e-01' + mean: '-1.601e-06' + min: '-1.614e-01' + shape: + - 1024 + - 1024 + sum: '-1.679e+00' +network.model.decoder.layers.3.self_attn.q_proj.bias: + device: cuda:0 + max: '1.248e-01' + mean: '-2.804e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.871e-01' +network.model.decoder.layers.3.self_attn.q_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.642e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-1.721e+01' +network.model.decoder.layers.3.self_attn.v_proj.bias: + device: cuda:0 + max: '3.882e-02' + mean: '-9.93e-04' + min: '-4.312e-02' + shape: + - 1024 + sum: '-1.017e+00' +network.model.decoder.layers.3.self_attn.v_proj.weight: + device: cuda:0 + max: '1.216e-01' + mean: '-9.011e-05' + min: '-1.204e-01' + shape: + - 1024 + - 1024 + sum: '-9.449e+01' +network.model.decoder.layers.3.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.290e-01' + mean: '-4.648e-04' + min: 
'-1.259e-01' + shape: + - 1024 + sum: '-4.76e-01' +network.model.decoder.layers.3.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.4.fc1.bias: + device: cuda:0 + max: '7.648e-02' + mean: '-2.333e-02' + min: '-1.11e-01' + shape: + - 4096 + sum: '-9.556e+01' +network.model.decoder.layers.4.fc1.weight: + device: cuda:0 + max: '1.252e-01' + mean: '7.858e-05' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '3.296e+02' +network.model.decoder.layers.4.fc2.bias: + device: cuda:0 + max: '6.671e-02' + mean: '6.644e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '6.803e-01' +network.model.decoder.layers.4.fc2.weight: + device: cuda:0 + max: '1.281e-01' + mean: '2.081e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '8.729e+00' +network.model.decoder.layers.4.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.551e-03' + min: '-1.259e-01' + shape: + - 1024 + sum: '2.613e+00' +network.model.decoder.layers.4.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.4.self_attn.k_proj.bias: + device: cuda:0 + max: '6.433e-02' + mean: '9.123e-03' + min: '-6.219e-02' + shape: + - 1024 + sum: '9.342e+00' +network.model.decoder.layers.4.self_attn.k_proj.weight: + device: cuda:0 + max: '1.298e-01' + mean: '3.159e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '3.312e+01' +network.model.decoder.layers.4.self_attn.out_proj.bias: + device: cuda:0 + max: '1.113e-01' + mean: '3.284e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.363e-01' +network.model.decoder.layers.4.self_attn.out_proj.weight: + device: cuda:0 + max: '1.307e-01' + mean: '5.154e-06' + min: '-1.296e-01' + shape: + - 1024 + - 1024 + sum: '5.404e+00' +network.model.decoder.layers.4.self_attn.q_proj.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.442e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.477e+00' +network.model.decoder.layers.4.self_attn.q_proj.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.649e-06' + min: '-1.267e-01' + shape: + - 1024 + - 1024 + sum: '-1.729e+00' +network.model.decoder.layers.4.self_attn.v_proj.bias: + device: cuda:0 + max: '3.711e-02' + mean: '1.497e-04' + min: '-3.909e-02' + shape: + - 1024 + sum: '1.533e-01' +network.model.decoder.layers.4.self_attn.v_proj.weight: + device: cuda:0 + max: '1.139e-01' + mean: '6.411e-05' + min: '-1.227e-01' + shape: + - 1024 + - 1024 + sum: '6.722e+01' +network.model.decoder.layers.4.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.271e-01' + mean: '1.923e-04' + min: '-1.272e-01' + shape: + - 1024 + sum: '1.969e-01' +network.model.decoder.layers.4.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.5.fc1.bias: + device: cuda:0 + max: '9.772e-02' + mean: '-2.182e-02' + min: '-1.219e-01' + shape: + - 4096 + sum: '-8.94e+01' +network.model.decoder.layers.5.fc1.weight: + device: cuda:0 + max: '1.257e-01' + mean: '1.105e-04' + min: '-1.254e-01' + shape: + - 4096 + - 1024 + sum: '4.637e+02' +network.model.decoder.layers.5.fc2.bias: + device: cuda:0 + max: '6.384e-02' + mean: '9.162e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '9.382e-02' +network.model.decoder.layers.5.fc2.weight: + device: cuda:0 + max: '1.262e-01' + mean: '4.982e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '2.089e+00' 
+network.model.decoder.layers.5.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '4.158e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.258e-01' +network.model.decoder.layers.5.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.5.self_attn.k_proj.bias: + device: cuda:0 + max: '7.245e-02' + mean: '1.13e-02' + min: '-5.319e-02' + shape: + - 1024 + sum: '1.157e+01' +network.model.decoder.layers.5.self_attn.k_proj.weight: + device: cuda:0 + max: '1.263e-01' + mean: '-5.184e-05' + min: '-1.263e-01' + shape: + - 1024 + - 1024 + sum: '-5.436e+01' +network.model.decoder.layers.5.self_attn.out_proj.bias: + device: cuda:0 + max: '1.068e-01' + mean: '2.054e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.103e-01' +network.model.decoder.layers.5.self_attn.out_proj.weight: + device: cuda:0 + max: '1.582e-01' + mean: '2.069e-05' + min: '-1.821e-01' + shape: + - 1024 + - 1024 + sum: '2.169e+01' +network.model.decoder.layers.5.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-6.643e-04' + min: '-1.254e-01' + shape: + - 1024 + sum: '-6.802e-01' +network.model.decoder.layers.5.self_attn.q_proj.weight: + device: cuda:0 + max: '1.261e-01' + mean: '1.035e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '1.086e+01' +network.model.decoder.layers.5.self_attn.v_proj.bias: + device: cuda:0 + max: '4.800e-02' + mean: '5.821e-04' + min: '-4.202e-02' + shape: + - 1024 + sum: '5.960e-01' +network.model.decoder.layers.5.self_attn.v_proj.weight: + device: cuda:0 + max: '1.182e-01' + mean: '1.019e-05' + min: '-1.202e-01' + shape: + - 1024 + - 1024 + sum: '1.068e+01' +network.model.decoder.layers.5.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.263e-01' + mean: '-4.794e-04' + min: '-1.257e-01' + shape: + - 1024 + sum: '-4.909e-01' +network.model.decoder.layers.5.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.6.fc1.bias: + device: cuda:0 + max: '1.191e-01' + mean: '-2.029e-02' + min: '-9.454e-02' + shape: + - 4096 + sum: '-8.312e+01' +network.model.decoder.layers.6.fc1.weight: + device: cuda:0 + max: '1.282e-01' + mean: '1.416e-04' + min: '-1.27e-01' + shape: + - 4096 + - 1024 + sum: '5.939e+02' +network.model.decoder.layers.6.fc2.bias: + device: cuda:0 + max: '6.439e-02' + mean: '-1.532e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.569e-01' +network.model.decoder.layers.6.fc2.weight: + device: cuda:0 + max: '1.343e-01' + mean: '-3.220e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.351e+00' +network.model.decoder.layers.6.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.357e-04' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.389e-01' +network.model.decoder.layers.6.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.6.self_attn.k_proj.bias: + device: cuda:0 + max: '8.856e-02' + mean: '1.296e-02' + min: '-6.641e-02' + shape: + - 1024 + sum: '1.327e+01' +network.model.decoder.layers.6.self_attn.k_proj.weight: + device: cuda:0 + max: '1.300e-01' + mean: '1.62e-05' + min: '-1.300e-01' + shape: + - 1024 + - 1024 + sum: '1.698e+01' +network.model.decoder.layers.6.self_attn.out_proj.bias: + device: cuda:0 + max: '6.47e-02' + mean: '-1.618e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.657e-01' 
+network.model.decoder.layers.6.self_attn.out_proj.weight: + device: cuda:0 + max: '1.340e-01' + mean: '9.419e-06' + min: '-1.305e-01' + shape: + - 1024 + - 1024 + sum: '9.877e+00' +network.model.decoder.layers.6.self_attn.q_proj.bias: + device: cuda:0 + max: '1.256e-01' + mean: '2.037e-03' + min: '-1.257e-01' + shape: + - 1024 + sum: '2.086e+00' +network.model.decoder.layers.6.self_attn.q_proj.weight: + device: cuda:0 + max: '1.272e-01' + mean: '4.741e-06' + min: '-1.276e-01' + shape: + - 1024 + - 1024 + sum: '4.972e+00' +network.model.decoder.layers.6.self_attn.v_proj.bias: + device: cuda:0 + max: '4.633e-02' + mean: '3.225e-05' + min: '-4.407e-02' + shape: + - 1024 + sum: '3.303e-02' +network.model.decoder.layers.6.self_attn.v_proj.weight: + device: cuda:0 + max: '1.147e-01' + mean: '4.657e-05' + min: '-1.19e-01' + shape: + - 1024 + - 1024 + sum: '4.883e+01' +network.model.decoder.layers.6.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.389e-06' + min: '-1.257e-01' + shape: + - 1024 + sum: '-1.423e-03' +network.model.decoder.layers.6.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.7.fc1.bias: + device: cuda:0 + max: '1.077e-01' + mean: '-2.155e-02' + min: '-1.226e-01' + shape: + - 4096 + sum: '-8.828e+01' +network.model.decoder.layers.7.fc1.weight: + device: cuda:0 + max: '1.284e-01' + mean: '1.858e-04' + min: '-1.311e-01' + shape: + - 4096 + - 1024 + sum: '7.793e+02' +network.model.decoder.layers.7.fc2.bias: + device: cuda:0 + max: '6.897e-02' + mean: '4.677e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.789e-02' +network.model.decoder.layers.7.fc2.weight: + device: cuda:0 + max: '1.459e-01' + mean: '-4.578e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.92e+00' +network.model.decoder.layers.7.final_layer_norm.bias: + device: cuda:0 + max: '1.093e-01' + mean: '-1.554e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.591e+00' +network.model.decoder.layers.7.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.7.self_attn.k_proj.bias: + device: cuda:0 + max: '1.021e-01' + mean: '1.303e-02' + min: '-6.25e-02' + shape: + - 1024 + sum: '1.334e+01' +network.model.decoder.layers.7.self_attn.k_proj.weight: + device: cuda:0 + max: '1.323e-01' + mean: '1.285e-05' + min: '-1.333e-01' + shape: + - 1024 + - 1024 + sum: '1.348e+01' +network.model.decoder.layers.7.self_attn.out_proj.bias: + device: cuda:0 + max: '5.948e-02' + mean: '2.333e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.389e-01' +network.model.decoder.layers.7.self_attn.out_proj.weight: + device: cuda:0 + max: '1.316e-01' + mean: '-1.173e-06' + min: '-1.301e-01' + shape: + - 1024 + - 1024 + sum: '-1.230e+00' +network.model.decoder.layers.7.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '3.876e-03' + min: '-1.261e-01' + shape: + - 1024 + sum: '3.969e+00' +network.model.decoder.layers.7.self_attn.q_proj.weight: + device: cuda:0 + max: '1.272e-01' + mean: '-3.278e-06' + min: '-1.292e-01' + shape: + - 1024 + - 1024 + sum: '-3.437e+00' +network.model.decoder.layers.7.self_attn.v_proj.bias: + device: cuda:0 + max: '4.297e-02' + mean: '4.138e-04' + min: '-4.077e-02' + shape: + - 1024 + sum: '4.237e-01' +network.model.decoder.layers.7.self_attn.v_proj.weight: + device: cuda:0 + max: '1.183e-01' + mean: '-3.309e-05' + min: '-1.174e-01' + shape: + - 1024 + - 
1024 + sum: '-3.47e+01' +network.model.decoder.layers.7.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.830e-04' + min: '-1.267e-01' + shape: + - 1024 + sum: '1.874e-01' +network.model.decoder.layers.7.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.8.fc1.bias: + device: cuda:0 + max: '6.335e-02' + mean: '-2.258e-02' + min: '-1.26e-01' + shape: + - 4096 + sum: '-9.249e+01' +network.model.decoder.layers.8.fc1.weight: + device: cuda:0 + max: '1.278e-01' + mean: '5.06e-05' + min: '-1.271e-01' + shape: + - 4096 + - 1024 + sum: '2.122e+02' +network.model.decoder.layers.8.fc2.bias: + device: cuda:0 + max: '6.818e-02' + mean: '-1.369e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.402e-01' +network.model.decoder.layers.8.fc2.weight: + device: cuda:0 + max: '1.392e-01' + mean: '-4.149e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.740e+01' +network.model.decoder.layers.8.final_layer_norm.bias: + device: cuda:0 + max: '6.47e-02' + mean: '-3.244e-03' + min: '-1.252e-01' + shape: + - 1024 + sum: '-3.322e+00' +network.model.decoder.layers.8.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.8.self_attn.k_proj.bias: + device: cuda:0 + max: '9.65e-02' + mean: '1.109e-02' + min: '-6.247e-02' + shape: + - 1024 + sum: '1.136e+01' +network.model.decoder.layers.8.self_attn.k_proj.weight: + device: cuda:0 + max: '1.318e-01' + mean: '8.991e-06' + min: '-1.32e-01' + shape: + - 1024 + - 1024 + sum: '9.428e+00' +network.model.decoder.layers.8.self_attn.out_proj.bias: + device: cuda:0 + max: '6.317e-02' + mean: '-7.463e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-7.643e-02' +network.model.decoder.layers.8.self_attn.out_proj.weight: + device: cuda:0 + max: '1.306e-01' + mean: '6.679e-06' + min: '-1.327e-01' + shape: + - 1024 + - 1024 + sum: '7.003e+00' +network.model.decoder.layers.8.self_attn.q_proj.bias: + device: cuda:0 + max: '1.256e-01' + mean: '1.131e-05' + min: '-1.257e-01' + shape: + - 1024 + sum: '1.159e-02' +network.model.decoder.layers.8.self_attn.q_proj.weight: + device: cuda:0 + max: '1.311e-01' + mean: '-4.181e-07' + min: '-1.293e-01' + shape: + - 1024 + - 1024 + sum: '-4.384e-01' +network.model.decoder.layers.8.self_attn.v_proj.bias: + device: cuda:0 + max: '4.486e-02' + mean: '5.294e-04' + min: '-4.657e-02' + shape: + - 1024 + sum: '5.421e-01' +network.model.decoder.layers.8.self_attn.v_proj.weight: + device: cuda:0 + max: '1.242e-01' + mean: '1.489e-05' + min: '-1.243e-01' + shape: + - 1024 + - 1024 + sum: '1.561e+01' +network.model.decoder.layers.8.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.027e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '1.052e+00' +network.model.decoder.layers.8.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.9.fc1.bias: + device: cuda:0 + max: '7.355e-02' + mean: '-2.086e-02' + min: '-8.301e-02' + shape: + - 4096 + sum: '-8.545e+01' +network.model.decoder.layers.9.fc1.weight: + device: cuda:0 + max: '1.256e-01' + mean: '2.51e-05' + min: '-1.265e-01' + shape: + - 4096 + - 1024 + sum: '1.053e+02' +network.model.decoder.layers.9.fc2.bias: + device: cuda:0 + max: '6.647e-02' + mean: '2.622e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.685e-01' 
+network.model.decoder.layers.9.fc2.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-3.312e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.389e+01' +network.model.decoder.layers.9.final_layer_norm.bias: + device: cuda:0 + max: '7.349e-02' + mean: '-8.035e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.227e+00' +network.model.decoder.layers.9.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.9.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '8.960e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '9.175e+00' +network.model.decoder.layers.9.self_attn.k_proj.weight: + device: cuda:0 + max: '1.346e-01' + mean: '4.302e-05' + min: '-1.346e-01' + shape: + - 1024 + - 1024 + sum: '4.511e+01' +network.model.decoder.layers.9.self_attn.out_proj.bias: + device: cuda:0 + max: '6.616e-02' + mean: '-8.681e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.89e-02' +network.model.decoder.layers.9.self_attn.out_proj.weight: + device: cuda:0 + max: '1.497e-01' + mean: '-7.002e-06' + min: '-1.382e-01' + shape: + - 1024 + - 1024 + sum: '-7.342e+00' +network.model.decoder.layers.9.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.336e-03' + min: '-1.208e-01' + shape: + - 1024 + sum: '2.392e+00' +network.model.decoder.layers.9.self_attn.q_proj.weight: + device: cuda:0 + max: '1.344e-01' + mean: '-1.583e-05' + min: '-1.379e-01' + shape: + - 1024 + - 1024 + sum: '-1.66e+01' +network.model.decoder.layers.9.self_attn.v_proj.bias: + device: cuda:0 + max: '6.241e-02' + mean: '2.777e-04' + min: '-6.464e-02' + shape: + - 1024 + sum: '2.844e-01' +network.model.decoder.layers.9.self_attn.v_proj.weight: + device: cuda:0 + max: '1.131e-01' + mean: '-2.935e-05' + min: '-1.183e-01' + shape: + - 1024 + - 1024 + sum: '-3.077e+01' +network.model.decoder.layers.9.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.812e-02' + mean: '9.632e-04' + min: '-1.255e-01' + shape: + - 1024 + sum: '9.864e-01' +network.model.decoder.layers.9.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.project_in.weight: + device: cuda:0 + max: '1.305e-01' + mean: '3.482e-05' + min: '-1.318e-01' + shape: + - 1024 + - 512 + sum: '1.826e+01' +network.model.decoder.project_out.weight: + device: cuda:0 + max: '1.373e-01' + mean: '8.706e-05' + min: '-1.376e-01' + shape: + - 512 + - 1024 + sum: '4.564e+01' diff --git a/.regression_files/project/algorithms/llm_finetuning_test/test_training_batch_doesnt_change/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_training_batch_doesnt_change/llm_finetuning.yaml new file mode 100644 index 00000000..84eb1516 --- /dev/null +++ b/.regression_files/project/algorithms/llm_finetuning_test/test_training_batch_doesnt_change/llm_finetuning.yaml @@ -0,0 +1,27 @@ +attention_mask: + device: cuda:0 + max: 1 + mean: '1.e+00' + min: 1 + shape: + - 8 + - 256 + sum: 2048 +input_ids: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +labels: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_test.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_test.yaml deleted file mode 
100644 index 37d8958b..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_test.yaml +++ /dev/null @@ -1,35 +0,0 @@ -attention_mask: - device: cpu - max: 1 - mean: '1.021e-01' - min: 0 - shape: - - 32 - - 128 - sum: 418 -input_ids: - device: cpu - max: 29043 - mean: '1.648e+02' - min: 0 - shape: - - 32 - - 128 - sum: 675172 -labels: - device: cpu - max: -1 - mean: '-1.e+00' - min: -1 - shape: - - 32 - sum: -32 -token_type_ids: - device: cpu - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_train.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_train.yaml deleted file mode 100644 index 89d6925e..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_train.yaml +++ /dev/null @@ -1,35 +0,0 @@ -attention_mask: - device: cpu - max: 1 - mean: '8.374e-02' - min: 0 - shape: - - 32 - - 128 - sum: 343 -input_ids: - device: cpu - max: 26101 - mean: '1.597e+02' - min: 0 - shape: - - 32 - - 128 - sum: 654306 -labels: - device: cpu - max: 1 - mean: '7.188e-01' - min: 0 - shape: - - 32 - sum: 23 -token_type_ids: - device: cpu - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_validate.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_validate.yaml deleted file mode 100644 index ef5d1104..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_validate.yaml +++ /dev/null @@ -1,35 +0,0 @@ -attention_mask: - device: cpu - max: 1 - mean: '9.277e-02' - min: 0 - shape: - - 32 - - 128 - sum: 380 -input_ids: - device: cpu - max: 29043 - mean: '1.362e+02' - min: 0 - shape: - - 32 - - 128 - sum: 557879 -labels: - device: cpu - max: 1 - mean: '7.5e-01' - min: 0 - shape: - - 32 - sum: 24 -token_type_ids: - device: cpu - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_test.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_test.yaml deleted file mode 100644 index 8e49803a..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_test.yaml +++ /dev/null @@ -1,19 +0,0 @@ -'0': - device: cpu - max: '1.e+00' - mean: '4.611e-01' - min: '0.e+00' - shape: - - 64 - - 3 - - 32 - - 32 - sum: '9.065e+04' -'1': - device: cpu - max: 987 - mean: '5.432e+02' - min: 49 - shape: - - 64 - sum: 34767 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_train.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_train.yaml deleted file mode 100644 index 214d5795..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_train.yaml +++ /dev/null @@ -1,19 +0,0 @@ -'0': - device: cpu - max: '2.640e+00' - mean: '3.701e-03' - min: '-2.118e+00' - shape: - - 64 - - 3 - - 32 - - 32 - sum: '7.277e+02' -'1': - device: cpu - max: 993 - mean: '4.871e+02' - min: 1 - shape: - - 64 - sum: 31176 diff --git 
a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_validate.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_validate.yaml
deleted file mode 100644
index 2cf23250..00000000
--- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_validate.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-'0':
-  device: cpu
-  max: '1.e+00'
-  mean: '4.266e-01'
-  min: '0.e+00'
-  shape:
-  - 64
-  - 3
-  - 32
-  - 32
-  sum: '8.388e+04'
-'1':
-  device: cpu
-  max: 973
-  mean: '4.845e+02'
-  min: 21
-  shape:
-  - 64
-  sum: 31006
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index 5dba41f0..a65eb75e 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -7,11 +7,11 @@
 * [Thorough automated testing on SLURM clusters](features/testing.md)
 * features/*.md
 * [Examples 🧪](examples/index.md)
-    * [Image Classification (⚡)](examples/torch_sl_example.md)
-    * [Image Classification (jax+⚡)](examples/jax_sl_example.md)
+    * [Image Classification (⚡)](examples/image_classification.md)
+    * [Image Classification (jax+⚡)](examples/jax_image_classification.md)
     * [Text Classification (🤗+⚡)](examples/text_classification.md)
     * [Fine-tuning an LLM (🤗+⚡)](examples/llm_finetuning.md)
-    * [RL (jax)](examples/jax_rl_example.md)
+    * [Reinforcement Learning (jax)](examples/jax_rl.md)
     * [Running sweeps](examples/sweeps.md)
     * [Profiling your code📎](examples/profiling.md)
 * examples/*.md
diff --git a/docs/examples/image_classification.md b/docs/examples/image_classification.md
new file mode 100644
index 00000000..b8f83160
--- /dev/null
+++ b/docs/examples/image_classification.md
@@ -0,0 +1,29 @@
+---
+additional_python_references:
+  - project.algorithms.image_classifier
+  - lightning.pytorch.core.module
+---
+
+# Supervised Learning (PyTorch)
+
+
+## ImageClassifier
+
+The `ImageClassifier` is a simple `LightningModule` for image classification.
+It accepts a vision datamodule as input.
+
+??? note "Click to show the code of the ImageClassifier class."
+    {{ inline('project.algorithms.image_classifier.ImageClassifier', 4) }}
+
+## Running the example
+
+Here is a configuration file that you can use to launch a simple experiment:
+
+??? note "Click to show the yaml config file"
+    {{ inline('project/configs/experiment/example.yaml', 4) }}
+
+You can use it like so:
+
+```console
+python project/main.py experiment=example
+```
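As a rough illustration of the kind of `LightningModule` the new page describes: the sketch below is **not** the template's actual `ImageClassifier` (that code is pulled in by the `{{ inline(...) }}` macro above); the class name, toy network and hyperparameters are invented for the example.

```python
import torch
from torch import nn
from lightning.pytorch import LightningModule


class TinyImageClassifier(LightningModule):
    """Minimal sketch of an image-classification LightningModule (illustrative only)."""

    def __init__(self, num_classes: int = 10, lr: float = 1e-3):
        super().__init__()
        self.lr = lr
        # Toy network; the real example takes a configurable network/datamodule as input.
        self.network = nn.Sequential(
            nn.Flatten(), nn.LazyLinear(128), nn.ReLU(), nn.LazyLinear(num_classes)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.network(x)

    def training_step(self, batch, batch_idx: int):
        x, y = batch  # a vision datamodule yields (image, label) batches
        loss = nn.functional.cross_entropy(self(x), y)
        self.log("train/loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)
```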
diff --git a/docs/examples/index.md b/docs/examples/index.md
index 4278e2a4..91600c14 100644
--- a/docs/examples/index.md
+++ b/docs/examples/index.md
@@ -1,9 +1,9 @@
 ---
 additional_python_references:
-  - project.algorithms.jax_rl_example
-  - project.algorithms.example
-  - project.algorithms.jax_example
-  - project.algorithms.text_classification_example
+  - project.algorithms.jax_ppo
+  - project.algorithms.image_classifier
+  - project.algorithms.jax_image_classifier
+  - project.algorithms.text_classifier
   - project.algorithms.llm_finetuning
   - project.trainers.jax_trainer
 ---
@@ -12,10 +12,10 @@ additional_python_references:
 This template includes examples that use either Jax, PyTorch, or both!
 
-| Example link | Research Area | Reference link | Frameworks |
-| --------------------------------------------------- | ------------------------------------------ | --------------------------- | --------------- |
-| [ExampleAlgorithm](torch_sl_example.md) | Supervised Learning (image classification) | `ExampleAlgorithm` | Torch + ⚡ |
-| [JaxExample](jax_sl_example.md) | Supervised Learning (image classification) | `JaxExample` | Torch + Jax + ⚡ |
-| [TextClassificationExample](text_classification.md) | NLP (text classification) | `TextClassificationExample` | Torch + 🤗 + ⚡ |
-| [JaxRLExample](jax_rl_example.md) | RL | `JaxRLExample` | Jax |
-| [LLMFinetuningExample](llm_finetuning.md) | NLP (Causal language modeling) | `LLMFineTuningExample` | Torch + 🤗 + ⚡ |
+| Example link | Research Area | Reference link | Frameworks |
+| --------------------------------------------------------- | ------------------------------------------ | ---------------------- | --------------- |
+| [Image Classification](image_classification.md) | Supervised Learning (image classification) | `ImageClassifier` | Torch + ⚡ |
+| [Image Classification (Jax)](jax_image_classification.md) | Supervised Learning (image classification) | `JaxImageClassifier` | Torch + Jax + ⚡ |
+| [Text Classification](text_classification.md) | NLP (text classification) | `TextClassifier` | Torch + 🤗 + ⚡ |
+| [Reinforcement Learning (Jax)](jax_rl.md) | RL | `JaxRLExample` | Jax |
+| [LLM Fine-tuning](llm_finetuning.md) | NLP (Causal language modeling) | `LLMFineTuningExample` | Torch + 🤗 + ⚡ |
 
diff --git a/docs/examples/jax_sl_example.md b/docs/examples/jax_image_classification.md
similarity index 64%
rename from docs/examples/jax_sl_example.md
rename to docs/examples/jax_image_classification.md
index 1491f7b3..ee1ddc99 100644
--- a/docs/examples/jax_sl_example.md
+++ b/docs/examples/jax_image_classification.md
@@ -1,8 +1,14 @@
+---
+additional_python_references:
+  - project.algorithms.jax_image_classifier
+  - project.trainers.jax_trainer
+---
+
 # Jax + PyTorch-Lightning ⚡
 
-## `JaxExample`: a LightningModule that trains a Jax network
+## A LightningModule that trains a Jax network
 
-The [JaxExample][project.algorithms.jax_example.JaxExample] algorithm uses a network which is a [flax.linen.Module](https://flax.readthedocs.io/en/latest/).
+The `JaxImageClassifier` algorithm uses a network which is a [flax.linen.Module](https://flax.readthedocs.io/en/latest/).
 The network is wrapped with `torch_jax_interop.JaxFunction`, so that it can accept torch tensors as inputs, produce torch tensors as outputs, and the parameters are saved as `torch.nn.Parameter`s (which use the same underlying memory as the jax arrays).
 
 In this example, the loss function and optimizers are in PyTorch, while the network forward and backward passes are written in Jax.
@@ -16,24 +22,24 @@ pass uses Jax to calculate the gradients, and the weights are updated by a PyTor
 
 !!! question "What about end-to-end training in Jax?"
 
-    See the [Jax RL Example](../examples/jax_rl_example.md)! :smile:
+    See the [Jax RL Example](../examples/jax_rl.md)! :smile:
 
 ### Jax Network
 
-{{ inline('project.algorithms.jax_example.CNN') }}
+{{ inline('project.algorithms.jax_image_classifier.JaxCNN') }}
 
 ### Jax Algorithm
 
-{{ inline('project.algorithms.jax_example.JaxExample') }}
+{{ inline('project.algorithms.jax_image_classifier.JaxImageClassifier') }}
 
 ### Configs
 
-#### JaxExample algorithm config
+#### LightningModule config
 
-{{ inline('project/configs/algorithm/jax_example.yaml') }}
+{{ inline('project/configs/algorithm/jax_image_classifier.yaml') }}
 
 ## Running the example
 
 ```console
-$ python project/main.py algorithm=jax_example network=jax_cnn datamodule=cifar10
+$ python project/main.py algorithm=jax_image_classifier network=jax_cnn datamodule=cifar10
 ```
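The "same underlying memory" point above is the key trick: torch and jax can hand buffers back and forth without copying. Here is a minimal sketch of that idea using plain DLPack, i.e. standard torch/jax APIs rather than the template's `torch_jax_interop.JaxFunction` wrapper (which additionally takes care of parameters and autograd):

```python
import jax.dlpack
import jax.numpy as jnp
import torch
import torch.utils.dlpack

# A torch tensor (the same idea applies on GPU, where avoiding copies matters most).
x_torch = torch.arange(4, dtype=torch.float32)

# View the torch tensor as a jax array via DLPack, without copying the buffer.
x_jax = jax.dlpack.from_dlpack(torch.utils.dlpack.to_dlpack(x_torch))

# Compute in jax...
y_jax = jnp.square(x_jax)

# ...and view the result back as a torch tensor.
y_torch = torch.utils.dlpack.from_dlpack(jax.dlpack.to_dlpack(y_jax))
print(y_torch)  # tensor([0., 1., 4., 9.])
```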
diff --git a/docs/examples/jax_rl_example.md b/docs/examples/jax_rl.md
similarity index 92%
rename from docs/examples/jax_rl_example.md
rename to docs/examples/jax_rl.md
index e41e6269..ac20b0d5 100644
--- a/docs/examples/jax_rl_example.md
+++ b/docs/examples/jax_rl.md
@@ -1,6 +1,6 @@
 ---
 additional_python_references:
-  - project.algorithms.jax_rl_example
+  - project.algorithms.jax_ppo
   - project.trainers.jax_trainer
 ---
 
@@ -31,7 +31,7 @@ It follows the structure of a `JaxModule`, and is trained with a `JaxTrainer`.
 
 ??? note "Click to show the code for JaxRLExample"
 
-    {{ inline('project.algorithms.jax_rl_example.JaxRLExample', 4) }}
+    {{ inline('project.algorithms.jax_ppo.JaxRLExample', 4) }}
 
 ## JaxModule
 
diff --git a/docs/examples/llm_finetuning.md b/docs/examples/llm_finetuning.md
index 0a3d07de..908a7eb7 100644
--- a/docs/examples/llm_finetuning.md
+++ b/docs/examples/llm_finetuning.md
@@ -7,6 +7,7 @@ additional_python_references:
 
 This example is based on [this language modeling example from the HuggingFace transformers documentation](https://huggingface.co/docs/transformers/en/tasks/language_modeling).
 To better understand what's going on in this example, it is a good idea to read through these tutorials first:
+
 * [Causal language modeling simple example - HuggingFace docs](https://huggingface.co/docs/transformers/en/tasks/language_modeling)
 * [Fine-tune a language model - Colab Notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb#scrollTo=X6HrpprwIrIz)
 
diff --git a/docs/examples/text_classification.md b/docs/examples/text_classification.md
index 68122bc5..1ebc1c00 100644
--- a/docs/examples/text_classification.md
+++ b/docs/examples/text_classification.md
@@ -1,22 +1,28 @@
-# Text Classification ( + 🤗)
+---
+additional_python_references:
+  - project.algorithms.text_classifier
+  - project.datamodules.text.text_classification
+---
+
+# Text Classification (⚡ + 🤗)
 
 ## Overview
 
-The [TextClassificationExample][project.algorithms.text_classification_example.TextClassificationExample] is a [LightningModule][lightning.pytorch.core.module.LightningModule] for a simple text classification task.
+The `TextClassifier` is a [LightningModule][lightning.pytorch.core.module.LightningModule] for a simple text classification task.
 
-It accepts a [TextClassificationDataModule][project.datamodules.text.TextClassificationDataModule] as input, along with a network.
+It accepts a `TextClassificationDataModule` as input, along with a network.
 
-??? note "Click to show the code for HFExample"
-    {{ inline('project.algorithms.text_classification_example.TextClassificationExample', 4) }}
+??? note "Click to show the code of the LightningModule"
+    {{ inline('project.algorithms.text_classifier.TextClassifier', 4) }}
 
 ## Config files
 
 ### Algorithm config
 
note "Click to show the Algorithm config" - Source: project/configs/algorithm/text_classification_example.yaml + Source: project/configs/algorithm/text_classifier.yaml - {{ inline('project/configs/algorithm/text_classification_example.yaml', 4) }} + {{ inline('project/configs/algorithm/text_classifier.yaml', 4) }} ### Datamodule config diff --git a/docs/examples/torch_sl_example.md b/docs/examples/torch_sl_example.md deleted file mode 100644 index 842b8cc9..00000000 --- a/docs/examples/torch_sl_example.md +++ /dev/null @@ -1,17 +0,0 @@ -# Supervised Learning (PyTorch) - -The [ExampleAlgorithm][project.algorithms.ExampleAlgorithm] is a simple [LightningModule][lightning.pytorch.core.module.LightningModule] for image classification. - -??? note "Click to show the code for ExampleAlgorithm" - {{ inline('project.algorithms.example.ExampleAlgorithm', 4) }} - -Here is a configuration file that you can use to launch a simple experiment: - -??? note "Click to show the yaml config file" - {{ inline('project/configs/experiment/example.yaml', 4) }} - -You can use it like so: - -```console -python project/main.py experiment=example -``` diff --git a/docs/features/jax.md b/docs/features/jax.md index e54d4b19..41c67fd3 100644 --- a/docs/features/jax.md +++ b/docs/features/jax.md @@ -1,9 +1,9 @@ --- additional_python_references: - - project.algorithms.jax_rl_example - - project.algorithms.example - - project.algorithms.jax_example - - project.algorithms.text_classification_example + - project.algorithms.jax_ppo + - project.algorithms.image_classifier + - project.algorithms.jax_image_classifier + - project.algorithms.text_classifier - project.trainers.jax_trainer --- @@ -12,18 +12,10 @@ additional_python_references: > 🔥 NOTE: This is a feature that is entirely unique to this template! 🔥 This template includes examples that use either Jax, PyTorch, or both! +There's a table describing each example [here](../examples/index.md#examples). - -| Example link | Reference | Framework | Lightning? | -| --------------------------------------------------------------- | --------------------------- | ----------- | ------------ | -| [ExampleAlgorithm](../examples/jax_sl_example.md) | `ExampleAlgorithm` | Torch | yes | -| [JaxExample](../examples/jax_sl_example.md) | `JaxExample` | Torch + Jax | yes | -| [TextClassificationExample](../examples/text_classification.md) | `TextClassificationExample` | Torch + 🤗 | yes | -| [JaxRLExample](../examples/jax_rl_example.md) | `JaxRLExample` | Jax | no (almost!) | - - -In fact, here you can mix and match both Jax and Torch code. For example, you can use Jax for your dataloading, your network, or the learning algorithm, all while still benefiting from the nice stuff that comes from using PyTorch-Lightning. +You can mix and match both Jax and Torch code. For example, you can use Jax for your dataloading, your network, or the learning algorithm, all while still benefiting from the nice stuff that comes from using PyTorch-Lightning. ??? note "**How does this work?**" Well, we use [torch-jax-interop](https://www.github.com/lebrice/torch_jax_interop), another package developed here at Mila 😎, that allows easy interop between torch and jax code. Feel free to take a look at it if you'd like to use it as part of your own project. 😁 @@ -40,12 +32,12 @@ training loop as usual, you can! The [lightning.Trainer][lightning.pytorch.trainer.trainer.Trainer] will not be able to tell that you're using Jax! 
-**Take a look at [this image classification example that uses a Jax network](../examples/jax_sl_example.md).**
+**Take a look at [this image classification example that uses a Jax network](../examples/jax_image_classification.md).**
 
 ## End-to-end training in Jax: the `JaxTrainer`
 
-The `JaxTrainer`, used in the [Jax RL Example](../examples/jax_rl_example.md), follows a similar structure as the lightning Trainer. However, instead of training LightningModules, it trains `JaxModule`s, which are a simplified, jax-based look-alike of `lightning.LightningModule`s.
+The `JaxTrainer`, used in the [Jax RL Example](../examples/jax_rl.md), follows a similar structure as the lightning Trainer. However, instead of training LightningModules, it trains `JaxModule`s, which are a simplified, jax-based look-alike of `lightning.LightningModule`s.
 
 The "algorithm" needs to match the `JaxModule` protocol:
diff --git a/docs/features/testing.md b/docs/features/testing.md
index 8e621fd1..e9ea31f2 100644
--- a/docs/features/testing.md
+++ b/docs/features/testing.md
@@ -55,7 +55,7 @@ The built-in tests cover the following:
 - forward pass is deterministic & reproducible;
 - backward pass is deterministic & reproducible;
 
-Take a look at [project.algorithms.testsuites.algorithm_tests][] to see the included base tests for algorithms.
+Take a look at [project.algorithms.testsuites.lightning_module_tests][] to see the included base tests for algorithms.
 
 If you use [Visual Studio Code](https://code.visualstudio.com/), you may want to look into adding the "test explorer" tab to your editor. Then, you'll be able to see and debug the tests using the GUI.
@@ -93,7 +93,7 @@ pytest -x -v --slow
 
 ## Continuous Integration
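The regression files added and removed throughout this diff all record the same kind of per-tensor summary (device, max, mean, min, shape, sum). A rough sketch of how such a summary can be computed is shown below; the function name and rounding are illustrative and do not reflect the template's actual test fixtures.

```python
import torch


def tensor_summary(t: torch.Tensor) -> dict:
    """Per-tensor statistics in the style of the .regression_files YAML entries above."""
    return {
        "device": str(t.device),
        "shape": list(t.shape),
        "max": round(t.max().item(), 4),
        "min": round(t.min().item(), 4),
        "mean": round(t.float().mean().item(), 4),
        "sum": round(t.float().sum().item(), 4),
    }


# A reproducibility test can then compare this dict against a previously saved
# YAML file instead of storing the full tensor.
summary = tensor_summary(torch.randn(128, 10))
```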