diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index d4fb5277..86cd83e5 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -51,8 +51,7 @@
     ".venv": true,
     ".pytest_cache": true,
     ".benchmarks": true,
-    ".ruff_cache": true,
-    ".regression_files": true
+    ".ruff_cache": true
   },
   "python.testing.unittestEnabled": false,
   "python.testing.pytestEnabled": true,
@@ -85,7 +84,6 @@
   "containerEnv": {
     "SCRATCH": "/home/vscode/scratch",
     "SLURM_TMPDIR": "/tmp",
-    "NETWORK_DIR": "/network",
     "UV_LINK_MODE": "symlink",
     "UV_CACHE_DIR": "/home/vscode/.uv_cache"
   },
diff --git a/.github/actions-runner-job.sh b/.github/actions-runner-job.sh
index 432b9a84..4fa7d1e2 100755
--- a/.github/actions-runner-job.sh
+++ b/.github/actions-runner-job.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=1
-#SBATCH --mem=16G
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=32G
 #SBATCH --gpus=rtx8000:1
 #SBATCH --time=00:30:00
 #SBATCH --dependency=singleton
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 70e93c2a..35298b2b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -87,7 +87,7 @@ jobs:
   local_integration_tests:
     needs: [unit_tests, check_docs]
     runs-on: self-hosted
-    timeout-minutes: 20
+    timeout-minutes: 30
     strategy:
       max-parallel: 1
       matrix:
@@ -150,7 +150,7 @@
     name: Run integration tests on the ${{ matrix.cluster }} cluster in job ${{ needs.launch-slurm-actions-runner.outputs.job_id}}
     needs: [launch-slurm-actions-runner]
     runs-on: ${{ matrix.cluster }}
-    timeout-minutes: 20
+    timeout-minutes: 30
     strategy:
       max-parallel: 5
       matrix:
diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml
deleted file mode 100644
index 5dab27b0..00000000
--- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-input:
-  device: cpu
-  hash: -1373365636602041987
-  max: 2.1
-  mean: -0.0
-  min: -2.0
-  shape:
-  - 128
-  - 3
-  - 32
-  - 32
-  sum: -2429.8
-out:
-  device: cpu
-  hash: -5286755934104888446
-  max: 0.7
-  mean: 0.0
-  min: -0.8
-  shape:
-  - 128
-  - 10
-  sum: 20.2
diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml
deleted file mode 100644
index aaa55377..00000000
--- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-input:
-  device: cpu
-  hash: 9223185275738543696
-  max: 2.8
-  mean: 0.5
-  min: -0.4
-  shape:
-  - 128
-  - 1
-  - 28
-  - 28
-  sum: 48391.2
-out:
-  device: cpu
-  hash: 3229404000460739909
-  max: 1.2
-  mean: -0.0
-  min: -1.1
-  shape:
-  - 128
-  - 10
-  sum: -40.6
diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml
deleted file mode 100644
index 0d41f6d3..00000000
--- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-input: - device: cpu - hash: 8611995894311838429 - max: 2.8 - mean: 0.0 - min: -0.4 - shape: - - 128 - - 1 - - 28 - - 28 - sum: 1437.2 -out: - device: cpu - hash: -4763233483389115210 - max: 0.8 - mean: -0.0 - min: -0.9 - shape: - - 128 - - 10 - sum: -30.8 diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml deleted file mode 100644 index dea2f076..00000000 --- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -input: - device: cpu - hash: -1373365636602041987 - max: 2.1 - mean: -0.0 - min: -2.0 - shape: - - 128 - - 3 - - 32 - - 32 - sum: -2429.8 -out: - device: cpu - hash: -1856253906003733022 - max: 2.1 - mean: -0.2 - min: -3.0 - shape: - - 128 - - 10 - sum: -265.8 diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml b/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml deleted file mode 100644 index 78bbee98..00000000 --- a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -input: - device: cpu - hash: -1373365636602041987 - max: 2.1 - mean: -0.0 - min: -2.0 - shape: - - 128 - - 3 - - 32 - - 32 - sum: -2429.8 -out: - device: cpu - hash: -9209917346416037156 - max: 6.0 - mean: 0.3 - min: -5.2 - shape: - - 128 - - 10 - sum: 322.7 diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_cifar10_example.yaml b/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_cifar10_example.yaml deleted file mode 100644 index 66b7eef8..00000000 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_cifar10_example.yaml +++ /dev/null @@ -1,51 +0,0 @@ -network.0.1.bias: - device: cpu - max: '1.770e-02' - mean: '-1.236e-04' - min: '-1.797e-02' - shape: - - 128 - sum: '-1.581e-02' -network.0.1.weight: - device: cpu - max: '1.804e-02' - mean: '-8.050e-06' - min: '-1.804e-02' - shape: - - 128 - - 3072 - sum: '-3.166e+00' -network.1.0.bias: - device: cpu - max: '8.806e-02' - mean: '-3.074e-03' - min: '-8.612e-02' - shape: - - 128 - sum: '-3.935e-01' -network.1.0.weight: - device: cpu - max: '8.836e-02' - mean: '5.354e-04' - min: '-8.837e-02' - shape: - - 128 - - 128 - sum: '8.773e+00' -network.2.0.bias: - device: cpu - max: '8.265e-02' - mean: '2.135e-02' - min: '-2.476e-02' - shape: - - 10 - sum: '2.135e-01' -network.2.0.weight: - device: cpu - max: '8.824e-02' - mean: '-6.046e-04' - min: '-8.823e-02' - shape: - - 10 - - 128 - sum: '-7.739e-01' diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml b/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml deleted file mode 100644 index 309c24b7..00000000 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml +++ /dev/null @@ -1,51 +0,0 @@ -network.0.1.bias: - device: cpu - max: '3.564e-02' - mean: '-5.232e-04' - min: '-3.566e-02' - shape: - - 128 - sum: '-6.697e-02' 
-network.0.1.weight: - device: cpu - max: '3.571e-02' - mean: '7.122e-05' - min: '-3.571e-02' - shape: - - 128 - - 784 - sum: '7.147e+00' -network.1.0.bias: - device: cpu - max: '8.382e-02' - mean: '-9.825e-03' - min: '-8.787e-02' - shape: - - 128 - sum: '-1.258e+00' -network.1.0.weight: - device: cpu - max: '8.838e-02' - mean: '1.486e-04' - min: '-8.838e-02' - shape: - - 128 - - 128 - sum: '2.434e+00' -network.2.0.bias: - device: cpu - max: '7.293e-02' - mean: '1.038e-02' - min: '-8.284e-02' - shape: - - 10 - sum: '1.038e-01' -network.2.0.weight: - device: cpu - max: '8.835e-02' - mean: '-1.525e-03' - min: '-8.816e-02' - shape: - - 10 - - 128 - sum: '-1.952e+00' diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_mnist_example.yaml b/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_mnist_example.yaml deleted file mode 100644 index 309c24b7..00000000 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/fcnet_mnist_example.yaml +++ /dev/null @@ -1,51 +0,0 @@ -network.0.1.bias: - device: cpu - max: '3.564e-02' - mean: '-5.232e-04' - min: '-3.566e-02' - shape: - - 128 - sum: '-6.697e-02' -network.0.1.weight: - device: cpu - max: '3.571e-02' - mean: '7.122e-05' - min: '-3.571e-02' - shape: - - 128 - - 784 - sum: '7.147e+00' -network.1.0.bias: - device: cpu - max: '8.382e-02' - mean: '-9.825e-03' - min: '-8.787e-02' - shape: - - 128 - sum: '-1.258e+00' -network.1.0.weight: - device: cpu - max: '8.838e-02' - mean: '1.486e-04' - min: '-8.838e-02' - shape: - - 128 - - 128 - sum: '2.434e+00' -network.2.0.bias: - device: cpu - max: '7.293e-02' - mean: '1.038e-02' - min: '-8.284e-02' - shape: - - 10 - sum: '1.038e-01' -network.2.0.weight: - device: cpu - max: '8.835e-02' - mean: '-1.525e-03' - min: '-8.816e-02' - shape: - - 10 - - 128 - sum: '-1.952e+00' diff --git a/.regression_files/project/algorithms/hf_example_test/test_backward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml b/.regression_files/project/algorithms/hf_example_test/test_backward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml deleted file mode 100644 index f91a9de7..00000000 --- a/.regression_files/project/algorithms/hf_example_test/test_backward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml +++ /dev/null @@ -1,286 +0,0 @@ -batch.attention_mask: - device: cpu - max: 1 - mean: '8.374e-02' - min: 0 - shape: - - 32 - - 128 - sum: 343 -batch.input_ids: - device: cpu - max: 26101 - mean: '1.597e+02' - min: 0 - shape: - - 32 - - 128 - sum: 654306 -batch.labels: - device: cpu - max: 1 - mean: '7.188e-01' - min: 0 - shape: - - 32 - sum: 23 -batch.token_type_ids: - device: cpu - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 -grads.network.albert.embeddings.LayerNorm.bias: - device: cpu - max: '9.495e-03' - mean: '-1.080e-05' - min: '-1.796e-02' - shape: - - 128 - sum: '-1.383e-03' -grads.network.albert.embeddings.LayerNorm.weight: - device: cpu - max: '1.186e-02' - mean: '-2.625e-04' - min: '-1.228e-02' - shape: - - 128 - sum: '-3.360e-02' -grads.network.albert.embeddings.position_embeddings.weight: - device: cpu - max: '6.970e-01' - mean: '-3.638e-12' - min: '-1.086e+00' - shape: - - 512 - - 128 - sum: '-2.384e-07' -grads.network.albert.embeddings.token_type_embeddings.weight: - device: cpu - max: '6.053e-01' - mean: '-1.863e-09' - min: '-1.119e+00' - shape: - - 2 - - 128 - sum: '-4.768e-07' 
-grads.network.albert.embeddings.word_embeddings.weight: - device: cpu - max: '1.541e+00' - mean: '-2.008e-13' - min: '-6.233e-01' - shape: - - 30000 - - 128 - sum: '-7.711e-07' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.bias: - device: cpu - max: '6.357e-02' - mean: '-3.738e-04' - min: '-6.593e-02' - shape: - - 768 - sum: '-2.871e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.weight: - device: cpu - max: '8.125e-02' - mean: '1.121e-04' - min: '-5.811e-01' - shape: - - 768 - sum: '8.612e-02' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.bias: - device: cpu - max: '6.013e-02' - mean: '-1.940e-11' - min: '-5.395e-02' - shape: - - 768 - sum: '-1.490e-08' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight: - device: cpu - max: '1.061e-01' - mean: '4.042e-13' - min: '-1.112e-01' - shape: - - 768 - - 768 - sum: '2.384e-07' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias: - device: cpu - max: '1.275e-08' - mean: '-1.333e-11' - min: '-6.650e-09' - shape: - - 768 - sum: '-1.023e-08' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight: - device: cpu - max: '6.536e-01' - mean: '4.320e-06' - min: '-3.507e-01' - shape: - - 768 - - 768 - sum: '2.548e+00' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias: - device: cpu - max: '2.402e-02' - mean: '2.56e-05' - min: '-1.913e-02' - shape: - - 768 - sum: '1.966e-02' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight: - device: cpu - max: '1.087e-01' - mean: '7.314e-07' - min: '-1.164e-01' - shape: - - 768 - - 768 - sum: '4.314e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.bias: - device: cpu - max: '6.786e-02' - mean: '-3.315e-04' - min: '-8.925e-02' - shape: - - 768 - sum: '-2.546e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight: - device: cpu - max: '4.607e-01' - mean: '-6.091e-06' - min: '-3.011e-01' - shape: - - 768 - - 768 - sum: '-3.592e+00' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.bias: - device: cpu - max: '4.213e-02' - mean: '-3.888e-05' - min: '-6.737e-02' - shape: - - 3072 - sum: '-1.195e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.weight: - device: cpu - max: '2.953e-01' - mean: '-5.795e-07' - min: '-2.323e-01' - shape: - - 3072 - - 768 - sum: '-1.367e+00' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.bias: - device: cpu - max: '5.003e-02' - mean: '-5.821e-11' - min: '-5.843e-02' - shape: - - 768 - sum: '-4.470e-08' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.weight: - device: cpu - max: '6.105e-01' - mean: '-2.627e-12' - min: '-5.125e-01' - shape: - - 768 - - 3072 - sum: '-6.199e-06' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias: - device: cpu - max: '6.435e-02' - mean: '-1.912e-04' - min: '-6.824e-02' - shape: - - 768 - sum: '-1.468e-01' -grads.network.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight: - device: cpu - max: '5.071e-02' - mean: '-6.398e-04' - min: '-4.395e-01' - shape: - - 768 - sum: '-4.914e-01' -grads.network.albert.encoder.embedding_hidden_mapping_in.bias: - device: cpu - max: '7.07e-03' - mean: '-8.878e-05' - 
min: '-7.231e-03' - shape: - - 768 - sum: '-6.818e-02' -grads.network.albert.encoder.embedding_hidden_mapping_in.weight: - device: cpu - max: '8.686e-02' - mean: '2.216e-06' - min: '-8.327e-02' - shape: - - 768 - - 128 - sum: '2.178e-01' -grads.network.albert.pooler.bias: - device: cpu - max: '1.253e-02' - mean: '5.213e-05' - min: '-8.348e-03' - shape: - - 768 - sum: '4.004e-02' -grads.network.albert.pooler.weight: - device: cpu - max: '9.280e-02' - mean: '-9.552e-07' - min: '-6.335e-02' - shape: - - 768 - - 768 - sum: '-5.634e-01' -grads.network.classifier.bias: - device: cpu - max: '2.129e-01' - mean: '7.451e-09' - min: '-2.129e-01' - shape: - - 2 - sum: '1.490e-08' -grads.network.classifier.weight: - device: cpu - max: '2.222e-01' - mean: '-3.444e-10' - min: '-2.222e-01' - shape: - - 2 - - 768 - sum: '-5.29e-07' -outputs.labels: - device: cpu - max: 1 - mean: '7.188e-01' - min: 0 - shape: - - 32 - sum: 23 -outputs.loss: - device: cpu - max: '7.185e-01' - mean: '7.185e-01' - min: '7.185e-01' - shape: [] - sum: '7.185e-01' -outputs.preds: - device: cpu - max: 1 - mean: '4.688e-01' - min: 0 - shape: - - 32 - sum: 15 diff --git a/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml b/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml deleted file mode 100644 index f8eb4d0d..00000000 --- a/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml +++ /dev/null @@ -1,57 +0,0 @@ -input.attention_mask: - device: cpu - hash: -5248677368460617222 - max: 1 - mean: 0.1 - min: 0 - shape: - - 32 - - 128 - sum: 343 -input.input_ids: - device: cpu - hash: -8391087330217722819 - max: 26101 - mean: 159.7 - min: 0 - shape: - - 32 - - 128 - sum: 654306 -input.labels: - device: cpu - hash: -3945588999998408889 - max: 1 - mean: 0.7 - min: 0 - shape: - - 32 - sum: 23 -input.token_type_ids: - device: cpu - hash: -8123354182314851848 - max: 0 - mean: 0.0 - min: 0 - shape: - - 32 - - 128 - sum: 0 -out.logits: - device: cpu - hash: -3045239871714879068 - max: 0.6 - mean: 0.4 - min: 0.1 - shape: - - 32 - - 2 - sum: 26.8 -out.loss: - device: cpu - hash: 1287410195914297480 - max: 0.7 - mean: 0.7 - min: 0.7 - shape: [] - sum: 0.7 diff --git a/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cuda/albert_base_v2_hf_text_hf_example.yaml b/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cuda/albert_base_v2_hf_text_hf_example.yaml deleted file mode 100644 index 8e622121..00000000 --- a/.regression_files/project/algorithms/hf_example_test/test_forward_pass_is_reproducible/cuda/albert_base_v2_hf_text_hf_example.yaml +++ /dev/null @@ -1,51 +0,0 @@ -input.attention_mask: - device: cuda:0 - max: 1 - mean: '8.374e-02' - min: 0 - shape: - - 32 - - 128 - sum: 343 -input.input_ids: - device: cuda:0 - max: 26101 - mean: '1.597e+02' - min: 0 - shape: - - 32 - - 128 - sum: 654306 -input.labels: - device: cuda:0 - max: 1 - mean: '7.188e-01' - min: 0 - shape: - - 32 - sum: 23 -input.token_type_ids: - device: cuda:0 - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 -out.logits: - device: cuda:0 - max: '4.019e-02' - mean: '-1.58e-01' - min: '-4.991e-01' - shape: - - 32 - - 2 - sum: '-1.011e+01' -out.loss: - device: cuda:0 - max: '7.185e-01' - mean: '7.185e-01' - min: '7.185e-01' - shape: [] - sum: '7.185e-01' diff --git 
a/.regression_files/project/algorithms/hf_example_test/test_initialization_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml b/.regression_files/project/algorithms/hf_example_test/test_initialization_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml deleted file mode 100644 index 528e67c0..00000000 --- a/.regression_files/project/algorithms/hf_example_test/test_initialization_is_reproducible/cpu/albert_base_v2_hf_text_hf_example.yaml +++ /dev/null @@ -1,228 +0,0 @@ -network.albert.embeddings.LayerNorm.bias: - device: cpu - max: '2.53e+00' - mean: '-3.477e-02' - min: '-1.398e+00' - shape: - - 128 - sum: '-4.451e+00' -network.albert.embeddings.LayerNorm.weight: - device: cpu - max: '3.675e+00' - mean: '3.264e+00' - min: '1.297e+00' - shape: - - 128 - sum: '4.178e+02' -network.albert.embeddings.position_embeddings.weight: - device: cpu - max: '2.774e-01' - mean: '1.058e-04' - min: '-2.344e-01' - shape: - - 512 - - 128 - sum: '6.933e+00' -network.albert.embeddings.token_type_embeddings.weight: - device: cpu - max: '4.431e-02' - mean: '1.339e-04' - min: '-8.033e-02' - shape: - - 2 - - 128 - sum: '3.429e-02' -network.albert.embeddings.word_embeddings.weight: - device: cpu - max: '2.003e-01' - mean: '-5.478e-03' - min: '-1.946e-01' - shape: - - 30000 - - 128 - sum: '-2.104e+04' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.bias: - device: cpu - max: '2.411e+00' - mean: '-6.698e-03' - min: '-3.421e+00' - shape: - - 768 - sum: '-5.144e+00' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.weight: - device: cpu - max: '2.478e+00' - mean: '5.703e-01' - min: '3.535e-01' - shape: - - 768 - sum: '4.38e+02' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.bias: - device: cpu - max: '5.149e+00' - mean: '-3.476e-03' - min: '-8.748e+00' - shape: - - 768 - sum: '-2.669e+00' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight: - device: cpu - max: '7.227e-01' - mean: '1.840e-06' - min: '-5.057e-01' - shape: - - 768 - - 768 - sum: '1.085e+00' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias: - device: cpu - max: '1.643e+00' - mean: '1.291e-02' - min: '-1.689e+00' - shape: - - 768 - sum: '9.916e+00' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight: - device: cpu - max: '2.669e-01' - mean: '1.060e-04' - min: '-3.136e-01' - shape: - - 768 - - 768 - sum: '6.253e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias: - device: cpu - max: '4.806e+00' - mean: '6.103e-02' - min: '-4.117e+00' - shape: - - 768 - sum: '4.687e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight: - device: cpu - max: '3.613e-01' - mean: '-2.149e-05' - min: '-2.743e-01' - shape: - - 768 - - 768 - sum: '-1.268e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.bias: - device: cpu - max: '5.064e-01' - mean: '8.661e-04' - min: '-6.153e-01' - shape: - - 768 - sum: '6.652e-01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight: - device: cpu - max: '2.998e-01' - mean: '-9.619e-05' - min: '-2.962e-01' - shape: - - 768 - - 768 - sum: '-5.674e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.bias: - device: cpu - max: '5.147e-01' - mean: '-5.56e-01' - min: '-9.e+00' - shape: - - 3072 - sum: '-1.708e+03' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.weight: - 
device: cpu - max: '1.932e+00' - mean: '-1.609e-05' - min: '-1.779e+00' - shape: - - 3072 - - 768 - sum: '-3.796e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.bias: - device: cpu - max: '1.906e+00' - mean: '-1.445e-02' - min: '-1.471e+01' - shape: - - 768 - sum: '-1.11e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.weight: - device: cpu - max: '1.226e+00' - mean: '-1.576e-05' - min: '-2.475e+00' - shape: - - 768 - - 3072 - sum: '-3.717e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias: - device: cpu - max: '4.331e+00' - mean: '-4.060e-02' - min: '-7.592e-01' - shape: - - 768 - sum: '-3.118e+01' -network.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight: - device: cpu - max: '3.067e+00' - mean: '1.35e+00' - min: '2.373e-01' - shape: - - 768 - sum: '1.037e+03' -network.albert.encoder.embedding_hidden_mapping_in.bias: - device: cpu - max: '2.250e+00' - mean: '-2.328e-02' - min: '-2.484e+00' - shape: - - 768 - sum: '-1.788e+01' -network.albert.encoder.embedding_hidden_mapping_in.weight: - device: cpu - max: '2.709e-01' - mean: '3.868e-04' - min: '-2.624e-01' - shape: - - 768 - - 128 - sum: '3.802e+01' -network.albert.pooler.bias: - device: cpu - max: '1.409e+00' - mean: '5.837e-03' - min: '-1.279e+00' - shape: - - 768 - sum: '4.483e+00' -network.albert.pooler.weight: - device: cpu - max: '2.83e-01' - mean: '-2.292e-05' - min: '-2.817e-01' - shape: - - 768 - - 768 - sum: '-1.352e+01' -network.classifier.bias: - device: cpu - max: '0.e+00' - mean: '0.e+00' - min: '0.e+00' - shape: - - 2 - sum: '0.e+00' -network.classifier.weight: - device: cpu - max: '6.891e-02' - mean: '8.459e-05' - min: '-6.203e-02' - shape: - - 2 - - 768 - sum: '1.299e-01' diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_cifar10_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_cifar10_image_classifier.yaml index b4b3f47e..8e762f3f 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_cifar10_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.126e+00' mean: '-6.179e-03' min: '-1.989e+00' @@ -10,7 +10,7 @@ batch.0: - 32 sum: '-2.43e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.0.1.bias: - device: cpu + device: cuda:0 max: '6.107e-03' mean: '1.775e-04' min: '-5.292e-03' @@ -26,7 +26,7 @@ grads.network.0.1.bias: - 128 sum: '2.272e-02' grads.network.0.1.weight: - device: cpu + device: cuda:0 max: '1.307e-02' mean: '4.693e-05' min: '-1.141e-02' @@ -35,7 +35,7 @@ grads.network.0.1.weight: - 3072 sum: '1.845e+01' grads.network.1.0.bias: - device: cpu + device: cuda:0 max: '1.041e-02' mean: '6.975e-04' min: '-8.782e-03' @@ -43,7 +43,7 @@ grads.network.1.0.bias: - 128 sum: '8.928e-02' grads.network.1.0.weight: - device: cpu + device: cuda:0 max: '1.584e-02' mean: '1.481e-04' min: '-1.507e-02' @@ 
-52,7 +52,7 @@ grads.network.1.0.weight: - 128 sum: '2.426e+00' grads.network.2.0.bias: - device: cpu + device: cuda:0 max: '3.282e-02' mean: '-1.956e-09' min: '-2.134e-02' @@ -60,16 +60,16 @@ grads.network.2.0.bias: - 10 sum: '-1.956e-08' grads.network.2.0.weight: - device: cpu + device: cuda:0 max: '2.200e-02' - mean: '-2.874e-10' + mean: '-2.561e-10' min: '-5.831e-02' shape: - 10 - 128 - sum: '-3.679e-07' + sum: '-3.278e-07' outputs.logits: - device: cpu + device: cuda:0 max: '7.036e-01' mean: '-8.651e-03' min: '-8.180e-01' @@ -78,14 +78,14 @@ outputs.logits: - 10 sum: '-1.107e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.316e+00' mean: '2.316e+00' min: '2.316e+00' shape: [] sum: '2.316e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_fashion_mnist_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_fashion_mnist_image_classifier.yaml index ee70a8f8..8be326eb 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_fashion_mnist_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_fashion_mnist_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.821e+00' mean: '4.822e-01' min: '-4.242e-01' @@ -10,7 +10,7 @@ batch.0: - 28 sum: '4.839e+04' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.0.1.bias: - device: cpu + device: cuda:0 max: '6.875e-03' mean: '2.096e-04' min: '-8.370e-03' @@ -26,7 +26,7 @@ grads.network.0.1.bias: - 128 sum: '2.683e-02' grads.network.0.1.weight: - device: cpu + device: cuda:0 max: '1.948e-02' mean: '2.916e-04' min: '-2.213e-02' @@ -35,7 +35,7 @@ grads.network.0.1.weight: - 784 sum: '2.926e+01' grads.network.1.0.bias: - device: cpu + device: cuda:0 max: '1.109e-02' mean: '2.213e-04' min: '-1.267e-02' @@ -43,7 +43,7 @@ grads.network.1.0.bias: - 128 sum: '2.832e-02' grads.network.1.0.weight: - device: cpu + device: cuda:0 max: '2.374e-02' mean: '9.326e-05' min: '-2.32e-02' @@ -52,7 +52,7 @@ grads.network.1.0.weight: - 128 sum: '1.528e+00' grads.network.2.0.bias: - device: cpu + device: cuda:0 max: '3.847e-02' mean: '-3.353e-09' min: '-4.706e-02' @@ -60,16 +60,16 @@ grads.network.2.0.bias: - 10 sum: '-3.353e-08' grads.network.2.0.weight: - device: cpu + device: cuda:0 max: '5.741e-02' - mean: '-4.195e-10' + mean: '-3.929e-10' min: '-6.431e-02' shape: - 10 - 128 - sum: '-5.369e-07' + sum: '-5.029e-07' outputs.logits: - device: cpu + device: cuda:0 max: '9.872e-01' mean: '-1.288e-02' min: '-7.225e-01' @@ -78,14 +78,14 @@ outputs.logits: - 10 sum: '-1.648e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.311e+00' mean: '2.311e+00' min: '2.311e+00' shape: [] sum: '2.311e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml 
b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_mnist_image_classifier.yaml similarity index 81% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_mnist_image_classifier.yaml index 90b624d9..232a8e50 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/fcnet_mnist_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/fcnet_mnist_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.821e+00' mean: '1.432e-02' min: '-4.242e-01' @@ -10,7 +10,7 @@ batch.0: - 28 sum: '1.437e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.242e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 543 grads.network.0.1.bias: - device: cpu + device: cuda:0 max: '1.075e-02' mean: '2.421e-04' min: '-7.844e-03' @@ -26,7 +26,7 @@ grads.network.0.1.bias: - 128 sum: '3.099e-02' grads.network.0.1.weight: - device: cpu + device: cuda:0 max: '2.006e-02' mean: '5.258e-05' min: '-1.844e-02' @@ -35,7 +35,7 @@ grads.network.0.1.weight: - 784 sum: '5.277e+00' grads.network.1.0.bias: - device: cpu + device: cuda:0 max: '1.169e-02' mean: '4.285e-04' min: '-1.152e-02' @@ -43,7 +43,7 @@ grads.network.1.0.bias: - 128 sum: '5.485e-02' grads.network.1.0.weight: - device: cpu + device: cuda:0 max: '1.753e-02' mean: '1.016e-04' min: '-2.219e-02' @@ -52,24 +52,24 @@ grads.network.1.0.weight: - 128 sum: '1.665e+00' grads.network.2.0.bias: - device: cpu + device: cuda:0 max: '3.969e-02' - mean: '-1.304e-09' + mean: '-1.490e-09' min: '-7.979e-02' shape: - 10 - sum: '-1.304e-08' + sum: '-1.490e-08' grads.network.2.0.weight: - device: cpu + device: cuda:0 max: '3.221e-02' - mean: '-1.306e-10' + mean: '-1.928e-10' min: '-6.755e-02' shape: - 10 - 128 - sum: '-1.672e-07' + sum: '-2.468e-07' outputs.logits: - device: cpu + device: cuda:0 max: '7.029e-01' mean: '-3.564e-02' min: '-7.781e-01' @@ -78,14 +78,14 @@ outputs.logits: - 10 sum: '-4.562e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.304e+00' mean: '2.304e+00' min: '2.304e+00' shape: [] sum: '2.304e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.242e+00' min: 0 diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_cifar10_image_classifier.yaml similarity index 86% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_cifar10_image_classifier.yaml index f9556c68..1ada67d1 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet18_cifar10_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.126e+00' mean: '-6.179e-03' min: '-1.989e+00' @@ -10,7 +10,7 @@ batch.0: - 32 sum: '-2.43e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 
grads.network.bn1.bias: - device: cpu + device: cuda:0 max: '4.94e-02' mean: '3.131e-04' min: '-4.549e-02' @@ -26,7 +26,7 @@ grads.network.bn1.bias: - 64 sum: '2.004e-02' grads.network.bn1.weight: - device: cpu + device: cuda:0 max: '7.001e-02' mean: '1.024e-03' min: '-7.857e-02' @@ -34,7 +34,7 @@ grads.network.bn1.weight: - 64 sum: '6.554e-02' grads.network.conv1.weight: - device: cpu + device: cuda:0 max: '6.192e-01' mean: '1.341e-03' min: '-7.564e-01' @@ -45,7 +45,7 @@ grads.network.conv1.weight: - 7 sum: '1.261e+01' grads.network.fc.bias: - device: cpu + device: cuda:0 max: '8.718e-02' mean: '-2.235e-09' min: '-7.594e-02' @@ -53,16 +53,16 @@ grads.network.fc.bias: - 10 sum: '-2.235e-08' grads.network.fc.weight: - device: cpu + device: cuda:0 max: '1.526e-01' - mean: '-8.327e-10' + mean: '-7.902e-10' min: '-1.636e-01' shape: - 10 - 512 - sum: '-4.264e-06' + sum: '-4.046e-06' grads.network.layer1.0.bn1.bias: - device: cpu + device: cuda:0 max: '4.809e-02' mean: '-6.887e-05' min: '-4.261e-02' @@ -70,15 +70,15 @@ grads.network.layer1.0.bn1.bias: - 64 sum: '-4.407e-03' grads.network.layer1.0.bn1.weight: - device: cpu + device: cuda:0 max: '5.681e-02' - mean: '-2.846e-08' + mean: '-2.87e-08' min: '-6.472e-02' shape: - 64 - sum: '-1.822e-06' + sum: '-1.837e-06' grads.network.layer1.0.bn2.bias: - device: cpu + device: cuda:0 max: '2.823e-02' mean: '6.060e-04' min: '-3.829e-02' @@ -86,7 +86,7 @@ grads.network.layer1.0.bn2.bias: - 64 sum: '3.878e-02' grads.network.layer1.0.bn2.weight: - device: cpu + device: cuda:0 max: '4.298e-02' mean: '-1.402e-03' min: '-5.307e-02' @@ -94,7 +94,7 @@ grads.network.layer1.0.bn2.weight: - 64 sum: '-8.975e-02' grads.network.layer1.0.conv1.weight: - device: cpu + device: cuda:0 max: '1.152e-01' mean: '2.658e-05' min: '-1.006e-01' @@ -105,7 +105,7 @@ grads.network.layer1.0.conv1.weight: - 3 sum: '9.8e-01' grads.network.layer1.0.conv2.weight: - device: cpu + device: cuda:0 max: '7.023e-02' mean: '2.208e-04' min: '-8.426e-02' @@ -116,7 +116,7 @@ grads.network.layer1.0.conv2.weight: - 3 sum: '8.138e+00' grads.network.layer1.1.bn1.bias: - device: cpu + device: cuda:0 max: '5.121e-02' mean: '1.57e-05' min: '-3.888e-02' @@ -124,15 +124,15 @@ grads.network.layer1.1.bn1.bias: - 64 sum: '1.005e-03' grads.network.layer1.1.bn1.weight: - device: cpu + device: cuda:0 max: '3.775e-02' - mean: '4.249e-09' + mean: '4.075e-09' min: '-3.404e-02' shape: - 64 - sum: '2.719e-07' + sum: '2.608e-07' grads.network.layer1.1.bn2.bias: - device: cpu + device: cuda:0 max: '2.051e-02' mean: '1.167e-03' min: '-2.095e-02' @@ -140,7 +140,7 @@ grads.network.layer1.1.bn2.bias: - 64 sum: '7.466e-02' grads.network.layer1.1.bn2.weight: - device: cpu + device: cuda:0 max: '3.145e-02' mean: '3.783e-04' min: '-3.695e-02' @@ -148,7 +148,7 @@ grads.network.layer1.1.bn2.weight: - 64 sum: '2.421e-02' grads.network.layer1.1.conv1.weight: - device: cpu + device: cuda:0 max: '7.035e-02' mean: '-9.996e-04' min: '-7.167e-02' @@ -159,7 +159,7 @@ grads.network.layer1.1.conv1.weight: - 3 sum: '-3.685e+01' grads.network.layer1.1.conv2.weight: - device: cpu + device: cuda:0 max: '7.708e-02' mean: '3.07e-04' min: '-5.375e-02' @@ -170,7 +170,7 @@ grads.network.layer1.1.conv2.weight: - 3 sum: '1.132e+01' grads.network.layer2.0.bn1.bias: - device: cpu + device: cuda:0 max: '2.687e-02' mean: '5.859e-04' min: '-2.458e-02' @@ -178,7 +178,7 @@ grads.network.layer2.0.bn1.bias: - 128 sum: '7.500e-02' grads.network.layer2.0.bn1.weight: - device: cpu + device: cuda:0 max: '2.383e-02' mean: '-1.983e-08' min: '-3.218e-02' @@ -186,7 
+186,7 @@ grads.network.layer2.0.bn1.weight: - 128 sum: '-2.539e-06' grads.network.layer2.0.bn2.bias: - device: cpu + device: cuda:0 max: '1.778e-02' mean: '-7.097e-04' min: '-2.318e-02' @@ -194,7 +194,7 @@ grads.network.layer2.0.bn2.bias: - 128 sum: '-9.084e-02' grads.network.layer2.0.bn2.weight: - device: cpu + device: cuda:0 max: '2.506e-02' mean: '-1.001e-03' min: '-2.575e-02' @@ -202,7 +202,7 @@ grads.network.layer2.0.bn2.weight: - 128 sum: '-1.281e-01' grads.network.layer2.0.conv1.weight: - device: cpu + device: cuda:0 max: '7.148e-02' mean: '8.56e-04' min: '-6.533e-02' @@ -213,7 +213,7 @@ grads.network.layer2.0.conv1.weight: - 3 sum: '6.311e+01' grads.network.layer2.0.conv2.weight: - device: cpu + device: cuda:0 max: '4.581e-02' mean: '5.887e-06' min: '-4.373e-02' @@ -224,7 +224,7 @@ grads.network.layer2.0.conv2.weight: - 3 sum: '8.681e-01' grads.network.layer2.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '5.408e-02' mean: '6.587e-05' min: '-6.218e-02' @@ -235,7 +235,7 @@ grads.network.layer2.0.downsample.0.weight: - 1 sum: '5.396e-01' grads.network.layer2.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '1.778e-02' mean: '-7.097e-04' min: '-2.318e-02' @@ -243,7 +243,7 @@ grads.network.layer2.0.downsample.1.bias: - 128 sum: '-9.084e-02' grads.network.layer2.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '2.67e-02' mean: '7.026e-04' min: '-2.834e-02' @@ -251,7 +251,7 @@ grads.network.layer2.0.downsample.1.weight: - 128 sum: '8.994e-02' grads.network.layer2.1.bn1.bias: - device: cpu + device: cuda:0 max: '2.282e-02' mean: '4.179e-04' min: '-1.989e-02' @@ -259,15 +259,15 @@ grads.network.layer2.1.bn1.bias: - 128 sum: '5.349e-02' grads.network.layer2.1.bn1.weight: - device: cpu + device: cuda:0 max: '2.738e-02' - mean: '3.405e-09' + mean: '3.492e-09' min: '-2.028e-02' shape: - 128 - sum: '4.359e-07' + sum: '4.470e-07' grads.network.layer2.1.bn2.bias: - device: cpu + device: cuda:0 max: '1.634e-02' mean: '4.516e-04' min: '-1.524e-02' @@ -275,7 +275,7 @@ grads.network.layer2.1.bn2.bias: - 128 sum: '5.78e-02' grads.network.layer2.1.bn2.weight: - device: cpu + device: cuda:0 max: '2.251e-02' mean: '2.985e-04' min: '-2.765e-02' @@ -283,7 +283,7 @@ grads.network.layer2.1.bn2.weight: - 128 sum: '3.821e-02' grads.network.layer2.1.conv1.weight: - device: cpu + device: cuda:0 max: '4.786e-02' mean: '-1.842e-04' min: '-4.788e-02' @@ -294,7 +294,7 @@ grads.network.layer2.1.conv1.weight: - 3 sum: '-2.716e+01' grads.network.layer2.1.conv2.weight: - device: cpu + device: cuda:0 max: '3.281e-02' mean: '-1.638e-05' min: '-3.597e-02' @@ -305,7 +305,7 @@ grads.network.layer2.1.conv2.weight: - 3 sum: '-2.415e+00' grads.network.layer3.0.bn1.bias: - device: cpu + device: cuda:0 max: '1.373e-02' mean: '-1.949e-05' min: '-1.339e-02' @@ -313,15 +313,15 @@ grads.network.layer3.0.bn1.bias: - 256 sum: '-4.989e-03' grads.network.layer3.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.651e-02' - mean: '-1.781e-08' + mean: '-1.778e-08' min: '-1.433e-02' shape: - 256 - sum: '-4.56e-06' + sum: '-4.552e-06' grads.network.layer3.0.bn2.bias: - device: cpu + device: cuda:0 max: '1.342e-02' mean: '-1.425e-04' min: '-1.272e-02' @@ -329,7 +329,7 @@ grads.network.layer3.0.bn2.bias: - 256 sum: '-3.647e-02' grads.network.layer3.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.591e-02' mean: '-4.350e-04' min: '-1.678e-02' @@ -337,7 +337,7 @@ grads.network.layer3.0.bn2.weight: - 256 sum: '-1.114e-01' grads.network.layer3.0.conv1.weight: - device: cpu + device: cuda:0 max: '3.91e-02' mean: 
'1.103e-04' min: '-3.65e-02' @@ -348,7 +348,7 @@ grads.network.layer3.0.conv1.weight: - 3 sum: '3.254e+01' grads.network.layer3.0.conv2.weight: - device: cpu + device: cuda:0 max: '2.947e-02' mean: '-2.338e-05' min: '-3.166e-02' @@ -359,7 +359,7 @@ grads.network.layer3.0.conv2.weight: - 3 sum: '-1.379e+01' grads.network.layer3.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '3.125e-02' mean: '-1.221e-06' min: '-2.705e-02' @@ -370,7 +370,7 @@ grads.network.layer3.0.downsample.0.weight: - 1 sum: '-4.002e-02' grads.network.layer3.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '1.342e-02' mean: '-1.425e-04' min: '-1.272e-02' @@ -378,7 +378,7 @@ grads.network.layer3.0.downsample.1.bias: - 256 sum: '-3.647e-02' grads.network.layer3.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.214e-02' mean: '5.825e-05' min: '-1.422e-02' @@ -386,7 +386,7 @@ grads.network.layer3.0.downsample.1.weight: - 256 sum: '1.491e-02' grads.network.layer3.1.bn1.bias: - device: cpu + device: cuda:0 max: '1.198e-02' mean: '1.985e-04' min: '-9.063e-03' @@ -394,15 +394,15 @@ grads.network.layer3.1.bn1.bias: - 256 sum: '5.082e-02' grads.network.layer3.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.364e-02' - mean: '1.122e-08' + mean: '1.119e-08' min: '-1.406e-02' shape: - 256 - sum: '2.874e-06' + sum: '2.865e-06' grads.network.layer3.1.bn2.bias: - device: cpu + device: cuda:0 max: '6.948e-03' mean: '1.387e-04' min: '-6.29e-03' @@ -410,7 +410,7 @@ grads.network.layer3.1.bn2.bias: - 256 sum: '3.551e-02' grads.network.layer3.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.099e-02' mean: '3.768e-04' min: '-1.145e-02' @@ -418,7 +418,7 @@ grads.network.layer3.1.bn2.weight: - 256 sum: '9.646e-02' grads.network.layer3.1.conv1.weight: - device: cpu + device: cuda:0 max: '2.413e-02' mean: '-6.619e-06' min: '-2.651e-02' @@ -429,7 +429,7 @@ grads.network.layer3.1.conv1.weight: - 3 sum: '-3.904e+00' grads.network.layer3.1.conv2.weight: - device: cpu + device: cuda:0 max: '2.347e-02' mean: '-3.211e-05' min: '-2.596e-02' @@ -440,7 +440,7 @@ grads.network.layer3.1.conv2.weight: - 3 sum: '-1.894e+01' grads.network.layer4.0.bn1.bias: - device: cpu + device: cuda:0 max: '6.987e-03' mean: '-5.95e-06' min: '-6.451e-03' @@ -448,7 +448,7 @@ grads.network.layer4.0.bn1.bias: - 512 sum: '-3.046e-03' grads.network.layer4.0.bn1.weight: - device: cpu + device: cuda:0 max: '8.782e-03' mean: '5.227e-08' min: '-8.326e-03' @@ -456,7 +456,7 @@ grads.network.layer4.0.bn1.weight: - 512 sum: '2.676e-05' grads.network.layer4.0.bn2.bias: - device: cpu + device: cuda:0 max: '7.944e-03' mean: '4.654e-04' min: '-5.159e-03' @@ -464,7 +464,7 @@ grads.network.layer4.0.bn2.bias: - 512 sum: '2.383e-01' grads.network.layer4.0.bn2.weight: - device: cpu + device: cuda:0 max: '7.365e-03' mean: '3.815e-04' min: '-7.759e-03' @@ -472,7 +472,7 @@ grads.network.layer4.0.bn2.weight: - 512 sum: '1.953e-01' grads.network.layer4.0.conv1.weight: - device: cpu + device: cuda:0 max: '3.395e-02' mean: '1.298e-05' min: '-3.451e-02' @@ -483,7 +483,7 @@ grads.network.layer4.0.conv1.weight: - 3 sum: '1.531e+01' grads.network.layer4.0.conv2.weight: - device: cpu + device: cuda:0 max: '2.825e-02' mean: '-1.254e-06' min: '-2.923e-02' @@ -494,7 +494,7 @@ grads.network.layer4.0.conv2.weight: - 3 sum: '-2.96e+00' grads.network.layer4.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '1.519e-02' mean: '2.644e-06' min: '-1.993e-02' @@ -505,7 +505,7 @@ grads.network.layer4.0.downsample.0.weight: - 1 sum: '3.466e-01' 
grads.network.layer4.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '7.944e-03' mean: '4.654e-04' min: '-5.159e-03' @@ -513,7 +513,7 @@ grads.network.layer4.0.downsample.1.bias: - 512 sum: '2.383e-01' grads.network.layer4.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '6.664e-03' mean: '3.273e-04' min: '-6.98e-03' @@ -521,7 +521,7 @@ grads.network.layer4.0.downsample.1.weight: - 512 sum: '1.676e-01' grads.network.layer4.1.bn1.bias: - device: cpu + device: cuda:0 max: '5.407e-03' mean: '9.024e-05' min: '-4.404e-03' @@ -529,15 +529,15 @@ grads.network.layer4.1.bn1.bias: - 512 sum: '4.620e-02' grads.network.layer4.1.bn1.weight: - device: cpu + device: cuda:0 max: '5.791e-03' - mean: '4.915e-08' + mean: '4.913e-08' min: '-5.188e-03' shape: - 512 - sum: '2.516e-05' + sum: '2.515e-05' grads.network.layer4.1.bn2.bias: - device: cpu + device: cuda:0 max: '8.746e-03' mean: '4.971e-04' min: '-9.116e-03' @@ -545,7 +545,7 @@ grads.network.layer4.1.bn2.bias: - 512 sum: '2.545e-01' grads.network.layer4.1.bn2.weight: - device: cpu + device: cuda:0 max: '6.717e-03' mean: '3.269e-04' min: '-5.782e-03' @@ -553,7 +553,7 @@ grads.network.layer4.1.bn2.weight: - 512 sum: '1.674e-01' grads.network.layer4.1.conv1.weight: - device: cpu + device: cuda:0 max: '2.951e-02' mean: '-5.57e-06' min: '-3.434e-02' @@ -564,7 +564,7 @@ grads.network.layer4.1.conv1.weight: - 3 sum: '-1.314e+01' grads.network.layer4.1.conv2.weight: - device: cpu + device: cuda:0 max: '2.492e-02' mean: '-1.259e-06' min: '-2.262e-02' @@ -575,7 +575,7 @@ grads.network.layer4.1.conv2.weight: - 3 sum: '-2.971e+00' outputs.logits: - device: cpu + device: cuda:0 max: '2.728e+00' mean: '8.106e-02' min: '-2.536e+00' @@ -584,14 +584,14 @@ outputs.logits: - 10 sum: '1.038e+02' outputs.loss: - device: cpu + device: cuda:0 max: '2.593e+00' mean: '2.593e+00' min: '2.593e+00' shape: [] sum: '2.593e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_imagenet_image_classifier.yaml new file mode 100644 index 00000000..938d81f2 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet18_imagenet_image_classifier.yaml @@ -0,0 +1,600 @@ +batch.0: + device: cuda:0 + max: '2.640e+00' + mean: '-6.663e-02' + min: '-2.118e+00' + shape: + - 64 + - 3 + - 224 + - 224 + sum: '-6.419e+05' +batch.1: + device: cuda:0 + max: 988 + mean: '5.182e+02' + min: 0 + shape: + - 64 + sum: 33166 +grads.network.bn1.bias: + device: cuda:0 + max: '1.433e-02' + mean: '1.035e-03' + min: '-1.257e-02' + shape: + - 64 + sum: '6.621e-02' +grads.network.bn1.weight: + device: cuda:0 + max: '1.866e-02' + mean: '9.764e-05' + min: '-2.028e-02' + shape: + - 64 + sum: '6.249e-03' +grads.network.conv1.weight: + device: cuda:0 + max: '1.798e-01' + mean: '6.264e-03' + min: '-1.354e-01' + shape: + - 64 + - 3 + - 7 + - 7 + sum: '5.893e+01' +grads.network.fc.bias: + device: cuda:0 + max: '3.523e-03' + mean: '2.235e-11' + min: '-3.062e-02' + shape: + - 1000 + sum: '2.235e-08' +grads.network.fc.weight: + device: cuda:0 + max: '4.594e-03' + mean: '1.490e-11' + min: '-8.777e-02' + shape: + - 1000 + - 512 + sum: '7.629e-06' +grads.network.layer1.0.bn1.bias: + device: cuda:0 + max: '1.035e-02' + mean: '-8.887e-05' + min: '-1.081e-02' + shape: + - 64 + sum: 
'-5.688e-03' +grads.network.layer1.0.bn1.weight: + device: cuda:0 + max: '1.322e-02' + mean: '3.085e-09' + min: '-1.446e-02' + shape: + - 64 + sum: '1.974e-07' +grads.network.layer1.0.bn2.bias: + device: cuda:0 + max: '5.771e-03' + mean: '2.727e-04' + min: '-8.209e-03' + shape: + - 64 + sum: '1.745e-02' +grads.network.layer1.0.bn2.weight: + device: cuda:0 + max: '9.735e-03' + mean: '3.428e-05' + min: '-7.881e-03' + shape: + - 64 + sum: '2.194e-03' +grads.network.layer1.0.conv1.weight: + device: cuda:0 + max: '3.228e-02' + mean: '-2.187e-04' + min: '-3.009e-02' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-8.063e+00' +grads.network.layer1.0.conv2.weight: + device: cuda:0 + max: '2.011e-02' + mean: '-8.082e-05' + min: '-2.321e-02' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-2.979e+00' +grads.network.layer1.1.bn1.bias: + device: cuda:0 + max: '8.757e-03' + mean: '3.335e-04' + min: '-8.009e-03' + shape: + - 64 + sum: '2.134e-02' +grads.network.layer1.1.bn1.weight: + device: cuda:0 + max: '1.031e-02' + mean: '-1.251e-09' + min: '-8.325e-03' + shape: + - 64 + sum: '-8.009e-08' +grads.network.layer1.1.bn2.bias: + device: cuda:0 + max: '3.688e-03' + mean: '-1.159e-04' + min: '-3.878e-03' + shape: + - 64 + sum: '-7.419e-03' +grads.network.layer1.1.bn2.weight: + device: cuda:0 + max: '7.533e-03' + mean: '-1.319e-04' + min: '-1.042e-02' + shape: + - 64 + sum: '-8.443e-03' +grads.network.layer1.1.conv1.weight: + device: cuda:0 + max: '1.682e-02' + mean: '7.859e-05' + min: '-1.756e-02' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '2.897e+00' +grads.network.layer1.1.conv2.weight: + device: cuda:0 + max: '1.164e-02' + mean: '-8.183e-05' + min: '-1.057e-02' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-3.017e+00' +grads.network.layer2.0.bn1.bias: + device: cuda:0 + max: '6.346e-03' + mean: '3.467e-04' + min: '-5.223e-03' + shape: + - 128 + sum: '4.438e-02' +grads.network.layer2.0.bn1.weight: + device: cuda:0 + max: '4.709e-03' + mean: '8.731e-11' + min: '-5.212e-03' + shape: + - 128 + sum: '1.118e-08' +grads.network.layer2.0.bn2.bias: + device: cuda:0 + max: '4.109e-03' + mean: '1.036e-04' + min: '-5.165e-03' + shape: + - 128 + sum: '1.326e-02' +grads.network.layer2.0.bn2.weight: + device: cuda:0 + max: '7.476e-03' + mean: '-1.799e-05' + min: '-5.677e-03' + shape: + - 128 + sum: '-2.302e-03' +grads.network.layer2.0.conv1.weight: + device: cuda:0 + max: '1.684e-02' + mean: '-1.249e-04' + min: '-1.531e-02' + shape: + - 128 + - 64 + - 3 + - 3 + sum: '-9.211e+00' +grads.network.layer2.0.conv2.weight: + device: cuda:0 + max: '9.979e-03' + mean: '-4.225e-05' + min: '-9.486e-03' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-6.229e+00' +grads.network.layer2.0.downsample.0.weight: + device: cuda:0 + max: '1.095e-02' + mean: '-1.596e-04' + min: '-1.44e-02' + shape: + - 128 + - 64 + - 1 + - 1 + sum: '-1.307e+00' +grads.network.layer2.0.downsample.1.bias: + device: cuda:0 + max: '4.109e-03' + mean: '1.036e-04' + min: '-5.165e-03' + shape: + - 128 + sum: '1.326e-02' +grads.network.layer2.0.downsample.1.weight: + device: cuda:0 + max: '5.643e-03' + mean: '-9.116e-05' + min: '-5.724e-03' + shape: + - 128 + sum: '-1.167e-02' +grads.network.layer2.1.bn1.bias: + device: cuda:0 + max: '3.875e-03' + mean: '2.269e-04' + min: '-3.296e-03' + shape: + - 128 + sum: '2.904e-02' +grads.network.layer2.1.bn1.weight: + device: cuda:0 + max: '3.931e-03' + mean: '1.222e-09' + min: '-5.433e-03' + shape: + - 128 + sum: '1.565e-07' +grads.network.layer2.1.bn2.bias: + device: cuda:0 + max: '3.029e-03' + mean: '1.229e-04' + min: '-2.608e-03' + 
shape: + - 128 + sum: '1.574e-02' +grads.network.layer2.1.bn2.weight: + device: cuda:0 + max: '4.324e-03' + mean: '1.091e-04' + min: '-4.632e-03' + shape: + - 128 + sum: '1.397e-02' +grads.network.layer2.1.conv1.weight: + device: cuda:0 + max: '8.457e-03' + mean: '-2.224e-05' + min: '-8.334e-03' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-3.279e+00' +grads.network.layer2.1.conv2.weight: + device: cuda:0 + max: '6.936e-03' + mean: '-2.779e-05' + min: '-6.811e-03' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-4.098e+00' +grads.network.layer3.0.bn1.bias: + device: cuda:0 + max: '2.770e-03' + mean: '5.8e-05' + min: '-3.176e-03' + shape: + - 256 + sum: '1.485e-02' +grads.network.layer3.0.bn1.weight: + device: cuda:0 + max: '4.501e-03' + mean: '-1.965e-09' + min: '-3.247e-03' + shape: + - 256 + sum: '-5.029e-07' +grads.network.layer3.0.bn2.bias: + device: cuda:0 + max: '2.85e-03' + mean: '2.536e-05' + min: '-3.149e-03' + shape: + - 256 + sum: '6.493e-03' +grads.network.layer3.0.bn2.weight: + device: cuda:0 + max: '3.689e-03' + mean: '-1.113e-04' + min: '-3.318e-03' + shape: + - 256 + sum: '-2.850e-02' +grads.network.layer3.0.conv1.weight: + device: cuda:0 + max: '8.373e-03' + mean: '1.589e-06' + min: '-8.216e-03' + shape: + - 256 + - 128 + - 3 + - 3 + sum: '4.685e-01' +grads.network.layer3.0.conv2.weight: + device: cuda:0 + max: '7.279e-03' + mean: '3.597e-07' + min: '-6.876e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '2.122e-01' +grads.network.layer3.0.downsample.0.weight: + device: cuda:0 + max: '7.642e-03' + mean: '7.352e-06' + min: '-6.323e-03' + shape: + - 256 + - 128 + - 1 + - 1 + sum: '2.409e-01' +grads.network.layer3.0.downsample.1.bias: + device: cuda:0 + max: '2.85e-03' + mean: '2.536e-05' + min: '-3.149e-03' + shape: + - 256 + sum: '6.493e-03' +grads.network.layer3.0.downsample.1.weight: + device: cuda:0 + max: '3.721e-03' + mean: '1.250e-04' + min: '-3.504e-03' + shape: + - 256 + sum: '3.201e-02' +grads.network.layer3.1.bn1.bias: + device: cuda:0 + max: '2.634e-03' + mean: '3.564e-05' + min: '-2.17e-03' + shape: + - 256 + sum: '9.124e-03' +grads.network.layer3.1.bn1.weight: + device: cuda:0 + max: '2.518e-03' + mean: '1.983e-10' + min: '-2.539e-03' + shape: + - 256 + sum: '5.076e-08' +grads.network.layer3.1.bn2.bias: + device: cuda:0 + max: '2.024e-03' + mean: '6.733e-05' + min: '-1.777e-03' + shape: + - 256 + sum: '1.724e-02' +grads.network.layer3.1.bn2.weight: + device: cuda:0 + max: '2.737e-03' + mean: '-1.37e-05' + min: '-2.669e-03' + shape: + - 256 + sum: '-3.507e-03' +grads.network.layer3.1.conv1.weight: + device: cuda:0 + max: '5.457e-03' + mean: '-1.498e-06' + min: '-5.48e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-8.836e-01' +grads.network.layer3.1.conv2.weight: + device: cuda:0 + max: '4.436e-03' + mean: '7.578e-07' + min: '-4.453e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '4.469e-01' +grads.network.layer4.0.bn1.bias: + device: cuda:0 + max: '1.529e-03' + mean: '4.731e-05' + min: '-1.600e-03' + shape: + - 512 + sum: '2.422e-02' +grads.network.layer4.0.bn1.weight: + device: cuda:0 + max: '2.836e-03' + mean: '3.382e-09' + min: '-1.948e-03' + shape: + - 512 + sum: '1.731e-06' +grads.network.layer4.0.bn2.bias: + device: cuda:0 + max: '4.572e-03' + mean: '2.561e-04' + min: '-3.552e-03' + shape: + - 512 + sum: '1.311e-01' +grads.network.layer4.0.bn2.weight: + device: cuda:0 + max: '4.103e-03' + mean: '2.118e-04' + min: '-2.870e-03' + shape: + - 512 + sum: '1.084e-01' +grads.network.layer4.0.conv1.weight: + device: cuda:0 + max: '5.52e-03' + mean: '-1.319e-05' 
+ min: '-5.398e-03' + shape: + - 512 + - 256 + - 3 + - 3 + sum: '-1.556e+01' +grads.network.layer4.0.conv2.weight: + device: cuda:0 + max: '3.6e-03' + mean: '-4.087e-06' + min: '-4.384e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-9.643e+00' +grads.network.layer4.0.downsample.0.weight: + device: cuda:0 + max: '4.390e-03' + mean: '-2.207e-06' + min: '-5.205e-03' + shape: + - 512 + - 256 + - 1 + - 1 + sum: '-2.893e-01' +grads.network.layer4.0.downsample.1.bias: + device: cuda:0 + max: '4.572e-03' + mean: '2.561e-04' + min: '-3.552e-03' + shape: + - 512 + sum: '1.311e-01' +grads.network.layer4.0.downsample.1.weight: + device: cuda:0 + max: '3.626e-03' + mean: '1.351e-04' + min: '-3.259e-03' + shape: + - 512 + sum: '6.917e-02' +grads.network.layer4.1.bn1.bias: + device: cuda:0 + max: '1.327e-03' + mean: '1.918e-05' + min: '-1.29e-03' + shape: + - 512 + sum: '9.818e-03' +grads.network.layer4.1.bn1.weight: + device: cuda:0 + max: '2.764e-03' + mean: '3.335e-09' + min: '-2.679e-03' + shape: + - 512 + sum: '1.707e-06' +grads.network.layer4.1.bn2.bias: + device: cuda:0 + max: '7.656e-03' + mean: '4.169e-04' + min: '-5.189e-03' + shape: + - 512 + sum: '2.134e-01' +grads.network.layer4.1.bn2.weight: + device: cuda:0 + max: '3.609e-03' + mean: '2.029e-04' + min: '-3.125e-03' + shape: + - 512 + sum: '1.039e-01' +grads.network.layer4.1.conv1.weight: + device: cuda:0 + max: '4.400e-03' + mean: '-9.705e-06' + min: '-3.475e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-2.29e+01' +grads.network.layer4.1.conv2.weight: + device: cuda:0 + max: '3.91e-03' + mean: '1.074e-05' + min: '-2.999e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '2.535e+01' +outputs.logits: + device: cuda:0 + max: '2.934e+00' + mean: '-8.071e-04' + min: '-2.896e+00' + shape: + - 64 + - 1000 + sum: '-5.165e+01' +outputs.loss: + device: cuda:0 + max: '7.073e+00' + mean: '7.073e+00' + min: '7.073e+00' + shape: [] + sum: '7.073e+00' +outputs.y: + device: cuda:0 + max: 988 + mean: '5.182e+02' + min: 0 + shape: + - 64 + sum: 33166 diff --git a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_cifar10_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_cifar10_image_classifier.yaml index fb60cb5a..3fafcadf 100644 --- a/.regression_files/project/algorithms/example_test/test_backward_pass_is_reproducible/cpu/resnet50_cifar10_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.126e+00' mean: '-6.179e-03' min: '-1.989e+00' @@ -10,7 +10,7 @@ batch.0: - 32 sum: '-2.43e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.bn1.bias: - device: cpu + device: cuda:0 max: '9.205e-01' mean: '4.814e-02' min: '-1.080e+00' @@ -26,15 +26,15 @@ grads.network.bn1.bias: - 64 sum: '3.081e+00' grads.network.bn1.weight: - device: cpu + device: cuda:0 max: '1.441e+00' - mean: '3.663e-06' + mean: '3.662e-06' min: '-1.737e+00' shape: - 64 sum: '2.344e-04' grads.network.conv1.weight: - device: cpu + device: cuda:0 max: 
'1.895e+01' mean: '-8.353e-03' min: '-1.422e+01' @@ -45,24 +45,24 @@ grads.network.conv1.weight: - 7 sum: '-7.858e+01' grads.network.fc.bias: - device: cpu + device: cuda:0 max: '1.341e-01' - mean: '7.451e-10' + mean: '1.490e-09' min: '-6.681e-02' shape: - 10 - sum: '7.451e-09' + sum: '1.490e-08' grads.network.fc.weight: - device: cpu + device: cuda:0 max: '3.777e-01' - mean: '6.054e-10' + mean: '5.101e-10' min: '-2.029e-01' shape: - 10 - 2048 - sum: '1.24e-05' + sum: '1.045e-05' grads.network.layer1.0.bn1.bias: - device: cpu + device: cuda:0 max: '8.082e-01' mean: '1.893e-02' min: '-8.557e-01' @@ -70,15 +70,15 @@ grads.network.layer1.0.bn1.bias: - 64 sum: '1.211e+00' grads.network.layer1.0.bn1.weight: - device: cpu + device: cuda:0 max: '7.796e-01' - mean: '-1.29e-07' + mean: '-1.248e-07' min: '-9.923e-01' shape: - 64 - sum: '-8.255e-06' + sum: '-7.987e-06' grads.network.layer1.0.bn2.bias: - device: cpu + device: cuda:0 max: '6.138e-01' mean: '-3.147e-02' min: '-7.454e-01' @@ -86,15 +86,15 @@ grads.network.layer1.0.bn2.bias: - 64 sum: '-2.014e+00' grads.network.layer1.0.bn2.weight: - device: cpu + device: cuda:0 max: '8.566e-01' - mean: '-4.082e-06' + mean: '-4.075e-06' min: '-8.725e-01' shape: - 64 - sum: '-2.613e-04' + sum: '-2.608e-04' grads.network.layer1.0.bn3.bias: - device: cpu + device: cuda:0 max: '4.064e-01' mean: '-1.042e-04' min: '-4.231e-01' @@ -102,7 +102,7 @@ grads.network.layer1.0.bn3.bias: - 256 sum: '-2.667e-02' grads.network.layer1.0.bn3.weight: - device: cpu + device: cuda:0 max: '5.445e-01' mean: '-1.607e-02' min: '-5.301e-01' @@ -110,7 +110,7 @@ grads.network.layer1.0.bn3.weight: - 256 sum: '-4.115e+00' grads.network.layer1.0.conv1.weight: - device: cpu + device: cuda:0 max: '1.995e+00' mean: '5.037e-03' min: '-2.531e+00' @@ -121,7 +121,7 @@ grads.network.layer1.0.conv1.weight: - 1 sum: '2.063e+01' grads.network.layer1.0.conv2.weight: - device: cpu + device: cuda:0 max: '1.94e+00' mean: '9.205e-03' min: '-1.562e+00' @@ -132,7 +132,7 @@ grads.network.layer1.0.conv2.weight: - 3 sum: '3.393e+02' grads.network.layer1.0.conv3.weight: - device: cpu + device: cuda:0 max: '1.516e+00' mean: '1.730e-03' min: '-1.296e+00' @@ -143,7 +143,7 @@ grads.network.layer1.0.conv3.weight: - 1 sum: '2.835e+01' grads.network.layer1.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '1.394e+00' mean: '6.997e-03' min: '-1.394e+00' @@ -154,7 +154,7 @@ grads.network.layer1.0.downsample.0.weight: - 1 sum: '1.146e+02' grads.network.layer1.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '4.064e-01' mean: '-1.042e-04' min: '-4.231e-01' @@ -162,7 +162,7 @@ grads.network.layer1.0.downsample.1.bias: - 256 sum: '-2.667e-02' grads.network.layer1.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '7.517e-01' mean: '1.179e-02' min: '-4.804e-01' @@ -170,7 +170,7 @@ grads.network.layer1.0.downsample.1.weight: - 256 sum: '3.017e+00' grads.network.layer1.1.bn1.bias: - device: cpu + device: cuda:0 max: '5.352e-01' mean: '-5.139e-03' min: '-6.301e-01' @@ -178,15 +178,15 @@ grads.network.layer1.1.bn1.bias: - 64 sum: '-3.289e-01' grads.network.layer1.1.bn1.weight: - device: cpu + device: cuda:0 max: '7.305e-01' - mean: '-1.327e-07' + mean: '-1.322e-07' min: '-6.086e-01' shape: - 64 - sum: '-8.494e-06' + sum: '-8.464e-06' grads.network.layer1.1.bn2.bias: - device: cpu + device: cuda:0 max: '6.326e-01' mean: '-2.056e-03' min: '-4.814e-01' @@ -194,15 +194,15 @@ grads.network.layer1.1.bn2.bias: - 64 sum: '-1.316e-01' grads.network.layer1.1.bn2.weight: - device: cpu + device: cuda:0 max: 
'7.657e-01' - mean: '2.468e-08' + mean: '2.328e-08' min: '-5.989e-01' shape: - 64 - sum: '1.58e-06' + sum: '1.490e-06' grads.network.layer1.1.bn3.bias: - device: cpu + device: cuda:0 max: '2.399e-01' mean: '5.205e-03' min: '-1.858e-01' @@ -210,7 +210,7 @@ grads.network.layer1.1.bn3.bias: - 256 sum: '1.333e+00' grads.network.layer1.1.bn3.weight: - device: cpu + device: cuda:0 max: '3.889e-01' mean: '2.229e-03' min: '-3.122e-01' @@ -218,7 +218,7 @@ grads.network.layer1.1.bn3.weight: - 256 sum: '5.706e-01' grads.network.layer1.1.conv1.weight: - device: cpu + device: cuda:0 max: '6.541e-01' mean: '6.722e-04' min: '-6.24e-01' @@ -229,7 +229,7 @@ grads.network.layer1.1.conv1.weight: - 1 sum: '1.101e+01' grads.network.layer1.1.conv2.weight: - device: cpu + device: cuda:0 max: '1.279e+00' mean: '6.102e-03' min: '-1.024e+00' @@ -240,7 +240,7 @@ grads.network.layer1.1.conv2.weight: - 3 sum: '2.249e+02' grads.network.layer1.1.conv3.weight: - device: cpu + device: cuda:0 max: '9.491e-01' mean: '2.511e-03' min: '-9.537e-01' @@ -251,7 +251,7 @@ grads.network.layer1.1.conv3.weight: - 1 sum: '4.114e+01' grads.network.layer1.2.bn1.bias: - device: cpu + device: cuda:0 max: '4.21e-01' mean: '-1.548e-02' min: '-4.326e-01' @@ -259,7 +259,7 @@ grads.network.layer1.2.bn1.bias: - 64 sum: '-9.907e-01' grads.network.layer1.2.bn1.weight: - device: cpu + device: cuda:0 max: '5.188e-01' mean: '1.397e-08' min: '-3.354e-01' @@ -267,7 +267,7 @@ grads.network.layer1.2.bn1.weight: - 64 sum: '8.941e-07' grads.network.layer1.2.bn2.bias: - device: cpu + device: cuda:0 max: '4.175e-01' mean: '-7.536e-03' min: '-3.544e-01' @@ -275,15 +275,15 @@ grads.network.layer1.2.bn2.bias: - 64 sum: '-4.823e-01' grads.network.layer1.2.bn2.weight: - device: cpu + device: cuda:0 max: '2.97e-01' - mean: '5.030e-07' + mean: '5.048e-07' min: '-3.822e-01' shape: - 64 - sum: '3.219e-05' + sum: '3.231e-05' grads.network.layer1.2.bn3.bias: - device: cpu + device: cuda:0 max: '1.238e-01' mean: '2.877e-03' min: '-1.060e-01' @@ -291,7 +291,7 @@ grads.network.layer1.2.bn3.bias: - 256 sum: '7.366e-01' grads.network.layer1.2.bn3.weight: - device: cpu + device: cuda:0 max: '2.316e-01' mean: '2.059e-03' min: '-2.506e-01' @@ -299,7 +299,7 @@ grads.network.layer1.2.bn3.weight: - 256 sum: '5.272e-01' grads.network.layer1.2.conv1.weight: - device: cpu + device: cuda:0 max: '3.633e-01' mean: '3.658e-03' min: '-4.331e-01' @@ -310,7 +310,7 @@ grads.network.layer1.2.conv1.weight: - 1 sum: '5.993e+01' grads.network.layer1.2.conv2.weight: - device: cpu + device: cuda:0 max: '6.992e-01' mean: '2.97e-03' min: '-7.175e-01' @@ -321,7 +321,7 @@ grads.network.layer1.2.conv2.weight: - 3 sum: '1.095e+02' grads.network.layer1.2.conv3.weight: - device: cpu + device: cuda:0 max: '5.388e-01' mean: '-1.901e-04' min: '-6.321e-01' @@ -332,7 +332,7 @@ grads.network.layer1.2.conv3.weight: - 1 sum: '-3.115e+00' grads.network.layer2.0.bn1.bias: - device: cpu + device: cuda:0 max: '2.419e-01' mean: '-5.441e-03' min: '-2.731e-01' @@ -340,15 +340,15 @@ grads.network.layer2.0.bn1.bias: - 128 sum: '-6.964e-01' grads.network.layer2.0.bn1.weight: - device: cpu + device: cuda:0 max: '3.249e-01' - mean: '2.375e-08' + mean: '2.258e-08' min: '-2.792e-01' shape: - 128 - sum: '3.04e-06' + sum: '2.891e-06' grads.network.layer2.0.bn2.bias: - device: cpu + device: cuda:0 max: '1.974e-01' mean: '-7.017e-03' min: '-2.037e-01' @@ -356,15 +356,15 @@ grads.network.layer2.0.bn2.bias: - 128 sum: '-8.981e-01' grads.network.layer2.0.bn2.weight: - device: cpu + device: cuda:0 max: '3.613e-01' - mean: 
'6.624e-08' + mean: '6.775e-08' min: '-2.713e-01' shape: - 128 - sum: '8.479e-06' + sum: '8.672e-06' grads.network.layer2.0.bn3.bias: - device: cpu + device: cuda:0 max: '1.091e-01' mean: '6.263e-04' min: '-1.059e-01' @@ -372,7 +372,7 @@ grads.network.layer2.0.bn3.bias: - 512 sum: '3.207e-01' grads.network.layer2.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.658e-01' mean: '-1.899e-04' min: '-1.353e-01' @@ -380,7 +380,7 @@ grads.network.layer2.0.bn3.weight: - 512 sum: '-9.725e-02' grads.network.layer2.0.conv1.weight: - device: cpu + device: cuda:0 max: '3.953e-01' mean: '1.031e-03' min: '-3.708e-01' @@ -391,7 +391,7 @@ grads.network.layer2.0.conv1.weight: - 1 sum: '3.38e+01' grads.network.layer2.0.conv2.weight: - device: cpu + device: cuda:0 max: '4.388e-01' mean: '1.736e-03' min: '-4.009e-01' @@ -402,7 +402,7 @@ grads.network.layer2.0.conv2.weight: - 3 sum: '2.560e+02' grads.network.layer2.0.conv3.weight: - device: cpu + device: cuda:0 max: '3.455e-01' mean: '8.466e-04' min: '-3.519e-01' @@ -413,7 +413,7 @@ grads.network.layer2.0.conv3.weight: - 1 sum: '5.548e+01' grads.network.layer2.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '2.479e-01' mean: '3.199e-04' min: '-2.569e-01' @@ -424,7 +424,7 @@ grads.network.layer2.0.downsample.0.weight: - 1 sum: '4.193e+01' grads.network.layer2.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '1.091e-01' mean: '6.263e-04' min: '-1.059e-01' @@ -432,7 +432,7 @@ grads.network.layer2.0.downsample.1.bias: - 512 sum: '3.207e-01' grads.network.layer2.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.697e-01' mean: '1.416e-03' min: '-1.327e-01' @@ -440,7 +440,7 @@ grads.network.layer2.0.downsample.1.weight: - 512 sum: '7.250e-01' grads.network.layer2.1.bn1.bias: - device: cpu + device: cuda:0 max: '1.482e-01' mean: '-1.673e-03' min: '-1.761e-01' @@ -448,15 +448,15 @@ grads.network.layer2.1.bn1.bias: - 128 sum: '-2.141e-01' grads.network.layer2.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.848e-01' - mean: '-3.946e-08' + mean: '-3.888e-08' min: '-2.179e-01' shape: - 128 - sum: '-5.051e-06' + sum: '-4.977e-06' grads.network.layer2.1.bn2.bias: - device: cpu + device: cuda:0 max: '1.764e-01' mean: '5.389e-03' min: '-1.466e-01' @@ -464,15 +464,15 @@ grads.network.layer2.1.bn2.bias: - 128 sum: '6.898e-01' grads.network.layer2.1.bn2.weight: - device: cpu + device: cuda:0 max: '2.348e-01' - mean: '-1.397e-07' + mean: '-1.404e-07' min: '-2.435e-01' shape: - 128 - sum: '-1.788e-05' + sum: '-1.797e-05' grads.network.layer2.1.bn3.bias: - device: cpu + device: cuda:0 max: '8.049e-02' mean: '-1.62e-04' min: '-6.643e-02' @@ -480,7 +480,7 @@ grads.network.layer2.1.bn3.bias: - 512 sum: '-8.292e-02' grads.network.layer2.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.130e-01' mean: '1.227e-04' min: '-9.870e-02' @@ -488,7 +488,7 @@ grads.network.layer2.1.bn3.weight: - 512 sum: '6.285e-02' grads.network.layer2.1.conv1.weight: - device: cpu + device: cuda:0 max: '2.100e-01' mean: '-3.326e-04' min: '-1.831e-01' @@ -499,7 +499,7 @@ grads.network.layer2.1.conv1.weight: - 1 sum: '-2.18e+01' grads.network.layer2.1.conv2.weight: - device: cpu + device: cuda:0 max: '3.447e-01' mean: '-9.641e-04' min: '-3.505e-01' @@ -510,7 +510,7 @@ grads.network.layer2.1.conv2.weight: - 3 sum: '-1.422e+02' grads.network.layer2.1.conv3.weight: - device: cpu + device: cuda:0 max: '2.356e-01' mean: '-1.869e-04' min: '-2.254e-01' @@ -521,7 +521,7 @@ grads.network.layer2.1.conv3.weight: - 1 sum: '-1.225e+01' grads.network.layer2.2.bn1.bias: - device: cpu + 
device: cuda:0 max: '1.512e-01' mean: '-1.99e-03' min: '-1.240e-01' @@ -529,15 +529,15 @@ grads.network.layer2.2.bn1.bias: - 128 sum: '-2.547e-01' grads.network.layer2.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.999e-01' - mean: '2.258e-08' + mean: '2.270e-08' min: '-1.396e-01' shape: - 128 - sum: '2.891e-06' + sum: '2.906e-06' grads.network.layer2.2.bn2.bias: - device: cpu + device: cuda:0 max: '1.029e-01' mean: '-3.850e-04' min: '-1.010e-01' @@ -545,15 +545,15 @@ grads.network.layer2.2.bn2.bias: - 128 sum: '-4.928e-02' grads.network.layer2.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.463e-01' - mean: '-1.159e-07' + mean: '-1.162e-07' min: '-1.46e-01' shape: - 128 - sum: '-1.484e-05' + sum: '-1.487e-05' grads.network.layer2.2.bn3.bias: - device: cpu + device: cuda:0 max: '4.505e-02' mean: '-9.093e-05' min: '-3.943e-02' @@ -561,7 +561,7 @@ grads.network.layer2.2.bn3.bias: - 512 sum: '-4.656e-02' grads.network.layer2.2.bn3.weight: - device: cpu + device: cuda:0 max: '8.137e-02' mean: '-4.692e-04' min: '-6.764e-02' @@ -569,7 +569,7 @@ grads.network.layer2.2.bn3.weight: - 512 sum: '-2.402e-01' grads.network.layer2.2.conv1.weight: - device: cpu + device: cuda:0 max: '1.230e-01' mean: '2.737e-04' min: '-1.255e-01' @@ -580,7 +580,7 @@ grads.network.layer2.2.conv1.weight: - 1 sum: '1.794e+01' grads.network.layer2.2.conv2.weight: - device: cpu + device: cuda:0 max: '2.359e-01' mean: '4.964e-04' min: '-2.379e-01' @@ -591,7 +591,7 @@ grads.network.layer2.2.conv2.weight: - 3 sum: '7.32e+01' grads.network.layer2.2.conv3.weight: - device: cpu + device: cuda:0 max: '1.738e-01' mean: '4.385e-04' min: '-1.777e-01' @@ -602,7 +602,7 @@ grads.network.layer2.2.conv3.weight: - 1 sum: '2.874e+01' grads.network.layer2.3.bn1.bias: - device: cpu + device: cuda:0 max: '1.279e-01' mean: '6.022e-03' min: '-8.782e-02' @@ -610,15 +610,15 @@ grads.network.layer2.3.bn1.bias: - 128 sum: '7.708e-01' grads.network.layer2.3.bn1.weight: - device: cpu + device: cuda:0 max: '1.222e-01' - mean: '1.257e-08' + mean: '1.199e-08' min: '-1.526e-01' shape: - 128 - sum: '1.609e-06' + sum: '1.535e-06' grads.network.layer2.3.bn2.bias: - device: cpu + device: cuda:0 max: '9.101e-02' mean: '-1.522e-03' min: '-7.893e-02' @@ -626,15 +626,15 @@ grads.network.layer2.3.bn2.bias: - 128 sum: '-1.948e-01' grads.network.layer2.3.bn2.weight: - device: cpu + device: cuda:0 max: '8.481e-02' - mean: '-1.930e-07' + mean: '-1.932e-07' min: '-8.458e-02' shape: - 128 - sum: '-2.471e-05' + sum: '-2.474e-05' grads.network.layer2.3.bn3.bias: - device: cpu + device: cuda:0 max: '2.302e-02' mean: '1.906e-05' min: '-3.022e-02' @@ -642,7 +642,7 @@ grads.network.layer2.3.bn3.bias: - 512 sum: '9.761e-03' grads.network.layer2.3.bn3.weight: - device: cpu + device: cuda:0 max: '4.318e-02' mean: '-8.797e-04' min: '-4.599e-02' @@ -650,7 +650,7 @@ grads.network.layer2.3.bn3.weight: - 512 sum: '-4.504e-01' grads.network.layer2.3.conv1.weight: - device: cpu + device: cuda:0 max: '8.230e-02' mean: '-3.507e-04' min: '-9.358e-02' @@ -661,7 +661,7 @@ grads.network.layer2.3.conv1.weight: - 1 sum: '-2.298e+01' grads.network.layer2.3.conv2.weight: - device: cpu + device: cuda:0 max: '1.666e-01' mean: '8.926e-04' min: '-1.69e-01' @@ -672,7 +672,7 @@ grads.network.layer2.3.conv2.weight: - 3 sum: '1.316e+02' grads.network.layer2.3.conv3.weight: - device: cpu + device: cuda:0 max: '1.444e-01' mean: '1.829e-04' min: '-1.152e-01' @@ -683,7 +683,7 @@ grads.network.layer2.3.conv3.weight: - 1 sum: '1.199e+01' grads.network.layer3.0.bn1.bias: - device: cpu + device: cuda:0 
max: '6.992e-02' mean: '1.721e-03' min: '-8.225e-02' @@ -691,15 +691,15 @@ grads.network.layer3.0.bn1.bias: - 256 sum: '4.405e-01' grads.network.layer3.0.bn1.weight: - device: cpu + device: cuda:0 max: '8.985e-02' - mean: '-2.648e-09' + mean: '-2.561e-09' min: '-1.042e-01' shape: - 256 - sum: '-6.780e-07' + sum: '-6.557e-07' grads.network.layer3.0.bn2.bias: - device: cpu + device: cuda:0 max: '6.940e-02' mean: '5.335e-04' min: '-5.311e-02' @@ -707,15 +707,15 @@ grads.network.layer3.0.bn2.bias: - 256 sum: '1.366e-01' grads.network.layer3.0.bn2.weight: - device: cpu + device: cuda:0 max: '5.623e-02' - mean: '-2.305e-08' + mean: '-2.282e-08' min: '-7.762e-02' shape: - 256 - sum: '-5.901e-06' + sum: '-5.841e-06' grads.network.layer3.0.bn3.bias: - device: cpu + device: cuda:0 max: '3.228e-02' mean: '-1.181e-04' min: '-2.608e-02' @@ -723,7 +723,7 @@ grads.network.layer3.0.bn3.bias: - 1024 sum: '-1.209e-01' grads.network.layer3.0.bn3.weight: - device: cpu + device: cuda:0 max: '3.652e-02' mean: '-7.228e-05' min: '-4.893e-02' @@ -731,7 +731,7 @@ grads.network.layer3.0.bn3.weight: - 1024 sum: '-7.401e-02' grads.network.layer3.0.conv1.weight: - device: cpu + device: cuda:0 max: '9.913e-02' mean: '-3.902e-04' min: '-9.101e-02' @@ -742,7 +742,7 @@ grads.network.layer3.0.conv1.weight: - 1 sum: '-5.114e+01' grads.network.layer3.0.conv2.weight: - device: cpu + device: cuda:0 max: '1.257e-01' mean: '-8.546e-05' min: '-1.265e-01' @@ -753,7 +753,7 @@ grads.network.layer3.0.conv2.weight: - 3 sum: '-5.040e+01' grads.network.layer3.0.conv3.weight: - device: cpu + device: cuda:0 max: '9.508e-02' mean: '4.733e-05' min: '-1.04e-01' @@ -764,7 +764,7 @@ grads.network.layer3.0.conv3.weight: - 1 sum: '1.241e+01' grads.network.layer3.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '7.85e-02' mean: '-3.186e-05' min: '-9.409e-02' @@ -775,7 +775,7 @@ grads.network.layer3.0.downsample.0.weight: - 1 sum: '-1.671e+01' grads.network.layer3.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '3.228e-02' mean: '-1.181e-04' min: '-2.608e-02' @@ -783,7 +783,7 @@ grads.network.layer3.0.downsample.1.bias: - 1024 sum: '-1.209e-01' grads.network.layer3.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '3.657e-02' mean: '-7.938e-05' min: '-3.968e-02' @@ -791,7 +791,7 @@ grads.network.layer3.0.downsample.1.weight: - 1024 sum: '-8.128e-02' grads.network.layer3.1.bn1.bias: - device: cpu + device: cuda:0 max: '5.199e-02' mean: '-3.091e-04' min: '-6.523e-02' @@ -799,15 +799,15 @@ grads.network.layer3.1.bn1.bias: - 256 sum: '-7.912e-02' grads.network.layer3.1.bn1.weight: - device: cpu + device: cuda:0 max: '7.237e-02' - mean: '1.156e-08' + mean: '1.141e-08' min: '-5.789e-02' shape: - 256 - sum: '2.959e-06' + sum: '2.921e-06' grads.network.layer3.1.bn2.bias: - device: cpu + device: cuda:0 max: '4.225e-02' mean: '7.41e-04' min: '-4.171e-02' @@ -815,15 +815,15 @@ grads.network.layer3.1.bn2.bias: - 256 sum: '1.897e-01' grads.network.layer3.1.bn2.weight: - device: cpu + device: cuda:0 max: '3.798e-02' - mean: '3.897e-08' + mean: '3.9e-08' min: '-5.021e-02' shape: - 256 - sum: '9.976e-06' + sum: '9.984e-06' grads.network.layer3.1.bn3.bias: - device: cpu + device: cuda:0 max: '1.976e-02' mean: '-1.692e-04' min: '-2.215e-02' @@ -831,7 +831,7 @@ grads.network.layer3.1.bn3.bias: - 1024 sum: '-1.733e-01' grads.network.layer3.1.bn3.weight: - device: cpu + device: cuda:0 max: '2.348e-02' mean: '1.549e-04' min: '-2.379e-02' @@ -839,7 +839,7 @@ grads.network.layer3.1.bn3.weight: - 1024 sum: '1.587e-01' 
grads.network.layer3.1.conv1.weight: - device: cpu + device: cuda:0 max: '4.929e-02' mean: '4.316e-05' min: '-4.696e-02' @@ -850,7 +850,7 @@ grads.network.layer3.1.conv1.weight: - 1 sum: '1.131e+01' grads.network.layer3.1.conv2.weight: - device: cpu + device: cuda:0 max: '1.156e-01' mean: '-8.390e-05' min: '-1.048e-01' @@ -861,7 +861,7 @@ grads.network.layer3.1.conv2.weight: - 3 sum: '-4.949e+01' grads.network.layer3.1.conv3.weight: - device: cpu + device: cuda:0 max: '6.757e-02' mean: '3.39e-05' min: '-6.879e-02' @@ -872,7 +872,7 @@ grads.network.layer3.1.conv3.weight: - 1 sum: '8.886e+00' grads.network.layer3.2.bn1.bias: - device: cpu + device: cuda:0 max: '3.715e-02' mean: '-3.498e-04' min: '-4.113e-02' @@ -880,15 +880,15 @@ grads.network.layer3.2.bn1.bias: - 256 sum: '-8.956e-02' grads.network.layer3.2.bn1.weight: - device: cpu + device: cuda:0 max: '4.569e-02' - mean: '2.794e-09' + mean: '2.867e-09' min: '-4.962e-02' shape: - 256 - sum: '7.153e-07' + sum: '7.339e-07' grads.network.layer3.2.bn2.bias: - device: cpu + device: cuda:0 max: '3.029e-02' mean: '-4.436e-04' min: '-2.692e-02' @@ -896,15 +896,15 @@ grads.network.layer3.2.bn2.bias: - 256 sum: '-1.135e-01' grads.network.layer3.2.bn2.weight: - device: cpu + device: cuda:0 max: '3.397e-02' - mean: '-1.458e-08' + mean: '-1.461e-08' min: '-3.55e-02' shape: - 256 - sum: '-3.733e-06' + sum: '-3.740e-06' grads.network.layer3.2.bn3.bias: - device: cpu + device: cuda:0 max: '1.074e-02' mean: '-9.653e-05' min: '-1.428e-02' @@ -912,7 +912,7 @@ grads.network.layer3.2.bn3.bias: - 1024 sum: '-9.884e-02' grads.network.layer3.2.bn3.weight: - device: cpu + device: cuda:0 max: '2.000e-02' mean: '-7.752e-05' min: '-1.676e-02' @@ -920,7 +920,7 @@ grads.network.layer3.2.bn3.weight: - 1024 sum: '-7.938e-02' grads.network.layer3.2.conv1.weight: - device: cpu + device: cuda:0 max: '3.134e-02' mean: '6.29e-05' min: '-3.177e-02' @@ -931,7 +931,7 @@ grads.network.layer3.2.conv1.weight: - 1 sum: '1.649e+01' grads.network.layer3.2.conv2.weight: - device: cpu + device: cuda:0 max: '7.868e-02' mean: '7.155e-06' min: '-7.522e-02' @@ -942,7 +942,7 @@ grads.network.layer3.2.conv2.weight: - 3 sum: '4.220e+00' grads.network.layer3.2.conv3.weight: - device: cpu + device: cuda:0 max: '4.457e-02' mean: '-6.326e-05' min: '-4.720e-02' @@ -953,7 +953,7 @@ grads.network.layer3.2.conv3.weight: - 1 sum: '-1.658e+01' grads.network.layer3.3.bn1.bias: - device: cpu + device: cuda:0 max: '4.017e-02' mean: '6.214e-05' min: '-2.511e-02' @@ -961,15 +961,15 @@ grads.network.layer3.3.bn1.bias: - 256 sum: '1.591e-02' grads.network.layer3.3.bn1.weight: - device: cpu + device: cuda:0 max: '3.217e-02' - mean: '-1.31e-10' + mean: '-2.183e-10' min: '-3.779e-02' shape: - 256 - sum: '-3.353e-08' + sum: '-5.588e-08' grads.network.layer3.3.bn2.bias: - device: cpu + device: cuda:0 max: '2.313e-02' mean: '-2.275e-06' min: '-2.476e-02' @@ -977,15 +977,15 @@ grads.network.layer3.3.bn2.bias: - 256 sum: '-5.825e-04' grads.network.layer3.3.bn2.weight: - device: cpu + device: cuda:0 max: '2.436e-02' - mean: '-1.283e-08' + mean: '-1.279e-08' min: '-2.400e-02' shape: - 256 - sum: '-3.286e-06' + sum: '-3.275e-06' grads.network.layer3.3.bn3.bias: - device: cpu + device: cuda:0 max: '9.701e-03' mean: '-4.152e-05' min: '-8.985e-03' @@ -993,7 +993,7 @@ grads.network.layer3.3.bn3.bias: - 1024 sum: '-4.251e-02' grads.network.layer3.3.bn3.weight: - device: cpu + device: cuda:0 max: '1.274e-02' mean: '-5.492e-05' min: '-1.673e-02' @@ -1001,7 +1001,7 @@ grads.network.layer3.3.bn3.weight: - 1024 sum: '-5.623e-02' 
grads.network.layer3.3.conv1.weight: - device: cpu + device: cuda:0 max: '2.719e-02' mean: '-4.864e-05' min: '-2.668e-02' @@ -1012,7 +1012,7 @@ grads.network.layer3.3.conv1.weight: - 1 sum: '-1.275e+01' grads.network.layer3.3.conv2.weight: - device: cpu + device: cuda:0 max: '6.36e-02' mean: '7.046e-05' min: '-5.796e-02' @@ -1023,7 +1023,7 @@ grads.network.layer3.3.conv2.weight: - 3 sum: '4.156e+01' grads.network.layer3.3.conv3.weight: - device: cpu + device: cuda:0 max: '4.141e-02' mean: '1.489e-05' min: '-3.670e-02' @@ -1034,7 +1034,7 @@ grads.network.layer3.3.conv3.weight: - 1 sum: '3.903e+00' grads.network.layer3.4.bn1.bias: - device: cpu + device: cuda:0 max: '2.147e-02' mean: '3.403e-05' min: '-2.25e-02' @@ -1042,7 +1042,7 @@ grads.network.layer3.4.bn1.bias: - 256 sum: '8.711e-03' grads.network.layer3.4.bn1.weight: - device: cpu + device: cuda:0 max: '3.626e-02' mean: '-1.892e-09' min: '-2.356e-02' @@ -1050,7 +1050,7 @@ grads.network.layer3.4.bn1.weight: - 256 sum: '-4.843e-07' grads.network.layer3.4.bn2.bias: - device: cpu + device: cuda:0 max: '1.518e-02' mean: '3.233e-04' min: '-1.562e-02' @@ -1058,7 +1058,7 @@ grads.network.layer3.4.bn2.bias: - 256 sum: '8.277e-02' grads.network.layer3.4.bn2.weight: - device: cpu + device: cuda:0 max: '2.106e-02' mean: '4.386e-08' min: '-2.206e-02' @@ -1066,7 +1066,7 @@ grads.network.layer3.4.bn2.weight: - 256 sum: '1.123e-05' grads.network.layer3.4.bn3.bias: - device: cpu + device: cuda:0 max: '6.997e-03' mean: '-6.533e-05' min: '-7.944e-03' @@ -1074,7 +1074,7 @@ grads.network.layer3.4.bn3.bias: - 1024 sum: '-6.689e-02' grads.network.layer3.4.bn3.weight: - device: cpu + device: cuda:0 max: '1.064e-02' mean: '1.463e-04' min: '-9.902e-03' @@ -1082,7 +1082,7 @@ grads.network.layer3.4.bn3.weight: - 1024 sum: '1.498e-01' grads.network.layer3.4.conv1.weight: - device: cpu + device: cuda:0 max: '1.904e-02' mean: '-2.754e-05' min: '-1.891e-02' @@ -1093,7 +1093,7 @@ grads.network.layer3.4.conv1.weight: - 1 sum: '-7.22e+00' grads.network.layer3.4.conv2.weight: - device: cpu + device: cuda:0 max: '4.254e-02' mean: '-2.627e-05' min: '-5.017e-02' @@ -1104,7 +1104,7 @@ grads.network.layer3.4.conv2.weight: - 3 sum: '-1.549e+01' grads.network.layer3.4.conv3.weight: - device: cpu + device: cuda:0 max: '2.563e-02' mean: '-3.938e-06' min: '-2.833e-02' @@ -1115,7 +1115,7 @@ grads.network.layer3.4.conv3.weight: - 1 sum: '-1.032e+00' grads.network.layer3.5.bn1.bias: - device: cpu + device: cuda:0 max: '1.901e-02' mean: '2.356e-04' min: '-1.961e-02' @@ -1123,7 +1123,7 @@ grads.network.layer3.5.bn1.bias: - 256 sum: '6.031e-02' grads.network.layer3.5.bn1.weight: - device: cpu + device: cuda:0 max: '2.546e-02' mean: '-9.313e-10' min: '-2.608e-02' @@ -1131,7 +1131,7 @@ grads.network.layer3.5.bn1.weight: - 256 sum: '-2.384e-07' grads.network.layer3.5.bn2.bias: - device: cpu + device: cuda:0 max: '1.274e-02' mean: '-1.438e-04' min: '-1.364e-02' @@ -1139,15 +1139,15 @@ grads.network.layer3.5.bn2.bias: - 256 sum: '-3.680e-02' grads.network.layer3.5.bn2.weight: - device: cpu + device: cuda:0 max: '1.536e-02' - mean: '-3.049e-09' + mean: '-3.012e-09' min: '-2.043e-02' shape: - 256 - sum: '-7.804e-07' + sum: '-7.711e-07' grads.network.layer3.5.bn3.bias: - device: cpu + device: cuda:0 max: '4.202e-03' mean: '-2.573e-05' min: '-4.034e-03' @@ -1155,7 +1155,7 @@ grads.network.layer3.5.bn3.bias: - 1024 sum: '-2.634e-02' grads.network.layer3.5.bn3.weight: - device: cpu + device: cuda:0 max: '9.836e-03' mean: '-1.711e-05' min: '-8.328e-03' @@ -1163,7 +1163,7 @@ 
grads.network.layer3.5.bn3.weight: - 1024 sum: '-1.752e-02' grads.network.layer3.5.conv1.weight: - device: cpu + device: cuda:0 max: '1.525e-02' mean: '-3.503e-05' min: '-1.432e-02' @@ -1174,7 +1174,7 @@ grads.network.layer3.5.conv1.weight: - 1 sum: '-9.184e+00' grads.network.layer3.5.conv2.weight: - device: cpu + device: cuda:0 max: '4.67e-02' mean: '-7.542e-05' min: '-3.959e-02' @@ -1185,7 +1185,7 @@ grads.network.layer3.5.conv2.weight: - 3 sum: '-4.448e+01' grads.network.layer3.5.conv3.weight: - device: cpu + device: cuda:0 max: '2.486e-02' mean: '-4.622e-05' min: '-2.199e-02' @@ -1196,7 +1196,7 @@ grads.network.layer3.5.conv3.weight: - 1 sum: '-1.212e+01' grads.network.layer4.0.bn1.bias: - device: cpu + device: cuda:0 max: '1.216e-02' mean: '1.105e-04' min: '-1.527e-02' @@ -1204,15 +1204,15 @@ grads.network.layer4.0.bn1.bias: - 512 sum: '5.66e-02' grads.network.layer4.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.341e-02' - mean: '2.485e-09' + mean: '2.454e-09' min: '-1.568e-02' shape: - 512 - sum: '1.272e-06' + sum: '1.256e-06' grads.network.layer4.0.bn2.bias: - device: cpu + device: cuda:0 max: '1.081e-02' mean: '-9.498e-06' min: '-1.008e-02' @@ -1220,15 +1220,15 @@ grads.network.layer4.0.bn2.bias: - 512 sum: '-4.863e-03' grads.network.layer4.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.896e-02' - mean: '3.363e-08' + mean: '3.362e-08' min: '-1.575e-02' shape: - 512 - sum: '1.722e-05' + sum: '1.721e-05' grads.network.layer4.0.bn3.bias: - device: cpu + device: cuda:0 max: '6.932e-03' mean: '1.369e-04' min: '-6.060e-03' @@ -1236,7 +1236,7 @@ grads.network.layer4.0.bn3.bias: - 2048 sum: '2.805e-01' grads.network.layer4.0.bn3.weight: - device: cpu + device: cuda:0 max: '8.164e-03' mean: '1.423e-04' min: '-7.306e-03' @@ -1244,7 +1244,7 @@ grads.network.layer4.0.bn3.weight: - 2048 sum: '2.915e-01' grads.network.layer4.0.conv1.weight: - device: cpu + device: cuda:0 max: '1.748e-02' mean: '-2.425e-05' min: '-1.699e-02' @@ -1255,7 +1255,7 @@ grads.network.layer4.0.conv1.weight: - 1 sum: '-1.271e+01' grads.network.layer4.0.conv2.weight: - device: cpu + device: cuda:0 max: '4.355e-02' mean: '-2.123e-06' min: '-4.091e-02' @@ -1266,7 +1266,7 @@ grads.network.layer4.0.conv2.weight: - 3 sum: '-5.008e+00' grads.network.layer4.0.conv3.weight: - device: cpu + device: cuda:0 max: '1.988e-02' mean: '2.471e-05' min: '-2.667e-02' @@ -1277,7 +1277,7 @@ grads.network.layer4.0.conv3.weight: - 1 sum: '2.591e+01' grads.network.layer4.0.downsample.0.weight: - device: cpu + device: cuda:0 max: '1.62e-02' mean: '1.449e-05' min: '-2.14e-02' @@ -1288,7 +1288,7 @@ grads.network.layer4.0.downsample.0.weight: - 1 sum: '3.038e+01' grads.network.layer4.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '6.932e-03' mean: '1.369e-04' min: '-6.060e-03' @@ -1296,7 +1296,7 @@ grads.network.layer4.0.downsample.1.bias: - 2048 sum: '2.805e-01' grads.network.layer4.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '7.480e-03' mean: '2.966e-05' min: '-7.067e-03' @@ -1304,7 +1304,7 @@ grads.network.layer4.0.downsample.1.weight: - 2048 sum: '6.073e-02' grads.network.layer4.1.bn1.bias: - device: cpu + device: cuda:0 max: '8.244e-03' mean: '2.764e-05' min: '-1.008e-02' @@ -1312,15 +1312,15 @@ grads.network.layer4.1.bn1.bias: - 512 sum: '1.415e-02' grads.network.layer4.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.030e-02' - mean: '7.105e-09' + mean: '7.094e-09' min: '-1.473e-02' shape: - 512 - sum: '3.638e-06' + sum: '3.632e-06' grads.network.layer4.1.bn2.bias: - device: cpu + device: cuda:0 max: 
'9.241e-03' mean: '1.883e-05' min: '-6.795e-03' @@ -1328,15 +1328,15 @@ grads.network.layer4.1.bn2.bias: - 512 sum: '9.642e-03' grads.network.layer4.1.bn2.weight: - device: cpu + device: cuda:0 max: '9.995e-03' - mean: '2.547e-08' + mean: '2.548e-08' min: '-9.566e-03' shape: - 512 - sum: '1.304e-05' + sum: '1.305e-05' grads.network.layer4.1.bn3.bias: - device: cpu + device: cuda:0 max: '5.288e-03' mean: '1.693e-04' min: '-5.143e-03' @@ -1344,7 +1344,7 @@ grads.network.layer4.1.bn3.bias: - 2048 sum: '3.468e-01' grads.network.layer4.1.bn3.weight: - device: cpu + device: cuda:0 max: '5.510e-03' mean: '1.148e-04' min: '-4.869e-03' @@ -1352,7 +1352,7 @@ grads.network.layer4.1.bn3.weight: - 2048 sum: '2.352e-01' grads.network.layer4.1.conv1.weight: - device: cpu + device: cuda:0 max: '1.323e-02' mean: '-7.145e-06' min: '-1.063e-02' @@ -1363,7 +1363,7 @@ grads.network.layer4.1.conv1.weight: - 1 sum: '-7.492e+00' grads.network.layer4.1.conv2.weight: - device: cpu + device: cuda:0 max: '4.482e-02' mean: '4.064e-06' min: '-4.435e-02' @@ -1374,7 +1374,7 @@ grads.network.layer4.1.conv2.weight: - 3 sum: '9.588e+00' grads.network.layer4.1.conv3.weight: - device: cpu + device: cuda:0 max: '1.372e-02' mean: '-7.804e-07' min: '-1.28e-02' @@ -1385,7 +1385,7 @@ grads.network.layer4.1.conv3.weight: - 1 sum: '-8.183e-01' grads.network.layer4.2.bn1.bias: - device: cpu + device: cuda:0 max: '5.947e-03' mean: '3.877e-05' min: '-7.937e-03' @@ -1393,15 +1393,15 @@ grads.network.layer4.2.bn1.bias: - 512 sum: '1.985e-02' grads.network.layer4.2.bn1.weight: - device: cpu + device: cuda:0 max: '8.022e-03' - mean: '1.703e-09' + mean: '1.71e-09' min: '-9.428e-03' shape: - 512 - sum: '8.717e-07' + sum: '8.754e-07' grads.network.layer4.2.bn2.bias: - device: cpu + device: cuda:0 max: '5.880e-03' mean: '9.59e-05' min: '-4.611e-03' @@ -1409,15 +1409,15 @@ grads.network.layer4.2.bn2.bias: - 512 sum: '4.91e-02' grads.network.layer4.2.bn2.weight: - device: cpu + device: cuda:0 max: '7.32e-03' - mean: '2.75e-08' + mean: '2.751e-08' min: '-5.822e-03' shape: - 512 - sum: '1.408e-05' + sum: '1.409e-05' grads.network.layer4.2.bn3.bias: - device: cpu + device: cuda:0 max: '6.23e-03' mean: '2.174e-04' min: '-6.104e-03' @@ -1425,7 +1425,7 @@ grads.network.layer4.2.bn3.bias: - 2048 sum: '4.453e-01' grads.network.layer4.2.bn3.weight: - device: cpu + device: cuda:0 max: '4.123e-03' mean: '1.086e-04' min: '-4.657e-03' @@ -1433,7 +1433,7 @@ grads.network.layer4.2.bn3.weight: - 2048 sum: '2.225e-01' grads.network.layer4.2.conv1.weight: - device: cpu + device: cuda:0 max: '8.671e-03' mean: '-1.917e-05' min: '-8.358e-03' @@ -1444,7 +1444,7 @@ grads.network.layer4.2.conv1.weight: - 1 sum: '-2.010e+01' grads.network.layer4.2.conv2.weight: - device: cpu + device: cuda:0 max: '3.57e-02' mean: '-5.759e-06' min: '-3.629e-02' @@ -1455,7 +1455,7 @@ grads.network.layer4.2.conv2.weight: - 3 sum: '-1.359e+01' grads.network.layer4.2.conv3.weight: - device: cpu + device: cuda:0 max: '9.38e-03' mean: '2.033e-05' min: '-1.081e-02' @@ -1466,7 +1466,7 @@ grads.network.layer4.2.conv3.weight: - 1 sum: '2.131e+01' outputs.logits: - device: cpu + device: cuda:0 max: '5.678e+00' mean: '-2.389e-03' min: '-5.650e+00' @@ -1475,14 +1475,14 @@ outputs.logits: - 10 sum: '-3.058e+00' outputs.loss: - device: cpu + device: cuda:0 max: '2.735e+00' mean: '2.735e+00' min: '2.735e+00' shape: [] sum: '2.735e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git 
a/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_imagenet_image_classifier.yaml new file mode 100644 index 00000000..6da0613a --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_backward_pass_is_reproducible/resnet50_imagenet_image_classifier.yaml @@ -0,0 +1,1491 @@ +batch.0: + device: cuda:0 + max: '2.640e+00' + mean: '-6.663e-02' + min: '-2.118e+00' + shape: + - 64 + - 3 + - 224 + - 224 + sum: '-6.419e+05' +batch.1: + device: cuda:0 + max: 988 + mean: '5.182e+02' + min: 0 + shape: + - 64 + sum: 33166 +grads.network.bn1.bias: + device: cuda:0 + max: '2.068e-01' + mean: '-9.46e-03' + min: '-2.002e-01' + shape: + - 64 + sum: '-6.054e-01' +grads.network.bn1.weight: + device: cuda:0 + max: '2.498e-01' + mean: '2.254e-07' + min: '-3.246e-01' + shape: + - 64 + sum: '1.442e-05' +grads.network.conv1.weight: + device: cuda:0 + max: '4.087e+00' + mean: '2.056e-01' + min: '-2.608e+00' + shape: + - 64 + - 3 + - 7 + - 7 + sum: '1.934e+03' +grads.network.fc.bias: + device: cuda:0 + max: '4.933e-03' + mean: '-2.235e-11' + min: '-3.081e-02' + shape: + - 1000 + sum: '-2.235e-08' +grads.network.fc.weight: + device: cuda:0 + max: '9.717e-03' + mean: '-1.118e-11' + min: '-9.624e-02' + shape: + - 1000 + - 2048 + sum: '-2.289e-05' +grads.network.layer1.0.bn1.bias: + device: cuda:0 + max: '1.701e-01' + mean: '-1.097e-02' + min: '-2.24e-01' + shape: + - 64 + sum: '-7.022e-01' +grads.network.layer1.0.bn1.weight: + device: cuda:0 + max: '2.153e-01' + mean: '-6.054e-09' + min: '-2.101e-01' + shape: + - 64 + sum: '-3.874e-07' +grads.network.layer1.0.bn2.bias: + device: cuda:0 + max: '2.238e-01' + mean: '2.082e-03' + min: '-1.410e-01' + shape: + - 64 + sum: '1.333e-01' +grads.network.layer1.0.bn2.weight: + device: cuda:0 + max: '1.821e-01' + mean: '-9.057e-08' + min: '-2.169e-01' + shape: + - 64 + sum: '-5.797e-06' +grads.network.layer1.0.bn3.bias: + device: cuda:0 + max: '6.3e-02' + mean: '-6.664e-04' + min: '-6.507e-02' + shape: + - 256 + sum: '-1.706e-01' +grads.network.layer1.0.bn3.weight: + device: cuda:0 + max: '9.049e-02' + mean: '-6.014e-04' + min: '-9.014e-02' + shape: + - 256 + sum: '-1.539e-01' +grads.network.layer1.0.conv1.weight: + device: cuda:0 + max: '3.310e-01' + mean: '-6.233e-04' + min: '-4.917e-01' + shape: + - 64 + - 64 + - 1 + - 1 + sum: '-2.553e+00' +grads.network.layer1.0.conv2.weight: + device: cuda:0 + max: '2.914e-01' + mean: '1.291e-03' + min: '-3.517e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '4.760e+01' +grads.network.layer1.0.conv3.weight: + device: cuda:0 + max: '2.922e-01' + mean: '9.76e-04' + min: '-2.715e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '1.599e+01' +grads.network.layer1.0.downsample.0.weight: + device: cuda:0 + max: '3.240e-01' + mean: '6.147e-04' + min: '-4.201e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '1.007e+01' +grads.network.layer1.0.downsample.1.bias: + device: cuda:0 + max: '6.3e-02' + mean: '-6.664e-04' + min: '-6.507e-02' + shape: + - 256 + sum: '-1.706e-01' +grads.network.layer1.0.downsample.1.weight: + device: cuda:0 + max: '1.168e-01' + mean: '8.313e-04' + min: '-7.264e-02' + shape: + - 256 + sum: '2.128e-01' +grads.network.layer1.1.bn1.bias: + device: cuda:0 + max: '1.160e-01' + mean: '9.456e-04' + min: '-1.079e-01' + shape: + - 64 + sum: '6.052e-02' +grads.network.layer1.1.bn1.weight: + device: cuda:0 + max: '1.274e-01' + 
mean: '3.097e-08' + min: '-1.296e-01' + shape: + - 64 + sum: '1.982e-06' +grads.network.layer1.1.bn2.bias: + device: cuda:0 + max: '9.845e-02' + mean: '5.403e-03' + min: '-7.661e-02' + shape: + - 64 + sum: '3.458e-01' +grads.network.layer1.1.bn2.weight: + device: cuda:0 + max: '1.274e-01' + mean: '-4.994e-08' + min: '-1.105e-01' + shape: + - 64 + sum: '-3.196e-06' +grads.network.layer1.1.bn3.bias: + device: cuda:0 + max: '4.778e-02' + mean: '9.509e-04' + min: '-3.793e-02' + shape: + - 256 + sum: '2.434e-01' +grads.network.layer1.1.bn3.weight: + device: cuda:0 + max: '7.710e-02' + mean: '2.718e-04' + min: '-5.506e-02' + shape: + - 256 + sum: '6.959e-02' +grads.network.layer1.1.conv1.weight: + device: cuda:0 + max: '1.421e-01' + mean: '3.867e-04' + min: '-1.254e-01' + shape: + - 64 + - 256 + - 1 + - 1 + sum: '6.335e+00' +grads.network.layer1.1.conv2.weight: + device: cuda:0 + max: '2.049e-01' + mean: '-3.724e-04' + min: '-2.049e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-1.373e+01' +grads.network.layer1.1.conv3.weight: + device: cuda:0 + max: '1.850e-01' + mean: '-1.549e-04' + min: '-1.803e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '-2.539e+00' +grads.network.layer1.2.bn1.bias: + device: cuda:0 + max: '5.462e-02' + mean: '-5.246e-04' + min: '-8.094e-02' + shape: + - 64 + sum: '-3.358e-02' +grads.network.layer1.2.bn1.weight: + device: cuda:0 + max: '1.337e-01' + mean: '9.662e-09' + min: '-7.616e-02' + shape: + - 64 + sum: '6.184e-07' +grads.network.layer1.2.bn2.bias: + device: cuda:0 + max: '5.837e-02' + mean: '-2.464e-04' + min: '-6.975e-02' + shape: + - 64 + sum: '-1.577e-02' +grads.network.layer1.2.bn2.weight: + device: cuda:0 + max: '7.667e-02' + mean: '-1.267e-07' + min: '-6.187e-02' + shape: + - 64 + sum: '-8.106e-06' +grads.network.layer1.2.bn3.bias: + device: cuda:0 + max: '2.286e-02' + mean: '7.026e-04' + min: '-2.327e-02' + shape: + - 256 + sum: '1.799e-01' +grads.network.layer1.2.bn3.weight: + device: cuda:0 + max: '4.287e-02' + mean: '-5.017e-04' + min: '-4.000e-02' + shape: + - 256 + sum: '-1.284e-01' +grads.network.layer1.2.conv1.weight: + device: cuda:0 + max: '8.545e-02' + mean: '-3.494e-04' + min: '-9.286e-02' + shape: + - 64 + - 256 + - 1 + - 1 + sum: '-5.725e+00' +grads.network.layer1.2.conv2.weight: + device: cuda:0 + max: '1.467e-01' + mean: '-1.392e-04' + min: '-1.282e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-5.132e+00' +grads.network.layer1.2.conv3.weight: + device: cuda:0 + max: '1.048e-01' + mean: '-1.928e-04' + min: '-1.267e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '-3.16e+00' +grads.network.layer2.0.bn1.bias: + device: cuda:0 + max: '4.211e-02' + mean: '1.735e-03' + min: '-5.167e-02' + shape: + - 128 + sum: '2.221e-01' +grads.network.layer2.0.bn1.weight: + device: cuda:0 + max: '4.957e-02' + mean: '8.149e-09' + min: '-4.993e-02' + shape: + - 128 + sum: '1.043e-06' +grads.network.layer2.0.bn2.bias: + device: cuda:0 + max: '3.316e-02' + mean: '7.625e-04' + min: '-3.657e-02' + shape: + - 128 + sum: '9.760e-02' +grads.network.layer2.0.bn2.weight: + device: cuda:0 + max: '5.121e-02' + mean: '-4.243e-08' + min: '-4.316e-02' + shape: + - 128 + sum: '-5.431e-06' +grads.network.layer2.0.bn3.bias: + device: cuda:0 + max: '2.226e-02' + mean: '1.177e-04' + min: '-1.811e-02' + shape: + - 512 + sum: '6.026e-02' +grads.network.layer2.0.bn3.weight: + device: cuda:0 + max: '2.429e-02' + mean: '-2.402e-04' + min: '-2.550e-02' + shape: + - 512 + sum: '-1.230e-01' +grads.network.layer2.0.conv1.weight: + device: cuda:0 + max: '8.179e-02' + mean: '-1.704e-05' + 
min: '-7.493e-02' + shape: + - 128 + - 256 + - 1 + - 1 + sum: '-5.582e-01' +grads.network.layer2.0.conv2.weight: + device: cuda:0 + max: '8.488e-02' + mean: '-2.583e-04' + min: '-8.498e-02' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-3.809e+01' +grads.network.layer2.0.conv3.weight: + device: cuda:0 + max: '7.02e-02' + mean: '1.67e-05' + min: '-7.408e-02' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '1.094e+00' +grads.network.layer2.0.downsample.0.weight: + device: cuda:0 + max: '5.65e-02' + mean: '3.045e-05' + min: '-5.636e-02' + shape: + - 512 + - 256 + - 1 + - 1 + sum: '3.991e+00' +grads.network.layer2.0.downsample.1.bias: + device: cuda:0 + max: '2.226e-02' + mean: '1.177e-04' + min: '-1.811e-02' + shape: + - 512 + sum: '6.026e-02' +grads.network.layer2.0.downsample.1.weight: + device: cuda:0 + max: '2.814e-02' + mean: '4.625e-04' + min: '-2.305e-02' + shape: + - 512 + sum: '2.368e-01' +grads.network.layer2.1.bn1.bias: + device: cuda:0 + max: '3.645e-02' + mean: '-7.118e-04' + min: '-3.115e-02' + shape: + - 128 + sum: '-9.111e-02' +grads.network.layer2.1.bn1.weight: + device: cuda:0 + max: '4.458e-02' + mean: '-6.869e-09' + min: '-3.865e-02' + shape: + - 128 + sum: '-8.792e-07' +grads.network.layer2.1.bn2.bias: + device: cuda:0 + max: '2.695e-02' + mean: '-9.38e-04' + min: '-2.543e-02' + shape: + - 128 + sum: '-1.201e-01' +grads.network.layer2.1.bn2.weight: + device: cuda:0 + max: '2.824e-02' + mean: '-1.768e-08' + min: '-2.943e-02' + shape: + - 128 + sum: '-2.263e-06' +grads.network.layer2.1.bn3.bias: + device: cuda:0 + max: '1.148e-02' + mean: '2.42e-04' + min: '-9.819e-03' + shape: + - 512 + sum: '1.239e-01' +grads.network.layer2.1.bn3.weight: + device: cuda:0 + max: '1.542e-02' + mean: '-9.633e-05' + min: '-1.593e-02' + shape: + - 512 + sum: '-4.932e-02' +grads.network.layer2.1.conv1.weight: + device: cuda:0 + max: '3.077e-02' + mean: '3.157e-04' + min: '-3.122e-02' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '2.069e+01' +grads.network.layer2.1.conv2.weight: + device: cuda:0 + max: '5.878e-02' + mean: '5.832e-05' + min: '-5.409e-02' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '8.600e+00' +grads.network.layer2.1.conv3.weight: + device: cuda:0 + max: '5.426e-02' + mean: '6.567e-05' + min: '-3.881e-02' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '4.303e+00' +grads.network.layer2.2.bn1.bias: + device: cuda:0 + max: '3.436e-02' + mean: '1.063e-05' + min: '-2.625e-02' + shape: + - 128 + sum: '1.361e-03' +grads.network.layer2.2.bn1.weight: + device: cuda:0 + max: '2.442e-02' + mean: '-6.228e-09' + min: '-3.548e-02' + shape: + - 128 + sum: '-7.972e-07' +grads.network.layer2.2.bn2.bias: + device: cuda:0 + max: '1.91e-02' + mean: '8.820e-05' + min: '-1.719e-02' + shape: + - 128 + sum: '1.129e-02' +grads.network.layer2.2.bn2.weight: + device: cuda:0 + max: '2.045e-02' + mean: '7.683e-09' + min: '-2.136e-02' + shape: + - 128 + sum: '9.835e-07' +grads.network.layer2.2.bn3.bias: + device: cuda:0 + max: '7.928e-03' + mean: '-9.574e-05' + min: '-7.345e-03' + shape: + - 512 + sum: '-4.902e-02' +grads.network.layer2.2.bn3.weight: + device: cuda:0 + max: '1.170e-02' + mean: '2.873e-05' + min: '-1.136e-02' + shape: + - 512 + sum: '1.471e-02' +grads.network.layer2.2.conv1.weight: + device: cuda:0 + max: '2.182e-02' + mean: '5.088e-05' + min: '-2.084e-02' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '3.334e+00' +grads.network.layer2.2.conv2.weight: + device: cuda:0 + max: '4.288e-02' + mean: '-5.458e-05' + min: '-4.216e-02' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-8.048e+00' 
+grads.network.layer2.2.conv3.weight: + device: cuda:0 + max: '3.284e-02' + mean: '4.204e-05' + min: '-3.245e-02' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '2.755e+00' +grads.network.layer2.3.bn1.bias: + device: cuda:0 + max: '1.834e-02' + mean: '4.186e-04' + min: '-2.066e-02' + shape: + - 128 + sum: '5.358e-02' +grads.network.layer2.3.bn1.weight: + device: cuda:0 + max: '2.448e-02' + mean: '-2.095e-09' + min: '-2.123e-02' + shape: + - 128 + sum: '-2.682e-07' +grads.network.layer2.3.bn2.bias: + device: cuda:0 + max: '1.283e-02' + mean: '2.229e-04' + min: '-1.321e-02' + shape: + - 128 + sum: '2.853e-02' +grads.network.layer2.3.bn2.weight: + device: cuda:0 + max: '1.610e-02' + mean: '-3.396e-08' + min: '-2.095e-02' + shape: + - 128 + sum: '-4.347e-06' +grads.network.layer2.3.bn3.bias: + device: cuda:0 + max: '4.654e-03' + mean: '-2.983e-05' + min: '-5.059e-03' + shape: + - 512 + sum: '-1.527e-02' +grads.network.layer2.3.bn3.weight: + device: cuda:0 + max: '1.013e-02' + mean: '-1.547e-04' + min: '-1.059e-02' + shape: + - 512 + sum: '-7.918e-02' +grads.network.layer2.3.conv1.weight: + device: cuda:0 + max: '1.884e-02' + mean: '1.101e-04' + min: '-1.608e-02' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '7.213e+00' +grads.network.layer2.3.conv2.weight: + device: cuda:0 + max: '2.661e-02' + mean: '6.131e-05' + min: '-2.643e-02' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '9.040e+00' +grads.network.layer2.3.conv3.weight: + device: cuda:0 + max: '2.310e-02' + mean: '4.181e-05' + min: '-2.429e-02' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '2.74e+00' +grads.network.layer3.0.bn1.bias: + device: cuda:0 + max: '1.159e-02' + mean: '6.957e-05' + min: '-1.154e-02' + shape: + - 256 + sum: '1.781e-02' +grads.network.layer3.0.bn1.weight: + device: cuda:0 + max: '1.38e-02' + mean: '-4.657e-10' + min: '-1.321e-02' + shape: + - 256 + sum: '-1.192e-07' +grads.network.layer3.0.bn2.bias: + device: cuda:0 + max: '1.036e-02' + mean: '1.608e-04' + min: '-1.092e-02' + shape: + - 256 + sum: '4.116e-02' +grads.network.layer3.0.bn2.weight: + device: cuda:0 + max: '1.286e-02' + mean: '-9.262e-09' + min: '-1.329e-02' + shape: + - 256 + sum: '-2.371e-06' +grads.network.layer3.0.bn3.bias: + device: cuda:0 + max: '4.818e-03' + mean: '1.895e-05' + min: '-4.491e-03' + shape: + - 1024 + sum: '1.940e-02' +grads.network.layer3.0.bn3.weight: + device: cuda:0 + max: '6.393e-03' + mean: '-5.269e-05' + min: '-5.746e-03' + shape: + - 1024 + sum: '-5.396e-02' +grads.network.layer3.0.conv1.weight: + device: cuda:0 + max: '1.654e-02' + mean: '-4.966e-05' + min: '-1.824e-02' + shape: + - 256 + - 512 + - 1 + - 1 + sum: '-6.51e+00' +grads.network.layer3.0.conv2.weight: + device: cuda:0 + max: '1.841e-02' + mean: '-1.719e-05' + min: '-1.882e-02' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.014e+01' +grads.network.layer3.0.conv3.weight: + device: cuda:0 + max: '1.641e-02' + mean: '-2.978e-05' + min: '-1.824e-02' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-7.806e+00' +grads.network.layer3.0.downsample.0.weight: + device: cuda:0 + max: '1.271e-02' + mean: '-2.944e-05' + min: '-1.281e-02' + shape: + - 1024 + - 512 + - 1 + - 1 + sum: '-1.544e+01' +grads.network.layer3.0.downsample.1.bias: + device: cuda:0 + max: '4.818e-03' + mean: '1.895e-05' + min: '-4.491e-03' + shape: + - 1024 + sum: '1.940e-02' +grads.network.layer3.0.downsample.1.weight: + device: cuda:0 + max: '7.039e-03' + mean: '-1.403e-05' + min: '-5.472e-03' + shape: + - 1024 + sum: '-1.437e-02' +grads.network.layer3.1.bn1.bias: + device: cuda:0 + max: '1.027e-02' + mean: 
'-7.899e-05' + min: '-7.042e-03' + shape: + - 256 + sum: '-2.022e-02' +grads.network.layer3.1.bn1.weight: + device: cuda:0 + max: '9.592e-03' + mean: '-1.186e-09' + min: '-9.877e-03' + shape: + - 256 + sum: '-3.036e-07' +grads.network.layer3.1.bn2.bias: + device: cuda:0 + max: '5.802e-03' + mean: '-1.144e-04' + min: '-6.516e-03' + shape: + - 256 + sum: '-2.929e-02' +grads.network.layer3.1.bn2.weight: + device: cuda:0 + max: '7.174e-03' + mean: '1.312e-08' + min: '-7.594e-03' + shape: + - 256 + sum: '3.359e-06' +grads.network.layer3.1.bn3.bias: + device: cuda:0 + max: '2.986e-03' + mean: '-8.18e-06' + min: '-3.319e-03' + shape: + - 1024 + sum: '-8.376e-03' +grads.network.layer3.1.bn3.weight: + device: cuda:0 + max: '4.028e-03' + mean: '6.062e-05' + min: '-3.991e-03' + shape: + - 1024 + sum: '6.207e-02' +grads.network.layer3.1.conv1.weight: + device: cuda:0 + max: '8.729e-03' + mean: '-2.166e-05' + min: '-7.953e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-5.678e+00' +grads.network.layer3.1.conv2.weight: + device: cuda:0 + max: '1.39e-02' + mean: '-2.612e-05' + min: '-1.387e-02' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.541e+01' +grads.network.layer3.1.conv3.weight: + device: cuda:0 + max: '1.024e-02' + mean: '-1.092e-05' + min: '-1.074e-02' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-2.863e+00' +grads.network.layer3.2.bn1.bias: + device: cuda:0 + max: '7.474e-03' + mean: '1.205e-04' + min: '-6.481e-03' + shape: + - 256 + sum: '3.085e-02' +grads.network.layer3.2.bn1.weight: + device: cuda:0 + max: '9.865e-03' + mean: '-9.313e-10' + min: '-7.930e-03' + shape: + - 256 + sum: '-2.384e-07' +grads.network.layer3.2.bn2.bias: + device: cuda:0 + max: '5.072e-03' + mean: '1.298e-04' + min: '-4.838e-03' + shape: + - 256 + sum: '3.323e-02' +grads.network.layer3.2.bn2.weight: + device: cuda:0 + max: '6.424e-03' + mean: '9.468e-09' + min: '-5.991e-03' + shape: + - 256 + sum: '2.424e-06' +grads.network.layer3.2.bn3.bias: + device: cuda:0 + max: '1.696e-03' + mean: '2.526e-05' + min: '-1.766e-03' + shape: + - 1024 + sum: '2.587e-02' +grads.network.layer3.2.bn3.weight: + device: cuda:0 + max: '3.010e-03' + mean: '3.859e-05' + min: '-2.832e-03' + shape: + - 1024 + sum: '3.952e-02' +grads.network.layer3.2.conv1.weight: + device: cuda:0 + max: '6.116e-03' + mean: '-1.069e-05' + min: '-6.560e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-2.802e+00' +grads.network.layer3.2.conv2.weight: + device: cuda:0 + max: '9.867e-03' + mean: '-6.347e-06' + min: '-9.511e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-3.744e+00' +grads.network.layer3.2.conv3.weight: + device: cuda:0 + max: '7.406e-03' + mean: '-2.159e-05' + min: '-7.51e-03' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-5.66e+00' +grads.network.layer3.3.bn1.bias: + device: cuda:0 + max: '3.839e-03' + mean: '4.194e-05' + min: '-4.033e-03' + shape: + - 256 + sum: '1.074e-02' +grads.network.layer3.3.bn1.weight: + device: cuda:0 + max: '5.956e-03' + mean: '1.382e-10' + min: '-5.073e-03' + shape: + - 256 + sum: '3.539e-08' +grads.network.layer3.3.bn2.bias: + device: cuda:0 + max: '4.210e-03' + mean: '3.714e-05' + min: '-3.497e-03' + shape: + - 256 + sum: '9.507e-03' +grads.network.layer3.3.bn2.weight: + device: cuda:0 + max: '4.847e-03' + mean: '-6.614e-09' + min: '-4.154e-03' + shape: + - 256 + sum: '-1.693e-06' +grads.network.layer3.3.bn3.bias: + device: cuda:0 + max: '1.448e-03' + mean: '1.18e-05' + min: '-1.585e-03' + shape: + - 1024 + sum: '1.208e-02' +grads.network.layer3.3.bn3.weight: + device: cuda:0 + max: '2.472e-03' + mean: 
'-3.084e-05' + min: '-2.461e-03' + shape: + - 1024 + sum: '-3.158e-02' +grads.network.layer3.3.conv1.weight: + device: cuda:0 + max: '4.561e-03' + mean: '-1.505e-06' + min: '-4.213e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-3.946e-01' +grads.network.layer3.3.conv2.weight: + device: cuda:0 + max: '7.155e-03' + mean: '-1.727e-05' + min: '-7.462e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.019e+01' +grads.network.layer3.3.conv3.weight: + device: cuda:0 + max: '7.199e-03' + mean: '-1.848e-05' + min: '-6.481e-03' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-4.844e+00' +grads.network.layer3.4.bn1.bias: + device: cuda:0 + max: '3.403e-03' + mean: '2.286e-05' + min: '-3.422e-03' + shape: + - 256 + sum: '5.853e-03' +grads.network.layer3.4.bn1.weight: + device: cuda:0 + max: '3.392e-03' + mean: '7.512e-10' + min: '-4.168e-03' + shape: + - 256 + sum: '1.923e-07' +grads.network.layer3.4.bn2.bias: + device: cuda:0 + max: '2.511e-03' + mean: '5.277e-05' + min: '-3.381e-03' + shape: + - 256 + sum: '1.351e-02' +grads.network.layer3.4.bn2.weight: + device: cuda:0 + max: '4.038e-03' + mean: '3.572e-09' + min: '-3.609e-03' + shape: + - 256 + sum: '9.146e-07' +grads.network.layer3.4.bn3.bias: + device: cuda:0 + max: '1.408e-03' + mean: '1.227e-05' + min: '-8.456e-04' + shape: + - 1024 + sum: '1.256e-02' +grads.network.layer3.4.bn3.weight: + device: cuda:0 + max: '1.611e-03' + mean: '1.336e-05' + min: '-1.889e-03' + shape: + - 1024 + sum: '1.368e-02' +grads.network.layer3.4.conv1.weight: + device: cuda:0 + max: '3.532e-03' + mean: '-8.469e-06' + min: '-4.099e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-2.220e+00' +grads.network.layer3.4.conv2.weight: + device: cuda:0 + max: '5.658e-03' + mean: '-1.714e-05' + min: '-5.384e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.011e+01' +grads.network.layer3.4.conv3.weight: + device: cuda:0 + max: '4.909e-03' + mean: '-1.151e-05' + min: '-4.874e-03' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-3.016e+00' +grads.network.layer3.5.bn1.bias: + device: cuda:0 + max: '2.425e-03' + mean: '-1.526e-05' + min: '-2.448e-03' + shape: + - 256 + sum: '-3.906e-03' +grads.network.layer3.5.bn1.weight: + device: cuda:0 + max: '3.617e-03' + mean: '7.203e-10' + min: '-2.678e-03' + shape: + - 256 + sum: '1.844e-07' +grads.network.layer3.5.bn2.bias: + device: cuda:0 + max: '2.354e-03' + mean: '5.188e-05' + min: '-3.471e-03' + shape: + - 256 + sum: '1.328e-02' +grads.network.layer3.5.bn2.weight: + device: cuda:0 + max: '2.992e-03' + mean: '-3.147e-09' + min: '-2.420e-03' + shape: + - 256 + sum: '-8.056e-07' +grads.network.layer3.5.bn3.bias: + device: cuda:0 + max: '6.43e-04' + mean: '8.147e-06' + min: '-6.512e-04' + shape: + - 1024 + sum: '8.342e-03' +grads.network.layer3.5.bn3.weight: + device: cuda:0 + max: '1.439e-03' + mean: '-1.501e-05' + min: '-1.433e-03' + shape: + - 1024 + sum: '-1.537e-02' +grads.network.layer3.5.conv1.weight: + device: cuda:0 + max: '2.588e-03' + mean: '-1.225e-05' + min: '-3.101e-03' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-3.211e+00' +grads.network.layer3.5.conv2.weight: + device: cuda:0 + max: '4.908e-03' + mean: '-1.443e-05' + min: '-4.324e-03' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-8.509e+00' +grads.network.layer3.5.conv3.weight: + device: cuda:0 + max: '4.695e-03' + mean: '-1.048e-05' + min: '-4.000e-03' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-2.746e+00' +grads.network.layer4.0.bn1.bias: + device: cuda:0 + max: '2.172e-03' + mean: '-1.531e-06' + min: '-2.475e-03' + shape: + - 512 + sum: '-7.838e-04' 
+grads.network.layer4.0.bn1.weight: + device: cuda:0 + max: '2.885e-03' + mean: '1.164e-10' + min: '-3.367e-03' + shape: + - 512 + sum: '5.960e-08' +grads.network.layer4.0.bn2.bias: + device: cuda:0 + max: '1.743e-03' + mean: '4.506e-05' + min: '-1.865e-03' + shape: + - 512 + sum: '2.307e-02' +grads.network.layer4.0.bn2.weight: + device: cuda:0 + max: '2.32e-03' + mean: '1.145e-08' + min: '-3.617e-03' + shape: + - 512 + sum: '5.864e-06' +grads.network.layer4.0.bn3.bias: + device: cuda:0 + max: '2.545e-03' + mean: '8.033e-05' + min: '-2.183e-03' + shape: + - 2048 + sum: '1.645e-01' +grads.network.layer4.0.bn3.weight: + device: cuda:0 + max: '2.965e-03' + mean: '4.471e-05' + min: '-2.004e-03' + shape: + - 2048 + sum: '9.156e-02' +grads.network.layer4.0.conv1.weight: + device: cuda:0 + max: '3.048e-03' + mean: '-1.777e-05' + min: '-2.91e-03' + shape: + - 512 + - 1024 + - 1 + - 1 + sum: '-9.317e+00' +grads.network.layer4.0.conv2.weight: + device: cuda:0 + max: '4.142e-03' + mean: '-8.243e-06' + min: '-3.973e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-1.945e+01' +grads.network.layer4.0.conv3.weight: + device: cuda:0 + max: '3.856e-03' + mean: '-4.106e-06' + min: '-4.645e-03' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '-4.306e+00' +grads.network.layer4.0.downsample.0.weight: + device: cuda:0 + max: '3.427e-03' + mean: '1.003e-06' + min: '-3.696e-03' + shape: + - 2048 + - 1024 + - 1 + - 1 + sum: '2.104e+00' +grads.network.layer4.0.downsample.1.bias: + device: cuda:0 + max: '2.545e-03' + mean: '8.033e-05' + min: '-2.183e-03' + shape: + - 2048 + sum: '1.645e-01' +grads.network.layer4.0.downsample.1.weight: + device: cuda:0 + max: '2.177e-03' + mean: '3.785e-05' + min: '-2.256e-03' + shape: + - 2048 + sum: '7.751e-02' +grads.network.layer4.1.bn1.bias: + device: cuda:0 + max: '1.501e-03' + mean: '2.144e-05' + min: '-1.368e-03' + shape: + - 512 + sum: '1.098e-02' +grads.network.layer4.1.bn1.weight: + device: cuda:0 + max: '2.379e-03' + mean: '7.913e-11' + min: '-2.5e-03' + shape: + - 512 + sum: '4.051e-08' +grads.network.layer4.1.bn2.bias: + device: cuda:0 + max: '1.778e-03' + mean: '4.209e-05' + min: '-1.812e-03' + shape: + - 512 + sum: '2.155e-02' +grads.network.layer4.1.bn2.weight: + device: cuda:0 + max: '2.058e-03' + mean: '1.25e-08' + min: '-2.322e-03' + shape: + - 512 + sum: '6.399e-06' +grads.network.layer4.1.bn3.bias: + device: cuda:0 + max: '2.914e-03' + mean: '1.136e-04' + min: '-3.222e-03' + shape: + - 2048 + sum: '2.327e-01' +grads.network.layer4.1.bn3.weight: + device: cuda:0 + max: '2.364e-03' + mean: '5.421e-05' + min: '-2.150e-03' + shape: + - 2048 + sum: '1.110e-01' +grads.network.layer4.1.conv1.weight: + device: cuda:0 + max: '1.885e-03' + mean: '-2.997e-06' + min: '-1.927e-03' + shape: + - 512 + - 2048 + - 1 + - 1 + sum: '-3.143e+00' +grads.network.layer4.1.conv2.weight: + device: cuda:0 + max: '3.744e-03' + mean: '-1.002e-05' + min: '-3.811e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-2.364e+01' +grads.network.layer4.1.conv3.weight: + device: cuda:0 + max: '5.011e-03' + mean: '2.916e-07' + min: '-3.704e-03' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '3.058e-01' +grads.network.layer4.2.bn1.bias: + device: cuda:0 + max: '1.331e-03' + mean: '2.21e-05' + min: '-1.425e-03' + shape: + - 512 + sum: '1.131e-02' +grads.network.layer4.2.bn1.weight: + device: cuda:0 + max: '2.19e-03' + mean: '2.183e-10' + min: '-2.435e-03' + shape: + - 512 + sum: '1.118e-07' +grads.network.layer4.2.bn2.bias: + device: cuda:0 + max: '1.404e-03' + mean: '9.475e-06' + min: '-1.412e-03' + shape: 
+ - 512 + sum: '4.851e-03' +grads.network.layer4.2.bn2.weight: + device: cuda:0 + max: '3.054e-03' + mean: '1.17e-08' + min: '-2.907e-03' + shape: + - 512 + sum: '5.990e-06' +grads.network.layer4.2.bn3.bias: + device: cuda:0 + max: '4.169e-03' + mean: '1.393e-04' + min: '-4.317e-03' + shape: + - 2048 + sum: '2.852e-01' +grads.network.layer4.2.bn3.weight: + device: cuda:0 + max: '2.599e-03' + mean: '5.148e-05' + min: '-1.775e-03' + shape: + - 2048 + sum: '1.054e-01' +grads.network.layer4.2.conv1.weight: + device: cuda:0 + max: '1.832e-03' + mean: '-4.348e-06' + min: '-1.785e-03' + shape: + - 512 + - 2048 + - 1 + - 1 + sum: '-4.559e+00' +grads.network.layer4.2.conv2.weight: + device: cuda:0 + max: '4.026e-03' + mean: '4.673e-06' + min: '-3.410e-03' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '1.102e+01' +grads.network.layer4.2.conv3.weight: + device: cuda:0 + max: '4.736e-03' + mean: '-5.085e-06' + min: '-4.618e-03' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '-5.332e+00' +outputs.logits: + device: cuda:0 + max: '4.058e+00' + mean: '1.188e-02' + min: '-4.237e+00' + shape: + - 64 + - 1000 + sum: '7.600e+02' +outputs.loss: + device: cuda:0 + max: '7.112e+00' + mean: '7.112e+00' + min: '7.112e+00' + shape: [] + sum: '7.112e+00' +outputs.y: + device: cuda:0 + max: 988 + mean: '5.182e+02' + min: 0 + shape: + - 64 + sum: 33166 diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_fashion_mnist_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_fashion_mnist_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_mnist_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/fcnet_mnist_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/resnet18_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/resnet18_cifar10_example.yaml rename to 
.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml diff --git a/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml new file mode 100644 index 00000000..071379c4 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.640e+00' + mean: '-6.663e-02' + min: '-2.118e+00' + shape: + - 64 + - 3 + - 224 + - 224 + sum: '-6.419e+05' +out: + device: cuda:0 + max: '2.934e+00' + mean: '-8.071e-04' + min: '-2.896e+00' + shape: + - 64 + - 1000 + sum: '-5.165e+01' diff --git a/.regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/resnet50_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/example_test/test_forward_pass_is_reproducible/cuda/resnet50_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml diff --git a/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml new file mode 100644 index 00000000..bfd8d4f6 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_forward_pass_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.640e+00' + mean: '-6.663e-02' + min: '-2.118e+00' + shape: + - 64 + - 3 + - 224 + - 224 + sum: '-6.419e+05' +out: + device: cuda:0 + max: '4.058e+00' + mean: '1.188e-02' + min: '-4.237e+00' + shape: + - 64 + - 1000 + sum: '7.600e+02' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml new file mode 100644 index 00000000..1018428b --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_cifar10_image_classifier.yaml @@ -0,0 +1,51 @@ +network.0.1.bias: + device: cuda:0 + max: '1.801e-02' + mean: '1.029e-03' + min: '-1.784e-02' + shape: + - 128 + sum: '1.317e-01' +network.0.1.weight: + device: cuda:0 + max: '1.804e-02' + mean: '1.616e-05' + min: '-1.804e-02' + shape: + - 128 + - 3072 + sum: '6.354e+00' +network.1.0.bias: + device: cuda:0 + max: '8.781e-02' + mean: '4.829e-04' + min: '-8.787e-02' + shape: + - 128 + sum: '6.181e-02' +network.1.0.weight: + device: cuda:0 + max: '8.837e-02' + mean: '-9.613e-04' + min: '-8.837e-02' + shape: + - 128 + - 128 + sum: '-1.575e+01' +network.2.0.bias: + device: cuda:0 + max: '8.495e-02' + mean: '-9.068e-04' + min: '-8.834e-02' + shape: + - 10 + sum: '-9.068e-03' +network.2.0.weight: + device: cuda:0 + max: '8.826e-02' + mean: '-3.724e-04' + min: '-8.834e-02' 
+ shape: + - 10 + - 128 + sum: '-4.767e-01' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml new file mode 100644 index 00000000..c85a5f80 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_fashion_mnist_image_classifier.yaml @@ -0,0 +1,51 @@ +network.0.1.bias: + device: cuda:0 + max: '3.530e-02' + mean: '1.341e-03' + min: '-3.541e-02' + shape: + - 128 + sum: '1.716e-01' +network.0.1.weight: + device: cuda:0 + max: '3.571e-02' + mean: '9.349e-05' + min: '-3.571e-02' + shape: + - 128 + - 784 + sum: '9.382e+00' +network.1.0.bias: + device: cuda:0 + max: '8.268e-02' + mean: '-6.752e-03' + min: '-8.591e-02' + shape: + - 128 + sum: '-8.642e-01' +network.1.0.weight: + device: cuda:0 + max: '8.837e-02' + mean: '1.286e-04' + min: '-8.838e-02' + shape: + - 128 + - 128 + sum: '2.107e+00' +network.2.0.bias: + device: cuda:0 + max: '4.038e-02' + mean: '-3.545e-02' + min: '-7.938e-02' + shape: + - 10 + sum: '-3.545e-01' +network.2.0.weight: + device: cuda:0 + max: '8.829e-02' + mean: '-5.307e-04' + min: '-8.835e-02' + shape: + - 10 + - 128 + sum: '-6.793e-01' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml new file mode 100644 index 00000000..c85a5f80 --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/fcnet_mnist_image_classifier.yaml @@ -0,0 +1,51 @@ +network.0.1.bias: + device: cuda:0 + max: '3.530e-02' + mean: '1.341e-03' + min: '-3.541e-02' + shape: + - 128 + sum: '1.716e-01' +network.0.1.weight: + device: cuda:0 + max: '3.571e-02' + mean: '9.349e-05' + min: '-3.571e-02' + shape: + - 128 + - 784 + sum: '9.382e+00' +network.1.0.bias: + device: cuda:0 + max: '8.268e-02' + mean: '-6.752e-03' + min: '-8.591e-02' + shape: + - 128 + sum: '-8.642e-01' +network.1.0.weight: + device: cuda:0 + max: '8.837e-02' + mean: '1.286e-04' + min: '-8.838e-02' + shape: + - 128 + - 128 + sum: '2.107e+00' +network.2.0.bias: + device: cuda:0 + max: '4.038e-02' + mean: '-3.545e-02' + min: '-7.938e-02' + shape: + - 10 + sum: '-3.545e-01' +network.2.0.weight: + device: cuda:0 + max: '8.829e-02' + mean: '-5.307e-04' + min: '-8.835e-02' + shape: + - 10 + - 128 + sum: '-6.793e-01' diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet18_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml similarity index 76% rename from .regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet18_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml index ba0cad92..61ccf18e 100644 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet18_cifar10_example.yaml +++ 
b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ network.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -7,14 +7,14 @@ network.bn1.bias: - 64 sum: '0.e+00' network.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -22,7 +22,7 @@ network.bn1.running_mean: - 64 sum: '0.e+00' network.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -30,7 +30,7 @@ network.bn1.running_var: - 64 sum: '6.4e+01' network.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -38,35 +38,35 @@ network.bn1.weight: - 64 sum: '6.4e+01' network.conv1.weight: - device: cpu - max: '1.098e-01' - mean: '1.139e-04' - min: '-8.341e-02' + device: cuda:0 + max: '8.688e-02' + mean: '5.299e-04' + min: '-9.862e-02' shape: - 64 - 3 - 7 - 7 - sum: '1.072e+00' + sum: '4.986e+00' network.fc.bias: - device: cpu - max: '3.715e-02' - mean: '-1.094e-02' - min: '-3.341e-02' + device: cuda:0 + max: '4.314e-02' + mean: '2.057e-04' + min: '-3.14e-02' shape: - 10 - sum: '-1.094e-01' + sum: '2.057e-03' network.fc.weight: - device: cpu + device: cuda:0 max: '4.418e-02' - mean: '-4.792e-04' - min: '-4.418e-02' + mean: '1.848e-04' + min: '-4.414e-02' shape: - 10 - 512 - sum: '-2.454e+00' + sum: '9.461e-01' network.layer1.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -74,14 +74,14 @@ network.layer1.0.bn1.bias: - 64 sum: '0.e+00' network.layer1.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -89,7 +89,7 @@ network.layer1.0.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -97,7 +97,7 @@ network.layer1.0.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -105,7 +105,7 @@ network.layer1.0.bn1.weight: - 64 sum: '6.4e+01' network.layer1.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -113,14 +113,14 @@ network.layer1.0.bn2.bias: - 64 sum: '0.e+00' network.layer1.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -128,7 +128,7 @@ network.layer1.0.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -136,7 +136,7 @@ network.layer1.0.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -144,29 +144,29 @@ network.layer1.0.bn2.weight: - 64 sum: '6.4e+01' network.layer1.0.conv1.weight: - device: cpu - max: '2.499e-01' - mean: '2.448e-04' - min: '-2.519e-01' + device: cuda:0 + max: '2.433e-01' + mean: '1.396e-04' + min: '-2.501e-01' shape: - 64 - 64 - 3 - 3 - sum: '9.024e+00' + sum: '5.148e+00' network.layer1.0.conv2.weight: - device: cpu - max: '2.35e-01' - mean: '-2.816e-04' - min: '-2.581e-01' + device: 
cuda:0 + max: '2.442e-01' + mean: '1.259e-04' + min: '-2.666e-01' shape: - 64 - 64 - 3 - 3 - sum: '-1.038e+01' + sum: '4.642e+00' network.layer1.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -174,14 +174,14 @@ network.layer1.1.bn1.bias: - 64 sum: '0.e+00' network.layer1.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -189,7 +189,7 @@ network.layer1.1.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -197,7 +197,7 @@ network.layer1.1.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -205,7 +205,7 @@ network.layer1.1.bn1.weight: - 64 sum: '6.4e+01' network.layer1.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -213,14 +213,14 @@ network.layer1.1.bn2.bias: - 64 sum: '0.e+00' network.layer1.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -228,7 +228,7 @@ network.layer1.1.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -236,7 +236,7 @@ network.layer1.1.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -244,29 +244,29 @@ network.layer1.1.bn2.weight: - 64 sum: '6.4e+01' network.layer1.1.conv1.weight: - device: cpu - max: '2.130e-01' - mean: '-9.64e-05' - min: '-2.213e-01' + device: cuda:0 + max: '2.456e-01' + mean: '1.807e-04' + min: '-2.376e-01' shape: - 64 - 64 - 3 - 3 - sum: '-3.554e+00' + sum: '6.660e+00' network.layer1.1.conv2.weight: - device: cpu - max: '2.414e-01' - mean: '1.006e-04' - min: '-2.212e-01' + device: cuda:0 + max: '2.338e-01' + mean: '-3.408e-04' + min: '-2.402e-01' shape: - 64 - 64 - 3 - 3 - sum: '3.709e+00' + sum: '-1.256e+01' network.layer2.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -274,14 +274,14 @@ network.layer2.0.bn1.bias: - 128 sum: '0.e+00' network.layer2.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -289,7 +289,7 @@ network.layer2.0.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -297,7 +297,7 @@ network.layer2.0.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -305,7 +305,7 @@ network.layer2.0.bn1.weight: - 128 sum: '1.28e+02' network.layer2.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -313,14 +313,14 @@ network.layer2.0.bn2.bias: - 128 sum: '0.e+00' network.layer2.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -328,7 +328,7 @@ network.layer2.0.bn2.running_mean: 
- 128 sum: '0.e+00' network.layer2.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -336,7 +336,7 @@ network.layer2.0.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -344,40 +344,40 @@ network.layer2.0.bn2.weight: - 128 sum: '1.28e+02' network.layer2.0.conv1.weight: - device: cpu - max: '1.781e-01' - mean: '-2.81e-04' - min: '-1.729e-01' + device: cuda:0 + max: '1.681e-01' + mean: '2.319e-04' + min: '-1.830e-01' shape: - 128 - 64 - 3 - 3 - sum: '-2.072e+01' + sum: '1.71e+01' network.layer2.0.conv2.weight: - device: cpu - max: '1.949e-01' - mean: '-2.364e-04' - min: '-1.890e-01' + device: cuda:0 + max: '2.008e-01' + mean: '-6.267e-05' + min: '-1.870e-01' shape: - 128 - 128 - 3 - 3 - sum: '-3.485e+01' + sum: '-9.240e+00' network.layer2.0.downsample.0.weight: - device: cpu - max: '5.532e-01' - mean: '2.595e-04' - min: '-4.129e-01' + device: cuda:0 + max: '5.180e-01' + mean: '-2.705e-03' + min: '-5.316e-01' shape: - 128 - 64 - 1 - 1 - sum: '2.126e+00' + sum: '-2.216e+01' network.layer2.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -385,14 +385,14 @@ network.layer2.0.downsample.1.bias: - 128 sum: '0.e+00' network.layer2.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -400,7 +400,7 @@ network.layer2.0.downsample.1.running_mean: - 128 sum: '0.e+00' network.layer2.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -408,7 +408,7 @@ network.layer2.0.downsample.1.running_var: - 128 sum: '1.28e+02' network.layer2.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -416,7 +416,7 @@ network.layer2.0.downsample.1.weight: - 128 sum: '1.28e+02' network.layer2.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -424,14 +424,14 @@ network.layer2.1.bn1.bias: - 128 sum: '0.e+00' network.layer2.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -439,7 +439,7 @@ network.layer2.1.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -447,7 +447,7 @@ network.layer2.1.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -455,7 +455,7 @@ network.layer2.1.bn1.weight: - 128 sum: '1.28e+02' network.layer2.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -463,14 +463,14 @@ network.layer2.1.bn2.bias: - 128 sum: '0.e+00' network.layer2.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -478,7 +478,7 @@ network.layer2.1.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -486,7 +486,7 @@ network.layer2.1.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.1.bn2.weight: 
- device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -494,29 +494,29 @@ network.layer2.1.bn2.weight: - 128 sum: '1.28e+02' network.layer2.1.conv1.weight: - device: cpu - max: '1.921e-01' - mean: '3.336e-05' - min: '-1.785e-01' + device: cuda:0 + max: '1.750e-01' + mean: '7.981e-05' + min: '-1.909e-01' shape: - 128 - 128 - 3 - 3 - sum: '4.92e+00' + sum: '1.177e+01' network.layer2.1.conv2.weight: - device: cpu - max: '1.825e-01' - mean: '-3.207e-05' - min: '-1.989e-01' + device: cuda:0 + max: '1.714e-01' + mean: '6.508e-05' + min: '-1.811e-01' shape: - 128 - 128 - 3 - 3 - sum: '-4.729e+00' + sum: '9.597e+00' network.layer3.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -524,14 +524,14 @@ network.layer3.0.bn1.bias: - 256 sum: '0.e+00' network.layer3.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -539,7 +539,7 @@ network.layer3.0.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -547,7 +547,7 @@ network.layer3.0.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -555,7 +555,7 @@ network.layer3.0.bn1.weight: - 256 sum: '2.56e+02' network.layer3.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -563,14 +563,14 @@ network.layer3.0.bn2.bias: - 256 sum: '0.e+00' network.layer3.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -578,7 +578,7 @@ network.layer3.0.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -586,7 +586,7 @@ network.layer3.0.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -594,40 +594,40 @@ network.layer3.0.bn2.weight: - 256 sum: '2.56e+02' network.layer3.0.conv1.weight: - device: cpu - max: '1.418e-01' - mean: '4.759e-05' - min: '-1.425e-01' + device: cuda:0 + max: '1.186e-01' + mean: '-5.228e-06' + min: '-1.308e-01' shape: - 256 - 128 - 3 - 3 - sum: '1.403e+01' + sum: '-1.542e+00' network.layer3.0.conv2.weight: - device: cpu - max: '1.464e-01' - mean: '3.416e-05' - min: '-1.367e-01' + device: cuda:0 + max: '1.360e-01' + mean: '-1.566e-05' + min: '-1.442e-01' shape: - 256 - 256 - 3 - 3 - sum: '2.015e+01' + sum: '-9.235e+00' network.layer3.0.downsample.0.weight: - device: cpu - max: '3.724e-01' - mean: '-3.193e-04' - min: '-4.37e-01' + device: cuda:0 + max: '4.034e-01' + mean: '-7.003e-06' + min: '-3.510e-01' shape: - 256 - 128 - 1 - 1 - sum: '-1.046e+01' + sum: '-2.295e-01' network.layer3.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -635,14 +635,14 @@ network.layer3.0.downsample.1.bias: - 256 sum: '0.e+00' network.layer3.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -650,7 +650,7 @@ network.layer3.0.downsample.1.running_mean: - 256 
sum: '0.e+00' network.layer3.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -658,7 +658,7 @@ network.layer3.0.downsample.1.running_var: - 256 sum: '2.56e+02' network.layer3.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -666,7 +666,7 @@ network.layer3.0.downsample.1.weight: - 256 sum: '2.56e+02' network.layer3.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -674,14 +674,14 @@ network.layer3.1.bn1.bias: - 256 sum: '0.e+00' network.layer3.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -689,7 +689,7 @@ network.layer3.1.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -697,7 +697,7 @@ network.layer3.1.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -705,7 +705,7 @@ network.layer3.1.bn1.weight: - 256 sum: '2.56e+02' network.layer3.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -713,14 +713,14 @@ network.layer3.1.bn2.bias: - 256 sum: '0.e+00' network.layer3.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -728,7 +728,7 @@ network.layer3.1.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -736,7 +736,7 @@ network.layer3.1.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -744,29 +744,29 @@ network.layer3.1.bn2.weight: - 256 sum: '2.56e+02' network.layer3.1.conv1.weight: - device: cpu - max: '1.478e-01' - mean: '-4.980e-05' - min: '-1.411e-01' + device: cuda:0 + max: '1.435e-01' + mean: '1.374e-05' + min: '-1.476e-01' shape: - 256 - 256 - 3 - 3 - sum: '-2.938e+01' + sum: '8.106e+00' network.layer3.1.conv2.weight: - device: cpu - max: '1.369e-01' - mean: '-3.677e-05' - min: '-1.348e-01' + device: cuda:0 + max: '1.273e-01' + mean: '8.978e-05' + min: '-1.346e-01' shape: - 256 - 256 - 3 - 3 - sum: '-2.169e+01' + sum: '5.295e+01' network.layer4.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -774,14 +774,14 @@ network.layer4.0.bn1.bias: - 512 sum: '0.e+00' network.layer4.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -789,7 +789,7 @@ network.layer4.0.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -797,7 +797,7 @@ network.layer4.0.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -805,7 +805,7 @@ network.layer4.0.bn1.weight: - 512 sum: '5.12e+02' network.layer4.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -813,14 +813,14 @@ network.layer4.0.bn2.bias: - 512 
sum: '0.e+00' network.layer4.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -828,7 +828,7 @@ network.layer4.0.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -836,7 +836,7 @@ network.layer4.0.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -844,40 +844,40 @@ network.layer4.0.bn2.weight: - 512 sum: '5.12e+02' network.layer4.0.conv1.weight: - device: cpu - max: '9.989e-02' - mean: '-7.283e-06' - min: '-1.006e-01' + device: cuda:0 + max: '1.020e-01' + mean: '-2.986e-06' + min: '-1.011e-01' shape: - 512 - 256 - 3 - 3 - sum: '-8.591e+00' + sum: '-3.522e+00' network.layer4.0.conv2.weight: - device: cpu - max: '1.023e-01' - mean: '2.838e-06' - min: '-1.135e-01' + device: cuda:0 + max: '1.049e-01' + mean: '-2.121e-05' + min: '-1.011e-01' shape: - 512 - 512 - 3 - 3 - sum: '6.696e+00' + sum: '-5.004e+01' network.layer4.0.downsample.0.weight: - device: cpu - max: '2.664e-01' - mean: '1.458e-04' - min: '-2.861e-01' + device: cuda:0 + max: '2.638e-01' + mean: '-1.538e-05' + min: '-2.893e-01' shape: - 512 - 256 - 1 - 1 - sum: '1.911e+01' + sum: '-2.016e+00' network.layer4.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -885,14 +885,14 @@ network.layer4.0.downsample.1.bias: - 512 sum: '0.e+00' network.layer4.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -900,7 +900,7 @@ network.layer4.0.downsample.1.running_mean: - 512 sum: '0.e+00' network.layer4.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -908,7 +908,7 @@ network.layer4.0.downsample.1.running_var: - 512 sum: '5.12e+02' network.layer4.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -916,7 +916,7 @@ network.layer4.0.downsample.1.weight: - 512 sum: '5.12e+02' network.layer4.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -924,14 +924,14 @@ network.layer4.1.bn1.bias: - 512 sum: '0.e+00' network.layer4.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -939,7 +939,7 @@ network.layer4.1.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -947,7 +947,7 @@ network.layer4.1.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -955,7 +955,7 @@ network.layer4.1.bn1.weight: - 512 sum: '5.12e+02' network.layer4.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -963,14 +963,14 @@ network.layer4.1.bn2.bias: - 512 sum: '0.e+00' network.layer4.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' 
min: '0.e+00' @@ -978,7 +978,7 @@ network.layer4.1.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -986,7 +986,7 @@ network.layer4.1.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -994,24 +994,24 @@ network.layer4.1.bn2.weight: - 512 sum: '5.12e+02' network.layer4.1.conv1.weight: - device: cpu - max: '1.172e-01' - mean: '-1.526e-05' - min: '-1.015e-01' + device: cuda:0 + max: '1.056e-01' + mean: '4.031e-06' + min: '-1.011e-01' shape: - 512 - 512 - 3 - 3 - sum: '-3.601e+01' + sum: '9.511e+00' network.layer4.1.conv2.weight: - device: cpu - max: '9.908e-02' - mean: '8.558e-06' - min: '-1.071e-01' + device: cuda:0 + max: '1.072e-01' + mean: '-1.993e-05' + min: '-9.954e-02' shape: - 512 - 512 - 3 - 3 - sum: '2.019e+01' + sum: '-4.701e+01' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml new file mode 100644 index 00000000..a3a1a99d --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet18_imagenet_image_classifier.yaml @@ -0,0 +1,1017 @@ +network.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.conv1.weight: + device: cuda:0 + max: '9.327e-02' + mean: '4.984e-04' + min: '-1.072e-01' + shape: + - 64 + - 3 + - 7 + - 7 + sum: '4.689e+00' +network.fc.bias: + device: cuda:0 + max: '4.419e-02' + mean: '1.212e-06' + min: '-4.419e-02' + shape: + - 1000 + sum: '1.212e-03' +network.fc.weight: + device: cuda:0 + max: '4.419e-02' + mean: '-6.997e-07' + min: '-4.419e-02' + shape: + - 1000 + - 512 + sum: '-3.583e-01' +network.layer1.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn2.running_var: + device: cuda:0 + 
max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.conv1.weight: + device: cuda:0 + max: '2.442e-01' + mean: '1.259e-04' + min: '-2.666e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '4.642e+00' +network.layer1.0.conv2.weight: + device: cuda:0 + max: '2.456e-01' + mean: '1.807e-04' + min: '-2.376e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '6.660e+00' +network.layer1.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.conv1.weight: + device: cuda:0 + max: '2.338e-01' + mean: '-3.408e-04' + min: '-2.402e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-1.256e+01' +network.layer1.1.conv2.weight: + device: cuda:0 + max: '2.224e-01' + mean: '2.189e-04' + min: '-2.588e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '8.07e+00' +network.layer2.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.conv1.weight: + device: cuda:0 + max: '2.008e-01' + mean: '8.513e-05' + min: '-1.854e-01' + shape: + - 128 + - 
64 + - 3 + - 3 + sum: '6.276e+00' +network.layer2.0.conv2.weight: + device: cuda:0 + max: '1.766e-01' + mean: '1.21e-04' + min: '-1.79e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '1.784e+01' +network.layer2.0.downsample.0.weight: + device: cuda:0 + max: '5.054e-01' + mean: '-9.048e-04' + min: '-4.751e-01' + shape: + - 128 + - 64 + - 1 + - 1 + sum: '-7.412e+00' +network.layer2.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.conv1.weight: + device: cuda:0 + max: '1.714e-01' + mean: '6.508e-05' + min: '-1.811e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '9.597e+00' +network.layer2.1.conv2.weight: + device: cuda:0 + max: '1.677e-01' + mean: '-1.988e-05' + min: '-1.746e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-2.932e+00' +network.layer3.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: 
[] + sum: 0 +network.layer3.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.conv1.weight: + device: cuda:0 + max: '1.360e-01' + mean: '3.475e-05' + min: '-1.442e-01' + shape: + - 256 + - 128 + - 3 + - 3 + sum: '1.025e+01' +network.layer3.0.conv2.weight: + device: cuda:0 + max: '1.345e-01' + mean: '-1.856e-05' + min: '-1.299e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-1.095e+01' +network.layer3.0.downsample.0.weight: + device: cuda:0 + max: '3.523e-01' + mean: '1.2e-04' + min: '-3.863e-01' + shape: + - 256 + - 128 + - 1 + - 1 + sum: '3.931e+00' +network.layer3.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.conv1.weight: + device: cuda:0 + max: '1.395e-01' + mean: '6.754e-05' + min: '-1.476e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '3.984e+01' +network.layer3.1.conv2.weight: + device: cuda:0 + max: '1.443e-01' + mean: '4.953e-05' + min: '-1.376e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '2.921e+01' +network.layer4.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + 
min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.conv1.weight: + device: cuda:0 + max: '1.003e-01' + mean: '-1.587e-05' + min: '-1.011e-01' + shape: + - 512 + - 256 + - 3 + - 3 + sum: '-1.872e+01' +network.layer4.0.conv2.weight: + device: cuda:0 + max: '1.049e-01' + mean: '-1.442e-05' + min: '-1.011e-01' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-3.403e+01' +network.layer4.0.downsample.0.weight: + device: cuda:0 + max: '2.673e-01' + mean: '2.869e-04' + min: '-3.001e-01' + shape: + - 512 + - 256 + - 1 + - 1 + sum: '3.761e+01' +network.layer4.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.conv1.weight: + device: cuda:0 + max: '1.056e-01' + mean: '1.585e-06' + min: '-1.011e-01' + 
shape: + - 512 + - 512 + - 3 + - 3 + sum: '3.74e+00' +network.layer4.1.conv2.weight: + device: cuda:0 + max: '1.072e-01' + mean: '-2.285e-05' + min: '-1.042e-01' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-5.392e+01' diff --git a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet50_cifar10_example.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml similarity index 77% rename from .regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet50_cifar10_example.yaml rename to .regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml index e6ed0e92..d0fb1b94 100644 --- a/.regression_files/project/algorithms/example_test/test_initialization_is_reproducible/cpu/resnet50_cifar10_example.yaml +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_cifar10_image_classifier.yaml @@ -1,5 +1,5 @@ network.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -7,14 +7,14 @@ network.bn1.bias: - 64 sum: '0.e+00' network.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -22,7 +22,7 @@ network.bn1.running_mean: - 64 sum: '0.e+00' network.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -30,7 +30,7 @@ network.bn1.running_var: - 64 sum: '6.4e+01' network.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -38,35 +38,35 @@ network.bn1.weight: - 64 sum: '6.4e+01' network.conv1.weight: - device: cpu - max: '1.063e-01' - mean: '4.928e-04' - min: '-9.805e-02' + device: cuda:0 + max: '9.646e-02' + mean: '3.162e-04' + min: '-9.585e-02' shape: - 64 - 3 - 7 - 7 - sum: '4.636e+00' + sum: '2.975e+00' network.fc.bias: - device: cpu - max: '2.104e-02' - mean: '3.192e-04' - min: '-2.160e-02' + device: cuda:0 + max: '2.199e-02' + mean: '3.231e-03' + min: '-2.176e-02' shape: - 10 - sum: '3.192e-03' + sum: '3.231e-02' network.fc.weight: - device: cpu - max: '2.209e-02' - mean: '1.247e-04' + device: cuda:0 + max: '2.21e-02' + mean: '-7.184e-06' min: '-2.21e-02' shape: - 10 - 2048 - sum: '2.554e+00' + sum: '-1.471e-01' network.layer1.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -74,14 +74,14 @@ network.layer1.0.bn1.bias: - 64 sum: '0.e+00' network.layer1.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -89,7 +89,7 @@ network.layer1.0.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -97,7 +97,7 @@ network.layer1.0.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -105,7 +105,7 @@ network.layer1.0.bn1.weight: - 64 sum: '6.4e+01' network.layer1.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -113,14 +113,14 @@ network.layer1.0.bn2.bias: - 64 sum: '0.e+00' network.layer1.0.bn2.num_batches_tracked: - device: cpu + 
device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -128,7 +128,7 @@ network.layer1.0.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -136,7 +136,7 @@ network.layer1.0.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -144,7 +144,7 @@ network.layer1.0.bn2.weight: - 64 sum: '6.4e+01' network.layer1.0.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -152,14 +152,14 @@ network.layer1.0.bn3.bias: - 256 sum: '0.e+00' network.layer1.0.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -167,7 +167,7 @@ network.layer1.0.bn3.running_mean: - 256 sum: '0.e+00' network.layer1.0.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -175,7 +175,7 @@ network.layer1.0.bn3.running_var: - 256 sum: '2.56e+02' network.layer1.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -183,51 +183,51 @@ network.layer1.0.bn3.weight: - 256 sum: '2.56e+02' network.layer1.0.conv1.weight: - device: cpu - max: '5.941e-01' - mean: '-1.580e-03' - min: '-6.47e-01' + device: cuda:0 + max: '7.081e-01' + mean: '-3.220e-03' + min: '-6.607e-01' shape: - 64 - 64 - 1 - 1 - sum: '-6.472e+00' + sum: '-1.319e+01' network.layer1.0.conv2.weight: - device: cpu - max: '2.475e-01' - mean: '1.651e-05' - min: '-2.377e-01' + device: cuda:0 + max: '2.489e-01' + mean: '-3.557e-04' + min: '-2.330e-01' shape: - 64 - 64 - 3 - 3 - sum: '6.087e-01' + sum: '-1.311e+01' network.layer1.0.conv3.weight: - device: cpu - max: '3.290e-01' - mean: '-1.486e-04' - min: '-3.494e-01' + device: cuda:0 + max: '3.157e-01' + mean: '2.669e-04' + min: '-3.577e-01' shape: - 256 - 64 - 1 - 1 - sum: '-2.435e+00' + sum: '4.374e+00' network.layer1.0.downsample.0.weight: - device: cpu - max: '3.666e-01' - mean: '3.372e-04' - min: '-3.401e-01' + device: cuda:0 + max: '3.370e-01' + mean: '4.294e-04' + min: '-3.389e-01' shape: - 256 - 64 - 1 - 1 - sum: '5.525e+00' + sum: '7.036e+00' network.layer1.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -235,14 +235,14 @@ network.layer1.0.downsample.1.bias: - 256 sum: '0.e+00' network.layer1.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -250,7 +250,7 @@ network.layer1.0.downsample.1.running_mean: - 256 sum: '0.e+00' network.layer1.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -258,7 +258,7 @@ network.layer1.0.downsample.1.running_var: - 256 sum: '2.56e+02' network.layer1.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -266,7 +266,7 @@ network.layer1.0.downsample.1.weight: - 256 sum: '2.56e+02' network.layer1.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -274,14 +274,14 @@ network.layer1.1.bn1.bias: - 64 sum: '0.e+00' network.layer1.1.bn1.num_batches_tracked: - device: 
cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -289,7 +289,7 @@ network.layer1.1.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -297,7 +297,7 @@ network.layer1.1.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -305,7 +305,7 @@ network.layer1.1.bn1.weight: - 64 sum: '6.4e+01' network.layer1.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -313,14 +313,14 @@ network.layer1.1.bn2.bias: - 64 sum: '0.e+00' network.layer1.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -328,7 +328,7 @@ network.layer1.1.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -336,7 +336,7 @@ network.layer1.1.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -344,7 +344,7 @@ network.layer1.1.bn2.weight: - 64 sum: '6.4e+01' network.layer1.1.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -352,14 +352,14 @@ network.layer1.1.bn3.bias: - 256 sum: '0.e+00' network.layer1.1.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.1.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -367,7 +367,7 @@ network.layer1.1.bn3.running_mean: - 256 sum: '0.e+00' network.layer1.1.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -375,7 +375,7 @@ network.layer1.1.bn3.running_var: - 256 sum: '2.56e+02' network.layer1.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -383,40 +383,40 @@ network.layer1.1.bn3.weight: - 256 sum: '2.56e+02' network.layer1.1.conv1.weight: - device: cpu - max: '6.431e-01' - mean: '-6.870e-05' - min: '-7.341e-01' + device: cuda:0 + max: '7.008e-01' + mean: '3.792e-04' + min: '-6.543e-01' shape: - 64 - 256 - 1 - 1 - sum: '-1.126e+00' + sum: '6.214e+00' network.layer1.1.conv2.weight: - device: cpu - max: '2.367e-01' - mean: '-7.922e-05' - min: '-2.362e-01' + device: cuda:0 + max: '2.569e-01' + mean: '-2.808e-06' + min: '-2.296e-01' shape: - 64 - 64 - 3 - 3 - sum: '-2.920e+00' + sum: '-1.035e-01' network.layer1.1.conv3.weight: - device: cpu - max: '3.581e-01' - mean: '3.216e-04' - min: '-3.573e-01' + device: cuda:0 + max: '3.335e-01' + mean: '-1.113e-03' + min: '-3.427e-01' shape: - 256 - 64 - 1 - 1 - sum: '5.268e+00' + sum: '-1.824e+01' network.layer1.2.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -424,14 +424,14 @@ network.layer1.2.bn1.bias: - 64 sum: '0.e+00' network.layer1.2.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.2.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -439,7 +439,7 @@ network.layer1.2.bn1.running_mean: - 64 sum: '0.e+00' network.layer1.2.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' 
mean: '1.e+00' min: '1.e+00' @@ -447,7 +447,7 @@ network.layer1.2.bn1.running_var: - 64 sum: '6.4e+01' network.layer1.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -455,7 +455,7 @@ network.layer1.2.bn1.weight: - 64 sum: '6.4e+01' network.layer1.2.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -463,14 +463,14 @@ network.layer1.2.bn2.bias: - 64 sum: '0.e+00' network.layer1.2.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.2.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -478,7 +478,7 @@ network.layer1.2.bn2.running_mean: - 64 sum: '0.e+00' network.layer1.2.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -486,7 +486,7 @@ network.layer1.2.bn2.running_var: - 64 sum: '6.4e+01' network.layer1.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -494,7 +494,7 @@ network.layer1.2.bn2.weight: - 64 sum: '6.4e+01' network.layer1.2.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -502,14 +502,14 @@ network.layer1.2.bn3.bias: - 256 sum: '0.e+00' network.layer1.2.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer1.2.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -517,7 +517,7 @@ network.layer1.2.bn3.running_mean: - 256 sum: '0.e+00' network.layer1.2.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -525,7 +525,7 @@ network.layer1.2.bn3.running_var: - 256 sum: '2.56e+02' network.layer1.2.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -533,40 +533,40 @@ network.layer1.2.bn3.weight: - 256 sum: '2.56e+02' network.layer1.2.conv1.weight: - device: cpu - max: '6.670e-01' - mean: '-1.511e-03' - min: '-7.024e-01' + device: cuda:0 + max: '7.078e-01' + mean: '2.205e-03' + min: '-6.688e-01' shape: - 64 - 256 - 1 - 1 - sum: '-2.476e+01' + sum: '3.613e+01' network.layer1.2.conv2.weight: - device: cpu - max: '2.378e-01' - mean: '-2.972e-04' - min: '-2.387e-01' + device: cuda:0 + max: '2.568e-01' + mean: '2.909e-04' + min: '-2.361e-01' shape: - 64 - 64 - 3 - 3 - sum: '-1.095e+01' + sum: '1.072e+01' network.layer1.2.conv3.weight: - device: cpu - max: '3.828e-01' - mean: '-2.277e-04' - min: '-3.256e-01' + device: cuda:0 + max: '3.423e-01' + mean: '-6.033e-04' + min: '-3.476e-01' shape: - 256 - 64 - 1 - 1 - sum: '-3.730e+00' + sum: '-9.884e+00' network.layer2.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -574,14 +574,14 @@ network.layer2.0.bn1.bias: - 128 sum: '0.e+00' network.layer2.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -589,7 +589,7 @@ network.layer2.0.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -597,7 +597,7 @@ network.layer2.0.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -605,7 +605,7 @@ network.layer2.0.bn1.weight: - 128 sum: '1.28e+02' network.layer2.0.bn2.bias: - device: cpu + device: 
cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -613,14 +613,14 @@ network.layer2.0.bn2.bias: - 128 sum: '0.e+00' network.layer2.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -628,7 +628,7 @@ network.layer2.0.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -636,7 +636,7 @@ network.layer2.0.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -644,7 +644,7 @@ network.layer2.0.bn2.weight: - 128 sum: '1.28e+02' network.layer2.0.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -652,14 +652,14 @@ network.layer2.0.bn3.bias: - 512 sum: '0.e+00' network.layer2.0.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -667,7 +667,7 @@ network.layer2.0.bn3.running_mean: - 512 sum: '0.e+00' network.layer2.0.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -675,7 +675,7 @@ network.layer2.0.bn3.running_var: - 512 sum: '5.12e+02' network.layer2.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -683,51 +683,51 @@ network.layer2.0.bn3.weight: - 512 sum: '5.12e+02' network.layer2.0.conv1.weight: - device: cpu - max: '4.811e-01' - mean: '1.971e-04' - min: '-5.037e-01' + device: cuda:0 + max: '5.195e-01' + mean: '7.903e-06' + min: '-5.187e-01' shape: - 128 - 256 - 1 - 1 - sum: '6.458e+00' + sum: '2.59e-01' network.layer2.0.conv2.weight: - device: cpu - max: '1.834e-01' - mean: '-1.511e-05' - min: '-1.870e-01' + device: cuda:0 + max: '1.880e-01' + mean: '2.495e-04' + min: '-1.736e-01' shape: - 128 - 128 - 3 - 3 - sum: '-2.228e+00' + sum: '3.678e+01' network.layer2.0.conv3.weight: - device: cpu - max: '2.532e-01' - mean: '-9.596e-05' - min: '-2.615e-01' + device: cuda:0 + max: '2.546e-01' + mean: '2.444e-04' + min: '-2.541e-01' shape: - 512 - 128 - 1 - 1 - sum: '-6.289e+00' + sum: '1.602e+01' network.layer2.0.downsample.0.weight: - device: cpu - max: '2.66e-01' - mean: '3.258e-04' - min: '-2.709e-01' + device: cuda:0 + max: '3.065e-01' + mean: '3.991e-05' + min: '-2.480e-01' shape: - 512 - 256 - 1 - 1 - sum: '4.270e+01' + sum: '5.231e+00' network.layer2.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -735,14 +735,14 @@ network.layer2.0.downsample.1.bias: - 512 sum: '0.e+00' network.layer2.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -750,7 +750,7 @@ network.layer2.0.downsample.1.running_mean: - 512 sum: '0.e+00' network.layer2.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -758,7 +758,7 @@ network.layer2.0.downsample.1.running_var: - 512 sum: '5.12e+02' network.layer2.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -766,7 +766,7 @@ network.layer2.0.downsample.1.weight: - 512 sum: '5.12e+02' network.layer2.1.bn1.bias: - device: cpu 
+ device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -774,14 +774,14 @@ network.layer2.1.bn1.bias: - 128 sum: '0.e+00' network.layer2.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -789,7 +789,7 @@ network.layer2.1.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -797,7 +797,7 @@ network.layer2.1.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -805,7 +805,7 @@ network.layer2.1.bn1.weight: - 128 sum: '1.28e+02' network.layer2.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -813,14 +813,14 @@ network.layer2.1.bn2.bias: - 128 sum: '0.e+00' network.layer2.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -828,7 +828,7 @@ network.layer2.1.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -836,7 +836,7 @@ network.layer2.1.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -844,7 +844,7 @@ network.layer2.1.bn2.weight: - 128 sum: '1.28e+02' network.layer2.1.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -852,14 +852,14 @@ network.layer2.1.bn3.bias: - 512 sum: '0.e+00' network.layer2.1.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.1.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -867,7 +867,7 @@ network.layer2.1.bn3.running_mean: - 512 sum: '0.e+00' network.layer2.1.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -875,7 +875,7 @@ network.layer2.1.bn3.running_var: - 512 sum: '5.12e+02' network.layer2.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -883,40 +883,40 @@ network.layer2.1.bn3.weight: - 512 sum: '5.12e+02' network.layer2.1.conv1.weight: - device: cpu - max: '5.121e-01' - mean: '-1.819e-04' - min: '-5.277e-01' + device: cuda:0 + max: '5.655e-01' + mean: '-1.772e-04' + min: '-5.812e-01' shape: - 128 - 512 - 1 - 1 - sum: '-1.192e+01' + sum: '-1.161e+01' network.layer2.1.conv2.weight: - device: cpu - max: '1.973e-01' - mean: '6.795e-05' - min: '-1.822e-01' + device: cuda:0 + max: '1.912e-01' + mean: '-1.939e-04' + min: '-1.828e-01' shape: - 128 - 128 - 3 - 3 - sum: '1.002e+01' + sum: '-2.859e+01' network.layer2.1.conv3.weight: - device: cpu - max: '2.505e-01' - mean: '-7.241e-04' - min: '-2.531e-01' + device: cuda:0 + max: '2.647e-01' + mean: '1.202e-04' + min: '-2.835e-01' shape: - 512 - 128 - 1 - 1 - sum: '-4.745e+01' + sum: '7.879e+00' network.layer2.2.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -924,14 +924,14 @@ network.layer2.2.bn1.bias: - 128 sum: '0.e+00' network.layer2.2.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.2.bn1.running_mean: - device: cpu + device: cuda:0 
max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -939,7 +939,7 @@ network.layer2.2.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.2.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -947,7 +947,7 @@ network.layer2.2.bn1.running_var: - 128 sum: '1.28e+02' network.layer2.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -955,7 +955,7 @@ network.layer2.2.bn1.weight: - 128 sum: '1.28e+02' network.layer2.2.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -963,14 +963,14 @@ network.layer2.2.bn2.bias: - 128 sum: '0.e+00' network.layer2.2.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.2.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -978,7 +978,7 @@ network.layer2.2.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.2.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -986,7 +986,7 @@ network.layer2.2.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -994,7 +994,7 @@ network.layer2.2.bn2.weight: - 128 sum: '1.28e+02' network.layer2.2.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1002,14 +1002,14 @@ network.layer2.2.bn3.bias: - 512 sum: '0.e+00' network.layer2.2.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.2.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1017,7 +1017,7 @@ network.layer2.2.bn3.running_mean: - 512 sum: '0.e+00' network.layer2.2.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1025,7 +1025,7 @@ network.layer2.2.bn3.running_var: - 512 sum: '5.12e+02' network.layer2.2.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1033,40 +1033,40 @@ network.layer2.2.bn3.weight: - 512 sum: '5.12e+02' network.layer2.2.conv1.weight: - device: cpu - max: '5.326e-01' - mean: '2.855e-04' - min: '-4.874e-01' + device: cuda:0 + max: '5.352e-01' + mean: '1.514e-04' + min: '-4.77e-01' shape: - 128 - 512 - 1 - 1 - sum: '1.871e+01' + sum: '9.922e+00' network.layer2.2.conv2.weight: - device: cpu - max: '1.926e-01' - mean: '1.28e-05' - min: '-1.865e-01' + device: cuda:0 + max: '1.992e-01' + mean: '-3.131e-05' + min: '-1.781e-01' shape: - 128 - 128 - 3 - 3 - sum: '1.887e+00' + sum: '-4.617e+00' network.layer2.2.conv3.weight: - device: cpu - max: '2.606e-01' - mean: '-1.18e-04' - min: '-2.621e-01' + device: cuda:0 + max: '3.018e-01' + mean: '8.808e-05' + min: '-2.617e-01' shape: - 512 - 128 - 1 - 1 - sum: '-7.731e+00' + sum: '5.772e+00' network.layer2.3.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1074,14 +1074,14 @@ network.layer2.3.bn1.bias: - 128 sum: '0.e+00' network.layer2.3.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.3.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1089,7 +1089,7 @@ network.layer2.3.bn1.running_mean: - 128 sum: '0.e+00' network.layer2.3.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1097,7 +1097,7 @@ network.layer2.3.bn1.running_var: - 128 sum: '1.28e+02' 
network.layer2.3.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1105,7 +1105,7 @@ network.layer2.3.bn1.weight: - 128 sum: '1.28e+02' network.layer2.3.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1113,14 +1113,14 @@ network.layer2.3.bn2.bias: - 128 sum: '0.e+00' network.layer2.3.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.3.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1128,7 +1128,7 @@ network.layer2.3.bn2.running_mean: - 128 sum: '0.e+00' network.layer2.3.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1136,7 +1136,7 @@ network.layer2.3.bn2.running_var: - 128 sum: '1.28e+02' network.layer2.3.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1144,7 +1144,7 @@ network.layer2.3.bn2.weight: - 128 sum: '1.28e+02' network.layer2.3.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1152,14 +1152,14 @@ network.layer2.3.bn3.bias: - 512 sum: '0.e+00' network.layer2.3.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer2.3.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1167,7 +1167,7 @@ network.layer2.3.bn3.running_mean: - 512 sum: '0.e+00' network.layer2.3.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1175,7 +1175,7 @@ network.layer2.3.bn3.running_var: - 512 sum: '5.12e+02' network.layer2.3.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1183,40 +1183,40 @@ network.layer2.3.bn3.weight: - 512 sum: '5.12e+02' network.layer2.3.conv1.weight: - device: cpu - max: '5.012e-01' - mean: '-7.271e-04' - min: '-5.501e-01' + device: cuda:0 + max: '5.314e-01' + mean: '-3.536e-04' + min: '-5.475e-01' shape: - 128 - 512 - 1 - 1 - sum: '-4.765e+01' + sum: '-2.318e+01' network.layer2.3.conv2.weight: - device: cpu - max: '1.814e-01' - mean: '5.993e-05' - min: '-2.048e-01' + device: cuda:0 + max: '1.754e-01' + mean: '7.783e-05' + min: '-1.808e-01' shape: - 128 - 128 - 3 - 3 - sum: '8.837e+00' + sum: '1.148e+01' network.layer2.3.conv3.weight: - device: cpu - max: '2.943e-01' - mean: '-2.147e-04' - min: '-2.827e-01' + device: cuda:0 + max: '2.382e-01' + mean: '-1.054e-05' + min: '-2.517e-01' shape: - 512 - 128 - 1 - 1 - sum: '-1.407e+01' + sum: '-6.906e-01' network.layer3.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1224,14 +1224,14 @@ network.layer3.0.bn1.bias: - 256 sum: '0.e+00' network.layer3.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1239,7 +1239,7 @@ network.layer3.0.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1247,7 +1247,7 @@ network.layer3.0.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1255,7 +1255,7 @@ network.layer3.0.bn1.weight: - 256 sum: '2.56e+02' network.layer3.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1263,14 
+1263,14 @@ network.layer3.0.bn2.bias: - 256 sum: '0.e+00' network.layer3.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1278,7 +1278,7 @@ network.layer3.0.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1286,7 +1286,7 @@ network.layer3.0.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1294,7 +1294,7 @@ network.layer3.0.bn2.weight: - 256 sum: '2.56e+02' network.layer3.0.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1302,14 +1302,14 @@ network.layer3.0.bn3.bias: - 1024 sum: '0.e+00' network.layer3.0.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1317,7 +1317,7 @@ network.layer3.0.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.0.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1325,7 +1325,7 @@ network.layer3.0.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1333,51 +1333,51 @@ network.layer3.0.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.0.conv1.weight: - device: cpu - max: '3.887e-01' - mean: '2.347e-04' - min: '-3.860e-01' + device: cuda:0 + max: '3.667e-01' + mean: '-1.312e-04' + min: '-3.741e-01' shape: - 256 - 512 - 1 - 1 - sum: '3.076e+01' + sum: '-1.72e+01' network.layer3.0.conv2.weight: - device: cpu - max: '1.372e-01' - mean: '-1.56e-05' - min: '-1.419e-01' + device: cuda:0 + max: '1.525e-01' + mean: '3.130e-05' + min: '-1.458e-01' shape: - 256 - 256 - 3 - 3 - sum: '-9.199e+00' + sum: '1.846e+01' network.layer3.0.conv3.weight: - device: cpu - max: '1.974e-01' - mean: '-2.099e-05' - min: '-2.157e-01' + device: cuda:0 + max: '2.06e-01' + mean: '1.398e-05' + min: '-2.206e-01' shape: - 1024 - 256 - 1 - 1 - sum: '-5.501e+00' + sum: '3.665e+00' network.layer3.0.downsample.0.weight: - device: cpu - max: '2.111e-01' - mean: '-1.147e-05' - min: '-2.026e-01' + device: cuda:0 + max: '1.988e-01' + mean: '2.828e-05' + min: '-2.006e-01' shape: - 1024 - 512 - 1 - 1 - sum: '-6.012e+00' + sum: '1.483e+01' network.layer3.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1385,14 +1385,14 @@ network.layer3.0.downsample.1.bias: - 1024 sum: '0.e+00' network.layer3.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1400,7 +1400,7 @@ network.layer3.0.downsample.1.running_mean: - 1024 sum: '0.e+00' network.layer3.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1408,7 +1408,7 @@ network.layer3.0.downsample.1.running_var: - 1024 sum: '1.024e+03' network.layer3.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1416,7 +1416,7 @@ network.layer3.0.downsample.1.weight: - 1024 sum: '1.024e+03' network.layer3.1.bn1.bias: - device: cpu + device: cuda:0 
max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1424,14 +1424,14 @@ network.layer3.1.bn1.bias: - 256 sum: '0.e+00' network.layer3.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1439,7 +1439,7 @@ network.layer3.1.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.1.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1447,7 +1447,7 @@ network.layer3.1.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1455,7 +1455,7 @@ network.layer3.1.bn1.weight: - 256 sum: '2.56e+02' network.layer3.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1463,14 +1463,14 @@ network.layer3.1.bn2.bias: - 256 sum: '0.e+00' network.layer3.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1478,7 +1478,7 @@ network.layer3.1.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1486,7 +1486,7 @@ network.layer3.1.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1494,7 +1494,7 @@ network.layer3.1.bn2.weight: - 256 sum: '2.56e+02' network.layer3.1.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1502,14 +1502,14 @@ network.layer3.1.bn3.bias: - 1024 sum: '0.e+00' network.layer3.1.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.1.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1517,7 +1517,7 @@ network.layer3.1.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.1.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1525,7 +1525,7 @@ network.layer3.1.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1533,40 +1533,40 @@ network.layer3.1.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.1.conv1.weight: - device: cpu - max: '4.004e-01' - mean: '1.076e-04' - min: '-3.917e-01' + device: cuda:0 + max: '3.843e-01' + mean: '2.675e-04' + min: '-3.99e-01' shape: - 256 - 1024 - 1 - 1 - sum: '2.822e+01' + sum: '7.013e+01' network.layer3.1.conv2.weight: - device: cpu - max: '1.322e-01' - mean: '-7.433e-06' - min: '-1.435e-01' + device: cuda:0 + max: '1.38e-01' + mean: '-3.53e-06' + min: '-1.294e-01' shape: - 256 - 256 - 3 - 3 - sum: '-4.384e+00' + sum: '-2.082e+00' network.layer3.1.conv3.weight: - device: cpu - max: '2.148e-01' - mean: '-2.367e-05' - min: '-2.066e-01' + device: cuda:0 + max: '2.052e-01' + mean: '-7.496e-06' + min: '-1.973e-01' shape: - 1024 - 256 - 1 - 1 - sum: '-6.205e+00' + sum: '-1.965e+00' network.layer3.2.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1574,14 +1574,14 @@ network.layer3.2.bn1.bias: - 256 sum: '0.e+00' network.layer3.2.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.2.bn1.running_mean: - device: cpu + 
device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1589,7 +1589,7 @@ network.layer3.2.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.2.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1597,7 +1597,7 @@ network.layer3.2.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1605,7 +1605,7 @@ network.layer3.2.bn1.weight: - 256 sum: '2.56e+02' network.layer3.2.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1613,14 +1613,14 @@ network.layer3.2.bn2.bias: - 256 sum: '0.e+00' network.layer3.2.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.2.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1628,7 +1628,7 @@ network.layer3.2.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.2.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1636,7 +1636,7 @@ network.layer3.2.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1644,7 +1644,7 @@ network.layer3.2.bn2.weight: - 256 sum: '2.56e+02' network.layer3.2.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1652,14 +1652,14 @@ network.layer3.2.bn3.bias: - 1024 sum: '0.e+00' network.layer3.2.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.2.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1667,7 +1667,7 @@ network.layer3.2.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.2.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1675,7 +1675,7 @@ network.layer3.2.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.2.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1683,40 +1683,40 @@ network.layer3.2.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.2.conv1.weight: - device: cpu - max: '4.098e-01' - mean: '7.033e-06' - min: '-4.186e-01' + device: cuda:0 + max: '4.040e-01' + mean: '5.938e-06' + min: '-4.109e-01' shape: - 256 - 1024 - 1 - 1 - sum: '1.844e+00' + sum: '1.557e+00' network.layer3.2.conv2.weight: - device: cpu - max: '1.384e-01' - mean: '5.707e-05' - min: '-1.45e-01' + device: cuda:0 + max: '1.381e-01' + mean: '-1.49e-05' + min: '-1.505e-01' shape: - 256 - 256 - 3 - 3 - sum: '3.366e+01' + sum: '-8.787e+00' network.layer3.2.conv3.weight: - device: cpu - max: '1.963e-01' - mean: '-1.181e-05' - min: '-1.884e-01' + device: cuda:0 + max: '1.964e-01' + mean: '8.209e-05' + min: '-1.861e-01' shape: - 1024 - 256 - 1 - 1 - sum: '-3.096e+00' + sum: '2.152e+01' network.layer3.3.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1724,14 +1724,14 @@ network.layer3.3.bn1.bias: - 256 sum: '0.e+00' network.layer3.3.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.3.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1739,7 +1739,7 @@ network.layer3.3.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.3.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1747,7 +1747,7 @@ 
network.layer3.3.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.3.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1755,7 +1755,7 @@ network.layer3.3.bn1.weight: - 256 sum: '2.56e+02' network.layer3.3.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1763,14 +1763,14 @@ network.layer3.3.bn2.bias: - 256 sum: '0.e+00' network.layer3.3.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.3.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1778,7 +1778,7 @@ network.layer3.3.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.3.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1786,7 +1786,7 @@ network.layer3.3.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.3.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1794,7 +1794,7 @@ network.layer3.3.bn2.weight: - 256 sum: '2.56e+02' network.layer3.3.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1802,14 +1802,14 @@ network.layer3.3.bn3.bias: - 1024 sum: '0.e+00' network.layer3.3.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.3.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1817,7 +1817,7 @@ network.layer3.3.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.3.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1825,7 +1825,7 @@ network.layer3.3.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.3.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1833,40 +1833,40 @@ network.layer3.3.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.3.conv1.weight: - device: cpu - max: '4.032e-01' - mean: '6.746e-06' - min: '-4.411e-01' + device: cuda:0 + max: '3.85e-01' + mean: '-1.446e-04' + min: '-4.104e-01' shape: - 256 - 1024 - 1 - 1 - sum: '1.768e+00' + sum: '-3.789e+01' network.layer3.3.conv2.weight: - device: cpu - max: '1.377e-01' - mean: '4.517e-05' - min: '-1.378e-01' + device: cuda:0 + max: '1.48e-01' + mean: '-4.522e-05' + min: '-1.423e-01' shape: - 256 - 256 - 3 - 3 - sum: '2.664e+01' + sum: '-2.667e+01' network.layer3.3.conv3.weight: - device: cpu - max: '2.2e-01' - mean: '8.760e-05' - min: '-1.877e-01' + device: cuda:0 + max: '1.972e-01' + mean: '-4.765e-05' + min: '-2.067e-01' shape: - 1024 - 256 - 1 - 1 - sum: '2.296e+01' + sum: '-1.249e+01' network.layer3.4.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1874,14 +1874,14 @@ network.layer3.4.bn1.bias: - 256 sum: '0.e+00' network.layer3.4.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.4.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1889,7 +1889,7 @@ network.layer3.4.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.4.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1897,7 +1897,7 @@ network.layer3.4.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.4.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1905,7 +1905,7 @@ network.layer3.4.bn1.weight: - 256 sum: '2.56e+02' network.layer3.4.bn2.bias: - device: cpu + device: cuda:0 
max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1913,14 +1913,14 @@ network.layer3.4.bn2.bias: - 256 sum: '0.e+00' network.layer3.4.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.4.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1928,7 +1928,7 @@ network.layer3.4.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.4.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1936,7 +1936,7 @@ network.layer3.4.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.4.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1944,7 +1944,7 @@ network.layer3.4.bn2.weight: - 256 sum: '2.56e+02' network.layer3.4.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1952,14 +1952,14 @@ network.layer3.4.bn3.bias: - 1024 sum: '0.e+00' network.layer3.4.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.4.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -1967,7 +1967,7 @@ network.layer3.4.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.4.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1975,7 +1975,7 @@ network.layer3.4.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.4.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -1983,40 +1983,40 @@ network.layer3.4.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.4.conv1.weight: - device: cpu - max: '4.246e-01' - mean: '5.362e-06' - min: '-4.278e-01' + device: cuda:0 + max: '4.356e-01' + mean: '9.811e-05' + min: '-3.892e-01' shape: - 256 - 1024 - 1 - 1 - sum: '1.406e+00' + sum: '2.572e+01' network.layer3.4.conv2.weight: - device: cpu - max: '1.393e-01' - mean: '2.222e-06' - min: '-1.434e-01' + device: cuda:0 + max: '1.430e-01' + mean: '-3.322e-05' + min: '-1.325e-01' shape: - 256 - 256 - 3 - 3 - sum: '1.311e+00' + sum: '-1.959e+01' network.layer3.4.conv3.weight: - device: cpu - max: '2.e-01' - mean: '9.206e-05' - min: '-2.008e-01' + device: cuda:0 + max: '1.993e-01' + mean: '3.794e-05' + min: '-2.046e-01' shape: - 1024 - 256 - 1 - 1 - sum: '2.413e+01' + sum: '9.945e+00' network.layer3.5.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2024,14 +2024,14 @@ network.layer3.5.bn1.bias: - 256 sum: '0.e+00' network.layer3.5.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.5.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2039,7 +2039,7 @@ network.layer3.5.bn1.running_mean: - 256 sum: '0.e+00' network.layer3.5.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2047,7 +2047,7 @@ network.layer3.5.bn1.running_var: - 256 sum: '2.56e+02' network.layer3.5.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2055,7 +2055,7 @@ network.layer3.5.bn1.weight: - 256 sum: '2.56e+02' network.layer3.5.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2063,14 +2063,14 @@ network.layer3.5.bn2.bias: - 256 sum: '0.e+00' network.layer3.5.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.5.bn2.running_mean: - device: cpu + device: 
cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2078,7 +2078,7 @@ network.layer3.5.bn2.running_mean: - 256 sum: '0.e+00' network.layer3.5.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2086,7 +2086,7 @@ network.layer3.5.bn2.running_var: - 256 sum: '2.56e+02' network.layer3.5.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2094,7 +2094,7 @@ network.layer3.5.bn2.weight: - 256 sum: '2.56e+02' network.layer3.5.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2102,14 +2102,14 @@ network.layer3.5.bn3.bias: - 1024 sum: '0.e+00' network.layer3.5.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer3.5.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2117,7 +2117,7 @@ network.layer3.5.bn3.running_mean: - 1024 sum: '0.e+00' network.layer3.5.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2125,7 +2125,7 @@ network.layer3.5.bn3.running_var: - 1024 sum: '1.024e+03' network.layer3.5.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2133,40 +2133,40 @@ network.layer3.5.bn3.weight: - 1024 sum: '1.024e+03' network.layer3.5.conv1.weight: - device: cpu - max: '4.474e-01' - mean: '-1.600e-05' - min: '-4.060e-01' + device: cuda:0 + max: '4.095e-01' + mean: '4.100e-05' + min: '-3.786e-01' shape: - 256 - 1024 - 1 - 1 - sum: '-4.194e+00' + sum: '1.075e+01' network.layer3.5.conv2.weight: - device: cpu - max: '1.359e-01' - mean: '3.909e-05' - min: '-1.454e-01' + device: cuda:0 + max: '1.341e-01' + mean: '-1.609e-05' + min: '-1.361e-01' shape: - 256 - 256 - 3 - 3 - sum: '2.306e+01' + sum: '-9.492e+00' network.layer3.5.conv3.weight: - device: cpu - max: '2.021e-01' - mean: '8.33e-05' - min: '-1.915e-01' + device: cuda:0 + max: '1.988e-01' + mean: '-1.139e-04' + min: '-2.040e-01' shape: - 1024 - 256 - 1 - 1 - sum: '2.184e+01' + sum: '-2.986e+01' network.layer4.0.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2174,14 +2174,14 @@ network.layer4.0.bn1.bias: - 512 sum: '0.e+00' network.layer4.0.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2189,7 +2189,7 @@ network.layer4.0.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.0.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2197,7 +2197,7 @@ network.layer4.0.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.0.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2205,7 +2205,7 @@ network.layer4.0.bn1.weight: - 512 sum: '5.12e+02' network.layer4.0.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2213,14 +2213,14 @@ network.layer4.0.bn2.bias: - 512 sum: '0.e+00' network.layer4.0.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2228,7 +2228,7 @@ network.layer4.0.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.0.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2236,7 +2236,7 @@ 
network.layer4.0.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.0.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2244,7 +2244,7 @@ network.layer4.0.bn2.weight: - 512 sum: '5.12e+02' network.layer4.0.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2252,14 +2252,14 @@ network.layer4.0.bn3.bias: - 2048 sum: '0.e+00' network.layer4.0.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2267,7 +2267,7 @@ network.layer4.0.bn3.running_mean: - 2048 sum: '0.e+00' network.layer4.0.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2275,7 +2275,7 @@ network.layer4.0.bn3.running_var: - 2048 sum: '2.048e+03' network.layer4.0.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2283,51 +2283,51 @@ network.layer4.0.bn3.weight: - 2048 sum: '2.048e+03' network.layer4.0.conv1.weight: - device: cpu - max: '3.176e-01' - mean: '-1.807e-05' - min: '-3.028e-01' + device: cuda:0 + max: '2.970e-01' + mean: '5.637e-05' + min: '-2.903e-01' shape: - 512 - 1024 - 1 - 1 - sum: '-9.476e+00' + sum: '2.955e+01' network.layer4.0.conv2.weight: - device: cpu - max: '9.886e-02' - mean: '1.319e-05' - min: '-1.076e-01' + device: cuda:0 + max: '9.993e-02' + mean: '1.64e-05' + min: '-1.102e-01' shape: - 512 - 512 - 3 - 3 - sum: '3.112e+01' + sum: '3.869e+01' network.layer4.0.conv3.weight: - device: cpu - max: '1.626e-01' - mean: '-1.957e-05' - min: '-1.542e-01' + device: cuda:0 + max: '1.534e-01' + mean: '-2.382e-06' + min: '-1.673e-01' shape: - 2048 - 512 - 1 - 1 - sum: '-2.052e+01' + sum: '-2.498e+00' network.layer4.0.downsample.0.weight: - device: cpu - max: '1.639e-01' - mean: '4.621e-05' - min: '-1.535e-01' + device: cuda:0 + max: '1.475e-01' + mean: '-6.343e-06' + min: '-1.472e-01' shape: - 2048 - 1024 - 1 - 1 - sum: '9.69e+01' + sum: '-1.330e+01' network.layer4.0.downsample.1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2335,14 +2335,14 @@ network.layer4.0.downsample.1.bias: - 2048 sum: '0.e+00' network.layer4.0.downsample.1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.0.downsample.1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2350,7 +2350,7 @@ network.layer4.0.downsample.1.running_mean: - 2048 sum: '0.e+00' network.layer4.0.downsample.1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2358,7 +2358,7 @@ network.layer4.0.downsample.1.running_var: - 2048 sum: '2.048e+03' network.layer4.0.downsample.1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2366,7 +2366,7 @@ network.layer4.0.downsample.1.weight: - 2048 sum: '2.048e+03' network.layer4.1.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2374,14 +2374,14 @@ network.layer4.1.bn1.bias: - 512 sum: '0.e+00' network.layer4.1.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2389,7 +2389,7 @@ network.layer4.1.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.1.bn1.running_var: - device: cpu + device: cuda:0 max: 
'1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2397,7 +2397,7 @@ network.layer4.1.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.1.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2405,7 +2405,7 @@ network.layer4.1.bn1.weight: - 512 sum: '5.12e+02' network.layer4.1.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2413,14 +2413,14 @@ network.layer4.1.bn2.bias: - 512 sum: '0.e+00' network.layer4.1.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2428,7 +2428,7 @@ network.layer4.1.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.1.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2436,7 +2436,7 @@ network.layer4.1.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.1.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2444,7 +2444,7 @@ network.layer4.1.bn2.weight: - 512 sum: '5.12e+02' network.layer4.1.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2452,14 +2452,14 @@ network.layer4.1.bn3.bias: - 2048 sum: '0.e+00' network.layer4.1.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.1.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2467,7 +2467,7 @@ network.layer4.1.bn3.running_mean: - 2048 sum: '0.e+00' network.layer4.1.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2475,7 +2475,7 @@ network.layer4.1.bn3.running_var: - 2048 sum: '2.048e+03' network.layer4.1.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2483,40 +2483,40 @@ network.layer4.1.bn3.weight: - 2048 sum: '2.048e+03' network.layer4.1.conv1.weight: - device: cpu - max: '3.065e-01' - mean: '-6.068e-05' - min: '-2.977e-01' + device: cuda:0 + max: '3.285e-01' + mean: '5.911e-05' + min: '-3.033e-01' shape: - 512 - 2048 - 1 - 1 - sum: '-6.363e+01' + sum: '6.198e+01' network.layer4.1.conv2.weight: - device: cpu - max: '9.902e-02' - mean: '1.140e-06' - min: '-1.08e-01' + device: cuda:0 + max: '1.104e-01' + mean: '2.457e-05' + min: '-1.031e-01' shape: - 512 - 512 - 3 - 3 - sum: '2.690e+00' + sum: '5.797e+01' network.layer4.1.conv3.weight: - device: cpu - max: '1.517e-01' - mean: '-3.666e-05' - min: '-1.526e-01' + device: cuda:0 + max: '1.483e-01' + mean: '-6.445e-06' + min: '-1.555e-01' shape: - 2048 - 512 - 1 - 1 - sum: '-3.844e+01' + sum: '-6.758e+00' network.layer4.2.bn1.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2524,14 +2524,14 @@ network.layer4.2.bn1.bias: - 512 sum: '0.e+00' network.layer4.2.bn1.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.2.bn1.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2539,7 +2539,7 @@ network.layer4.2.bn1.running_mean: - 512 sum: '0.e+00' network.layer4.2.bn1.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2547,7 +2547,7 @@ network.layer4.2.bn1.running_var: - 512 sum: '5.12e+02' network.layer4.2.bn1.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2555,7 +2555,7 @@ network.layer4.2.bn1.weight: - 512 sum: 
'5.12e+02' network.layer4.2.bn2.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2563,14 +2563,14 @@ network.layer4.2.bn2.bias: - 512 sum: '0.e+00' network.layer4.2.bn2.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.2.bn2.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2578,7 +2578,7 @@ network.layer4.2.bn2.running_mean: - 512 sum: '0.e+00' network.layer4.2.bn2.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2586,7 +2586,7 @@ network.layer4.2.bn2.running_var: - 512 sum: '5.12e+02' network.layer4.2.bn2.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2594,7 +2594,7 @@ network.layer4.2.bn2.weight: - 512 sum: '5.12e+02' network.layer4.2.bn3.bias: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2602,14 +2602,14 @@ network.layer4.2.bn3.bias: - 2048 sum: '0.e+00' network.layer4.2.bn3.num_batches_tracked: - device: cpu + device: cuda:0 max: 0 mean: '0.e+00' min: 0 shape: [] sum: 0 network.layer4.2.bn3.running_mean: - device: cpu + device: cuda:0 max: '0.e+00' mean: '0.e+00' min: '0.e+00' @@ -2617,7 +2617,7 @@ network.layer4.2.bn3.running_mean: - 2048 sum: '0.e+00' network.layer4.2.bn3.running_var: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2625,7 +2625,7 @@ network.layer4.2.bn3.running_var: - 2048 sum: '2.048e+03' network.layer4.2.bn3.weight: - device: cpu + device: cuda:0 max: '1.e+00' mean: '1.e+00' min: '1.e+00' @@ -2633,35 +2633,35 @@ network.layer4.2.bn3.weight: - 2048 sum: '2.048e+03' network.layer4.2.conv1.weight: - device: cpu - max: '2.82e-01' - mean: '-9.716e-05' - min: '-2.873e-01' + device: cuda:0 + max: '2.960e-01' + mean: '-1.275e-04' + min: '-3.368e-01' shape: - 512 - 2048 - 1 - 1 - sum: '-1.019e+02' + sum: '-1.337e+02' network.layer4.2.conv2.weight: - device: cpu - max: '1.111e-01' - mean: '-2.905e-06' - min: '-1.051e-01' + device: cuda:0 + max: '9.885e-02' + mean: '-6.874e-06' + min: '-9.988e-02' shape: - 512 - 512 - 3 - 3 - sum: '-6.853e+00' + sum: '-1.622e+01' network.layer4.2.conv3.weight: - device: cpu - max: '1.576e-01' - mean: '5.136e-06' - min: '-1.479e-01' + device: cuda:0 + max: '1.45e-01' + mean: '1.976e-05' + min: '-1.578e-01' shape: - 2048 - 512 - 1 - 1 - sum: '5.386e+00' + sum: '2.073e+01' diff --git a/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml new file mode 100644 index 00000000..929934db --- /dev/null +++ b/.regression_files/project/algorithms/image_classifier_test/test_initialization_is_reproducible/cuda/resnet50_imagenet_image_classifier.yaml @@ -0,0 +1,2667 @@ +network.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' 
+network.conv1.weight: + device: cuda:0 + max: '1.019e-01' + mean: '2.309e-04' + min: '-8.332e-02' + shape: + - 64 + - 3 + - 7 + - 7 + sum: '2.172e+00' +network.fc.bias: + device: cuda:0 + max: '2.203e-02' + mean: '4.486e-04' + min: '-2.206e-02' + shape: + - 1000 + sum: '4.486e-01' +network.fc.weight: + device: cuda:0 + max: '2.21e-02' + mean: '6.154e-06' + min: '-2.21e-02' + shape: + - 1000 + - 2048 + sum: '1.260e+01' +network.layer1.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.0.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.0.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.0.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.0.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.0.conv1.weight: + device: cuda:0 + max: '6.509e-01' + mean: '1.445e-03' + min: '-6.027e-01' + shape: + - 64 + - 64 + - 1 + - 1 + sum: '5.919e+00' +network.layer1.0.conv2.weight: + device: cuda:0 + max: '2.359e-01' + mean: '1.355e-04' + min: '-2.49e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '4.995e+00' +network.layer1.0.conv3.weight: + device: cuda:0 + max: '3.852e-01' + mean: '3.642e-04' + min: '-3.478e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '5.966e+00' +network.layer1.0.downsample.0.weight: + device: cuda:0 + max: '3.423e-01' + mean: '-6.033e-04' + min: '-3.476e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '-9.884e+00' +network.layer1.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' 
+network.layer1.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.1.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.1.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.1.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.1.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.1.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.1.conv1.weight: + device: cuda:0 + max: '7.347e-01' + mean: '1.03e-03' + min: '-6.643e-01' + shape: + - 64 + - 256 + - 1 + - 1 + sum: '1.687e+01' +network.layer1.1.conv2.weight: + device: cuda:0 + max: '2.614e-01' + mean: '3.465e-04' + min: '-2.217e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '1.277e+01' +network.layer1.1.conv3.weight: + device: cuda:0 + max: '3.091e-01' + mean: '4.206e-05' + min: '-3.557e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '6.892e-01' +network.layer1.2.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.2.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.2.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.2.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.2.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.2.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + sum: '0.e+00' +network.layer1.2.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.2.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 64 + 
sum: '0.e+00' +network.layer1.2.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.2.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 64 + sum: '6.4e+01' +network.layer1.2.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.2.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer1.2.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer1.2.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.2.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer1.2.conv1.weight: + device: cuda:0 + max: '6.524e-01' + mean: '-1.441e-03' + min: '-6.990e-01' + shape: + - 64 + - 256 + - 1 + - 1 + sum: '-2.362e+01' +network.layer1.2.conv2.weight: + device: cuda:0 + max: '2.666e-01' + mean: '-3.895e-05' + min: '-2.347e-01' + shape: + - 64 + - 64 + - 3 + - 3 + sum: '-1.436e+00' +network.layer1.2.conv3.weight: + device: cuda:0 + max: '3.408e-01' + mean: '5.479e-04' + min: '-3.091e-01' + shape: + - 256 + - 64 + - 1 + - 1 + sum: '8.977e+00' +network.layer2.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.0.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.0.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.0.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.0.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.0.conv1.weight: + device: cuda:0 + max: '5.176e-01' + mean: '-5.491e-04' + min: '-4.999e-01' + shape: + - 128 + - 256 + - 1 + - 1 + sum: '-1.799e+01' +network.layer2.0.conv2.weight: + 
device: cuda:0 + max: '1.808e-01' + mean: '-1.218e-04' + min: '-1.887e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-1.796e+01' +network.layer2.0.conv3.weight: + device: cuda:0 + max: '2.875e-01' + mean: '-1.799e-04' + min: '-2.593e-01' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '-1.179e+01' +network.layer2.0.downsample.0.weight: + device: cuda:0 + max: '3.018e-01' + mean: '-5.660e-05' + min: '-2.697e-01' + shape: + - 512 + - 256 + - 1 + - 1 + sum: '-7.419e+00' +network.layer2.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.1.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.1.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.1.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.1.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.1.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.1.conv1.weight: + device: cuda:0 + max: '5.314e-01' + mean: '-3.536e-04' + min: '-5.475e-01' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '-2.318e+01' +network.layer2.1.conv2.weight: + device: cuda:0 + max: '1.754e-01' + mean: '7.783e-05' + min: '-1.808e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '1.148e+01' +network.layer2.1.conv3.weight: + device: cuda:0 + max: '2.382e-01' + mean: '-1.054e-05' + min: '-2.517e-01' + shape: + - 512 + 
- 128 + - 1 + - 1 + sum: '-6.906e-01' +network.layer2.2.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.2.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.2.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.2.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.2.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.2.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.2.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.2.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.2.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.2.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.2.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.2.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.2.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.2.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.2.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.2.conv1.weight: + device: cuda:0 + max: '4.971e-01' + mean: '-3.09e-04' + min: '-5.291e-01' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '-2.025e+01' +network.layer2.2.conv2.weight: + device: cuda:0 + max: '2.107e-01' + mean: '-7.661e-06' + min: '-1.779e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '-1.13e+00' +network.layer2.2.conv3.weight: + device: cuda:0 + max: '3.236e-01' + mean: '2.725e-05' + min: '-3.006e-01' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '1.786e+00' +network.layer2.3.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.3.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.3.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.3.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.3.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.3.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.3.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.3.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 128 + sum: '0.e+00' +network.layer2.3.bn2.running_var: + device: cuda:0 + max: 
'1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.3.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 128 + sum: '1.28e+02' +network.layer2.3.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.3.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer2.3.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer2.3.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.3.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer2.3.conv1.weight: + device: cuda:0 + max: '5.317e-01' + mean: '9.857e-05' + min: '-5.177e-01' + shape: + - 128 + - 512 + - 1 + - 1 + sum: '6.460e+00' +network.layer2.3.conv2.weight: + device: cuda:0 + max: '1.874e-01' + mean: '6.223e-05' + min: '-1.855e-01' + shape: + - 128 + - 128 + - 3 + - 3 + sum: '9.176e+00' +network.layer2.3.conv3.weight: + device: cuda:0 + max: '2.559e-01' + mean: '-2.673e-04' + min: '-2.529e-01' + shape: + - 512 + - 128 + - 1 + - 1 + sum: '-1.752e+01' +network.layer3.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.0.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.0.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.0.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.0.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.0.conv1.weight: + device: cuda:0 + max: '3.843e-01' + mean: '3.586e-04' + min: '-3.99e-01' + shape: + - 256 + - 512 + - 1 + - 1 + sum: '4.701e+01' +network.layer3.0.conv2.weight: + device: cuda:0 + max: '1.38e-01' + mean: '-3.53e-06' + min: 
'-1.294e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '-2.082e+00' +network.layer3.0.conv3.weight: + device: cuda:0 + max: '2.052e-01' + mean: '-7.496e-06' + min: '-1.973e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-1.965e+00' +network.layer3.0.downsample.0.weight: + device: cuda:0 + max: '2.020e-01' + mean: '1.340e-05' + min: '-2.257e-01' + shape: + - 1024 + - 512 + - 1 + - 1 + sum: '7.027e+00' +network.layer3.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.1.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.1.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.1.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.1.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.1.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.1.conv1.weight: + device: cuda:0 + max: '4.143e-01' + mean: '1.499e-05' + min: '-3.709e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '3.93e+00' +network.layer3.1.conv2.weight: + device: cuda:0 + max: '1.309e-01' + mean: '1.100e-05' + min: '-1.368e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '6.490e+00' +network.layer3.1.conv3.weight: + device: cuda:0 + max: '2.051e-01' + mean: '-1.367e-04' + min: '-1.971e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-3.584e+01' 
+network.layer3.2.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.2.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.2.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.2.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.2.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.2.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.2.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.2.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.2.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.2.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.2.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.2.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.2.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.2.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.2.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.2.conv1.weight: + device: cuda:0 + max: '3.993e-01' + mean: '-1.212e-04' + min: '-4.269e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-3.178e+01' +network.layer3.2.conv2.weight: + device: cuda:0 + max: '1.517e-01' + mean: '1.648e-05' + min: '-1.378e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '9.721e+00' +network.layer3.2.conv3.weight: + device: cuda:0 + max: '1.958e-01' + mean: '-6.993e-06' + min: '-1.987e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-1.833e+00' +network.layer3.3.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.3.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.3.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.3.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.3.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.3.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.3.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.3.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.3.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: 
'1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.3.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.3.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.3.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.3.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.3.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.3.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.3.conv1.weight: + device: cuda:0 + max: '4.290e-01' + mean: '-2.493e-04' + min: '-3.916e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-6.535e+01' +network.layer3.3.conv2.weight: + device: cuda:0 + max: '1.365e-01' + mean: '1.203e-05' + min: '-1.364e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '7.097e+00' +network.layer3.3.conv3.weight: + device: cuda:0 + max: '2.011e-01' + mean: '9.821e-05' + min: '-2.042e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '2.575e+01' +network.layer3.4.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.4.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.4.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.4.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.4.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.4.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.4.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.4.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.4.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.4.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.4.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.4.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.4.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.4.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.4.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.4.conv1.weight: + device: cuda:0 + max: '3.968e-01' + mean: '-2.179e-04' + min: '-3.871e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '-5.712e+01' +network.layer3.4.conv2.weight: + device: cuda:0 + max: '1.392e-01' + mean: '-2.276e-05' + min: '-1.360e-01' + shape: + - 
256 + - 256 + - 3 + - 3 + sum: '-1.342e+01' +network.layer3.4.conv3.weight: + device: cuda:0 + max: '2.100e-01' + mean: '9.087e-05' + min: '-2.052e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '2.382e+01' +network.layer3.5.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.5.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.5.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.5.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.5.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.5.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.5.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.5.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.layer3.5.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.5.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 256 + sum: '2.56e+02' +network.layer3.5.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.5.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer3.5.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 1024 + sum: '0.e+00' +network.layer3.5.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.5.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.layer3.5.conv1.weight: + device: cuda:0 + max: '3.732e-01' + mean: '4.573e-05' + min: '-4.036e-01' + shape: + - 256 + - 1024 + - 1 + - 1 + sum: '1.199e+01' +network.layer3.5.conv2.weight: + device: cuda:0 + max: '1.382e-01' + mean: '3.509e-05' + min: '-1.344e-01' + shape: + - 256 + - 256 + - 3 + - 3 + sum: '2.07e+01' +network.layer3.5.conv3.weight: + device: cuda:0 + max: '2.12e-01' + mean: '-2.857e-05' + min: '-2.015e-01' + shape: + - 1024 + - 256 + - 1 + - 1 + sum: '-7.489e+00' +network.layer4.0.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 
+network.layer4.0.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.0.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.0.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.0.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.0.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.0.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.0.conv1.weight: + device: cuda:0 + max: '2.853e-01' + mean: '2.027e-04' + min: '-2.964e-01' + shape: + - 512 + - 1024 + - 1 + - 1 + sum: '1.063e+02' +network.layer4.0.conv2.weight: + device: cuda:0 + max: '1.022e-01' + mean: '-7.219e-06' + min: '-1.115e-01' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-1.703e+01' +network.layer4.0.conv3.weight: + device: cuda:0 + max: '1.469e-01' + mean: '1.062e-05' + min: '-1.472e-01' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '1.113e+01' +network.layer4.0.downsample.0.weight: + device: cuda:0 + max: '1.643e-01' + mean: '1.053e-05' + min: '-1.525e-01' + shape: + - 2048 + - 1024 + - 1 + - 1 + sum: '2.209e+01' +network.layer4.0.downsample.1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.0.downsample.1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.0.downsample.1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.0.downsample.1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.0.downsample.1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.1.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.1.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 
512 + sum: '5.12e+02' +network.layer4.1.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.1.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.1.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.1.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.1.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.1.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.1.conv1.weight: + device: cuda:0 + max: '3.313e-01' + mean: '1.118e-04' + min: '-3.093e-01' + shape: + - 512 + - 2048 + - 1 + - 1 + sum: '1.172e+02' +network.layer4.1.conv2.weight: + device: cuda:0 + max: '1.056e-01' + mean: '-1.704e-05' + min: '-1.123e-01' + shape: + - 512 + - 512 + - 3 + - 3 + sum: '-4.019e+01' +network.layer4.1.conv3.weight: + device: cuda:0 + max: '1.447e-01' + mean: '3.966e-06' + min: '-1.413e-01' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '4.158e+00' +network.layer4.2.bn1.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.2.bn1.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.2.bn1.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.2.bn1.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.2.bn1.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.2.bn2.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.2.bn2.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.2.bn2.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 512 + sum: '0.e+00' +network.layer4.2.bn2.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.2.bn2.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 512 + sum: '5.12e+02' +network.layer4.2.bn3.bias: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.2.bn3.num_batches_tracked: + device: cuda:0 + max: 0 + mean: '0.e+00' + min: 0 + shape: [] + sum: 0 +network.layer4.2.bn3.running_mean: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 2048 + sum: '0.e+00' +network.layer4.2.bn3.running_var: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.2.bn3.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 2048 + sum: '2.048e+03' +network.layer4.2.conv1.weight: + device: cuda:0 + max: '2.966e-01' + mean: '-2.162e-05' + min: '-2.997e-01' + shape: + - 512 + - 2048 + - 1 + - 1 + sum: '-2.267e+01' +network.layer4.2.conv2.weight: + device: cuda:0 + max: '9.663e-02' + mean: '-1.553e-06' + min: '-1.052e-01' + shape: + - 512 + - 512 + - 3 + - 3 
+ sum: '-3.664e+00' +network.layer4.2.conv3.weight: + device: cuda:0 + max: '1.522e-01' + mean: '-1.257e-05' + min: '-1.512e-01' + shape: + - 2048 + - 512 + - 1 + - 1 + sum: '-1.318e+01' diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/cifar10_jax_example.yaml deleted file mode 100644 index 7b9e8b58..00000000 --- a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/cifar10_jax_example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -input: - device: cpu - hash: -1373365636602041987 - max: 2.1 - mean: -0.0 - min: -2.0 - shape: - - 128 - - 3 - - 32 - - 32 - sum: -2429.8 -out: - device: cpu - hash: 7290015411165007734 - max: 1.0 - mean: 0.1 - min: -0.8 - shape: - - 128 - - 10 - sum: 151.9 diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml deleted file mode 100644 index 913c73b8..00000000 --- a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -input: - device: cpu - hash: 9223185275738543696 - max: 2.8 - mean: 0.5 - min: -0.4 - shape: - - 128 - - 1 - - 28 - - 28 - sum: 48391.2 -out: - device: cpu - hash: 8278441553463422914 - max: 1.0 - mean: -0.0 - min: -1.0 - shape: - - 128 - - 10 - sum: -14.1 diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/cifar10_jax_example.yaml deleted file mode 100644 index 7e5c8245..00000000 --- a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/cifar10_jax_example.yaml +++ /dev/null @@ -1,80 +0,0 @@ -network.params.0: - device: cpu - hash: -4218701300434786233 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 32 - sum: 0.0 -network.params.1: - device: cpu - hash: 6448973716641827056 - max: 0.4 - mean: -0.0 - min: -0.4 - shape: - - 3 - - 3 - - 3 - - 32 - sum: -7.1 -network.params.2: - device: cpu - hash: -5258163774450544391 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 64 - sum: 0.0 -network.params.3: - device: cpu - hash: -195626296360386472 - max: 0.1 - mean: 0.0 - min: -0.1 - shape: - - 3 - - 3 - - 32 - - 64 - sum: 8.3 -network.params.4: - device: cpu - hash: 3505480816438514598 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 256 - sum: 0.0 -network.params.5: - device: cpu - hash: 7328344990793555668 - max: 0.0 - mean: 0.0 - min: -0.0 - shape: - - 4096 - - 256 - sum: 17.4 -network.params.6: - device: cpu - hash: -7222447081605638768 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 10 - sum: 0.0 -network.params.7: - device: cpu - hash: -2983191316776450796 - max: 0.1 - mean: 0.0 - min: -0.1 - shape: - - 256 - - 10 - sum: 1.8 diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/fashion_mnist_jax_example.yaml deleted file mode 100644 index deba293a..00000000 --- 
a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cpu/fashion_mnist_jax_example.yaml +++ /dev/null @@ -1,80 +0,0 @@ -network.params.0: - device: cpu - hash: -4218701300434786233 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 32 - sum: 0.0 -network.params.1: - device: cpu - hash: -2168085942084572394 - max: 0.7 - mean: -0.0 - min: -0.7 - shape: - - 3 - - 3 - - 1 - - 32 - sum: -0.3 -network.params.2: - device: cpu - hash: -5258163774450544391 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 64 - sum: 0.0 -network.params.3: - device: cpu - hash: -195626296360386472 - max: 0.1 - mean: 0.0 - min: -0.1 - shape: - - 3 - - 3 - - 32 - - 64 - sum: 8.3 -network.params.4: - device: cpu - hash: 3505480816438514598 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 256 - sum: 0.0 -network.params.5: - device: cpu - hash: 8975080659470718874 - max: 0.0 - mean: 0.0 - min: -0.0 - shape: - - 3136 - - 256 - sum: 15.7 -network.params.6: - device: cpu - hash: -7222447081605638768 - max: 0.0 - mean: 0.0 - min: 0.0 - shape: - - 10 - sum: 0.0 -network.params.7: - device: cpu - hash: -2983191316776450796 - max: 0.1 - mean: 0.0 - min: -0.1 - shape: - - 256 - - 10 - sum: 1.8 diff --git a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_cnn_jax_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/cifar10_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_cnn_jax_image_classifier.yaml index abb5c072..ff422c2a 100644 --- a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/cifar10_jax_example.yaml +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_cnn_jax_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.126e+00' mean: '-6.179e-03' min: '-1.989e+00' @@ -10,7 +10,7 @@ batch.0: - 32 sum: '-2.43e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.params.0: - device: cpu + device: cuda:0 max: '9.654e-03' mean: '1.276e-03' min: '-1.148e-02' @@ -26,7 +26,7 @@ grads.network.params.0: - 32 sum: '4.083e-02' grads.network.params.1: - device: cpu + device: cuda:0 max: '1.149e-02' mean: '5.030e-04' min: '-1.473e-02' @@ -37,7 +37,7 @@ grads.network.params.1: - 32 sum: '4.346e-01' grads.network.params.2: - device: cpu + device: cuda:0 max: '1.680e-02' mean: '1.566e-03' min: '-7.296e-03' @@ -45,7 +45,7 @@ grads.network.params.2: - 64 sum: '1.002e-01' grads.network.params.3: - device: cpu + device: cuda:0 max: '2.507e-02' mean: '4.631e-04' min: '-2.280e-02' @@ -56,7 +56,7 @@ grads.network.params.3: - 64 sum: '8.536e+00' grads.network.params.4: - device: cpu + device: cuda:0 max: '1.025e-02' mean: '1.384e-04' min: '-1.082e-02' @@ -64,7 +64,7 @@ grads.network.params.4: - 256 sum: '3.542e-02' grads.network.params.5: - device: cpu + device: cuda:0 max: '3.064e-02' mean: '3.315e-05' min: '-2.379e-02' @@ -73,7 +73,7 @@ grads.network.params.5: - 256 sum: '3.476e+01' grads.network.params.6: - device: cpu + device: cuda:0 max: '2.984e-02' mean: '-5.588e-10' min: '-2.597e-02' @@ -81,16 +81,16 @@ grads.network.params.6: - 10 sum: 
'-5.588e-09' grads.network.params.7: - device: cpu + device: cuda:0 max: '4.361e-02' - mean: '-1.63e-10' + mean: '-2.154e-10' min: '-4.662e-02' shape: - 256 - 10 - sum: '-4.172e-07' + sum: '-5.513e-07' outputs.logits: - device: cpu + device: cuda:0 max: '9.608e-01' mean: '1.186e-01' min: '-7.613e-01' @@ -99,14 +99,14 @@ outputs.logits: - 10 sum: '1.519e+02' outputs.loss: - device: cpu + device: cuda:0 max: '2.341e+00' mean: '2.341e+00' min: '2.341e+00' shape: [] sum: '2.341e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..2fe6e1fa --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/cifar10_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,77 @@ +batch.0: + device: cuda:0 + max: '2.126e+00' + mean: '-6.179e-03' + min: '-1.989e+00' + shape: + - 128 + - 3 + - 32 + - 32 + sum: '-2.43e+03' +batch.1: + device: cuda:0 + max: 9 + mean: '4.555e+00' + min: 0 + shape: + - 128 + sum: 583 +grads.network.params.0: + device: cuda:0 + max: '1.552e-02' + mean: '8.602e-04' + min: '-9.862e-03' + shape: + - 256 + sum: '2.202e-01' +grads.network.params.1: + device: cuda:0 + max: '2.677e-02' + mean: '1.968e-05' + min: '-2.576e-02' + shape: + - 3072 + - 256 + sum: '1.548e+01' +grads.network.params.2: + device: cuda:0 + max: '6.868e-02' + mean: '0.e+00' + min: '-3.458e-02' + shape: + - 10 + sum: '0.e+00' +grads.network.params.3: + device: cuda:0 + max: '1.497e-01' + mean: '-2.445e-10' + min: '-1.415e-01' + shape: + - 256 + - 10 + sum: '-6.258e-07' +outputs.logits: + device: cuda:0 + max: '2.380e+00' + mean: '5.809e-02' + min: '-3.135e+00' + shape: + - 128 + - 10 + sum: '7.436e+01' +outputs.loss: + device: cuda:0 + max: '2.466e+00' + mean: '2.466e+00' + min: '2.466e+00' + shape: [] + sum: '2.466e+00' +outputs.y: + device: cuda:0 + max: 9 + mean: '4.555e+00' + min: 0 + shape: + - 128 + sum: 583 diff --git a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_cnn_jax_image_classifier.yaml similarity index 84% rename from .regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_cnn_jax_image_classifier.yaml index bdc2a02f..7b7a7623 100644 --- a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/fashion_mnist_jax_example.yaml +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_cnn_jax_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.821e+00' mean: '4.822e-01' min: '-4.242e-01' @@ -10,7 +10,7 @@ batch.0: - 28 sum: '4.839e+04' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 583 grads.network.params.0: - device: cpu + device: cuda:0 max: '1.949e-02' mean: '4.526e-03' min: '-1.615e-02' @@ -26,7 +26,7 @@ 
grads.network.params.0: - 32 sum: '1.448e-01' grads.network.params.1: - device: cpu + device: cuda:0 max: '4.36e-02' mean: '5.924e-03' min: '-3.013e-02' @@ -37,7 +37,7 @@ grads.network.params.1: - 32 sum: '1.706e+00' grads.network.params.2: - device: cpu + device: cuda:0 max: '2.734e-02' mean: '1.847e-03' min: '-1.76e-02' @@ -45,7 +45,7 @@ grads.network.params.2: - 64 sum: '1.182e-01' grads.network.params.3: - device: cpu + device: cuda:0 max: '6.099e-02' mean: '1.127e-03' min: '-5.833e-02' @@ -56,7 +56,7 @@ grads.network.params.3: - 64 sum: '2.077e+01' grads.network.params.4: - device: cpu + device: cuda:0 max: '2.451e-02' mean: '1.065e-03' min: '-1.999e-02' @@ -64,7 +64,7 @@ grads.network.params.4: - 256 sum: '2.727e-01' grads.network.params.5: - device: cpu + device: cuda:0 max: '7.691e-02' mean: '3.075e-04' min: '-6.106e-02' @@ -73,7 +73,7 @@ grads.network.params.5: - 256 sum: '2.469e+02' grads.network.params.6: - device: cpu + device: cuda:0 max: '5.898e-02' mean: '-1.863e-09' min: '-7.022e-02' @@ -81,16 +81,16 @@ grads.network.params.6: - 10 sum: '-1.863e-08' grads.network.params.7: - device: cpu + device: cuda:0 max: '1.382e-01' - mean: '-5.821e-11' + mean: '-1.775e-10' min: '-1.376e-01' shape: - 256 - 10 - sum: '-1.490e-07' + sum: '-4.545e-07' outputs.logits: - device: cpu + device: cuda:0 max: '1.032e+00' mean: '-1.1e-02' min: '-9.602e-01' @@ -99,14 +99,14 @@ outputs.logits: - 10 sum: '-1.408e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.385e+00' mean: '2.385e+00' min: '2.385e+00' shape: [] sum: '2.385e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.555e+00' min: 0 diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..7a36defc --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/fashion_mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,77 @@ +batch.0: + device: cuda:0 + max: '2.821e+00' + mean: '4.822e-01' + min: '-4.242e-01' + shape: + - 128 + - 1 + - 28 + - 28 + sum: '4.839e+04' +batch.1: + device: cuda:0 + max: 9 + mean: '4.555e+00' + min: 0 + shape: + - 128 + sum: 583 +grads.network.params.0: + device: cuda:0 + max: '2.188e-02' + mean: '8.325e-04' + min: '-2.096e-02' + shape: + - 256 + sum: '2.131e-01' +grads.network.params.1: + device: cuda:0 + max: '5.304e-02' + mean: '4.879e-04' + min: '-4.886e-02' + shape: + - 784 + - 256 + sum: '9.792e+01' +grads.network.params.2: + device: cuda:0 + max: '1.375e-01' + mean: '0.e+00' + min: '-9.162e-02' + shape: + - 10 + sum: '0.e+00' +grads.network.params.3: + device: cuda:0 + max: '3.990e-01' + mean: '-1.106e-10' + min: '-2.054e-01' + shape: + - 256 + - 10 + sum: '-2.831e-07' +outputs.logits: + device: cuda:0 + max: '2.656e+00' + mean: '2.355e-02' + min: '-2.715e+00' + shape: + - 128 + - 10 + sum: '3.015e+01' +outputs.loss: + device: cuda:0 + max: '2.554e+00' + mean: '2.554e+00' + min: '2.554e+00' + shape: [] + sum: '2.554e+00' +outputs.y: + device: cuda:0 + max: 9 + mean: '4.555e+00' + min: 0 + shape: + - 128 + sum: 583 diff --git a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/mnist_jax_example.yaml 
b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_cnn_jax_image_classifier.yaml similarity index 82% rename from .regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_cnn_jax_image_classifier.yaml index f4c17e52..d41f869b 100644 --- a/.regression_files/project/algorithms/jax_example_test/test_backward_pass_is_reproducible/cpu/mnist_jax_example.yaml +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_cnn_jax_image_classifier.yaml @@ -1,5 +1,5 @@ batch.0: - device: cpu + device: cuda:0 max: '2.821e+00' mean: '1.432e-02' min: '-4.242e-01' @@ -10,7 +10,7 @@ batch.0: - 28 sum: '1.437e+03' batch.1: - device: cpu + device: cuda:0 max: 9 mean: '4.242e+00' min: 0 @@ -18,7 +18,7 @@ batch.1: - 128 sum: 543 grads.network.params.0: - device: cpu + device: cuda:0 max: '1.65e-02' mean: '2.109e-03' min: '-8.628e-03' @@ -26,7 +26,7 @@ grads.network.params.0: - 32 sum: '6.748e-02' grads.network.params.1: - device: cpu + device: cuda:0 max: '1.893e-02' mean: '-1.55e-05' min: '-1.627e-02' @@ -37,7 +37,7 @@ grads.network.params.1: - 32 sum: '-4.463e-03' grads.network.params.2: - device: cpu + device: cuda:0 max: '2.053e-02' mean: '1.196e-03' min: '-1.783e-02' @@ -45,7 +45,7 @@ grads.network.params.2: - 64 sum: '7.653e-02' grads.network.params.3: - device: cpu + device: cuda:0 max: '2.25e-02' mean: '3.613e-04' min: '-2.352e-02' @@ -56,7 +56,7 @@ grads.network.params.3: - 64 sum: '6.659e+00' grads.network.params.4: - device: cpu + device: cuda:0 max: '2.231e-02' mean: '2.332e-04' min: '-2.018e-02' @@ -64,7 +64,7 @@ grads.network.params.4: - 256 sum: '5.970e-02' grads.network.params.5: - device: cpu + device: cuda:0 max: '5.356e-02' mean: '3.131e-05' min: '-4.563e-02' @@ -73,24 +73,24 @@ grads.network.params.5: - 256 sum: '2.514e+01' grads.network.params.6: - device: cpu + device: cuda:0 max: '6.484e-02' - mean: '-1.397e-09' + mean: '-1.490e-09' min: '-8.046e-02' shape: - 10 - sum: '-1.397e-08' + sum: '-1.490e-08' grads.network.params.7: - device: cpu + device: cuda:0 max: '7.496e-02' - mean: '-3.376e-10' + mean: '-3.361e-10' min: '-8.565e-02' shape: - 256 - 10 - sum: '-8.643e-07' + sum: '-8.605e-07' outputs.logits: - device: cpu + device: cuda:0 max: '8.092e-01' mean: '-2.764e-02' min: '-1.135e+00' @@ -99,14 +99,14 @@ outputs.logits: - 10 sum: '-3.538e+01' outputs.loss: - device: cpu + device: cuda:0 max: '2.303e+00' mean: '2.303e+00' min: '2.303e+00' shape: [] sum: '2.303e+00' outputs.y: - device: cpu + device: cuda:0 max: 9 mean: '4.242e+00' min: 0 diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..b1219522 --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_backward_pass_is_reproducible/mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,77 @@ +batch.0: + device: cuda:0 + max: '2.821e+00' + mean: '1.432e-02' + min: '-4.242e-01' + shape: + - 128 + - 1 + - 28 + - 28 + sum: '1.437e+03' +batch.1: + device: cuda:0 + max: 9 + mean: '4.242e+00' + min: 0 + shape: + - 128 + sum: 543 +grads.network.params.0: + 
device: cuda:0 + max: '1.386e-02' + mean: '8.019e-04' + min: '-1.326e-02' + shape: + - 256 + sum: '2.053e-01' +grads.network.params.1: + device: cuda:0 + max: '3.122e-02' + mean: '-1.002e-04' + min: '-3.579e-02' + shape: + - 784 + - 256 + sum: '-2.012e+01' +grads.network.params.2: + device: cuda:0 + max: '4.549e-02' + mean: '0.e+00' + min: '-7.537e-02' + shape: + - 10 + sum: '0.e+00' +grads.network.params.3: + device: cuda:0 + max: '7.07e-02' + mean: '-5.821e-11' + min: '-1.064e-01' + shape: + - 256 + - 10 + sum: '-1.490e-07' +outputs.logits: + device: cuda:0 + max: '1.85e+00' + mean: '6.708e-02' + min: '-1.919e+00' + shape: + - 128 + - 10 + sum: '8.586e+01' +outputs.loss: + device: cuda:0 + max: '2.398e+00' + mean: '2.398e+00' + min: '2.398e+00' + shape: [] + sum: '2.398e+00' +outputs.y: + device: cuda:0 + max: 9 + mean: '4.242e+00' + min: 0 + shape: + - 128 + sum: 543 diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..c73fe9ab --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.126e+00' + mean: '-6.179e-03' + min: '-1.989e+00' + shape: + - 128 + - 3 + - 32 + - 32 + sum: '-2.43e+03' +out: + device: cuda:0 + max: '2.380e+00' + mean: '5.809e-02' + min: '-3.135e+00' + shape: + - 128 + - 10 + sum: '7.436e+01' diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..7e489df5 --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.821e+00' + mean: '4.822e-01' + min: '-4.242e-01' + shape: + - 128 + - 1 + - 28 + - 28 + 
sum: '4.839e+04' +out: + device: cuda:0 + max: '2.656e+00' + mean: '2.355e-02' + min: '-2.715e+00' + shape: + - 128 + - 10 + sum: '3.015e+01' diff --git a/.regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_forward_pass_is_reproducible/cuda/mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..5659f1e9 --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_forward_pass_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,20 @@ +input: + device: cuda:0 + max: '2.821e+00' + mean: '1.432e-02' + min: '-4.242e-01' + shape: + - 128 + - 1 + - 28 + - 28 + sum: '1.437e+03' +out: + device: cuda:0 + max: '1.85e+00' + mean: '6.708e-02' + min: '-1.919e+00' + shape: + - 128 + - 10 + sum: '8.586e+01' diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/cifar10_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/cifar10_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..178d3b7e --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/cifar10_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,34 @@ +network.params.0: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.params.1: + device: cuda:0 + max: '4.102e-02' + mean: '2.969e-05' + min: '-4.102e-02' + shape: + - 3072 + - 256 + sum: '2.335e+01' +network.params.2: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 10 + sum: '0.e+00' +network.params.3: + device: cuda:0 + max: '1.421e-01' + mean: '7.197e-04' + min: '-1.416e-01' + shape: + - 256 + - 10 + sum: '1.842e+00' diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from 
.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..b29367ad --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/fashion_mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,34 @@ +network.params.0: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.params.1: + device: cuda:0 + max: '8.120e-02' + mean: '-2.572e-05' + min: '-8.120e-02' + shape: + - 784 + - 256 + sum: '-5.162e+00' +network.params.2: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 10 + sum: '0.e+00' +network.params.3: + device: cuda:0 + max: '1.421e-01' + mean: '7.197e-04' + min: '-1.416e-01' + shape: + - 256 + - 10 + sum: '1.842e+00' diff --git a/.regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/mnist_jax_example.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_cnn_jax_image_classifier.yaml similarity index 100% rename from .regression_files/project/algorithms/jax_example_test/test_initialization_is_reproducible/cuda/mnist_jax_example.yaml rename to .regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_cnn_jax_image_classifier.yaml diff --git a/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml new file mode 100644 index 00000000..b29367ad --- /dev/null +++ b/.regression_files/project/algorithms/jax_image_classifier_test/test_initialization_is_reproducible/cuda/mnist_jax_fcnet_jax_image_classifier.yaml @@ -0,0 +1,34 @@ +network.params.0: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 256 + sum: '0.e+00' +network.params.1: + device: cuda:0 + max: '8.120e-02' + mean: '-2.572e-05' + min: '-8.120e-02' + shape: + - 784 + - 256 + sum: '-5.162e+00' +network.params.2: + device: cuda:0 + max: '0.e+00' + mean: '0.e+00' + min: '0.e+00' + shape: + - 10 + sum: '0.e+00' +network.params.3: + device: cuda:0 + max: '1.421e-01' + mean: '7.197e-04' + min: '-1.416e-01' + shape: + - 256 + - 10 + sum: '1.842e+00' diff --git a/.regression_files/project/algorithms/jax_rl_example_test/test_lightning/123_Pendulum_v1_15.yaml b/.regression_files/project/algorithms/jax_ppo_test/test_lightning/Pendulum_v1_42_15.yaml similarity index 60% rename from .regression_files/project/algorithms/jax_rl_example_test/test_lightning/123_Pendulum_v1_15.yaml rename to .regression_files/project/algorithms/jax_ppo_test/test_lightning/Pendulum_v1_42_15.yaml index e70ed343..a47898ea 100644 --- a/.regression_files/project/algorithms/jax_rl_example_test/test_lightning/123_Pendulum_v1_15.yaml 
+++ b/.regression_files/project/algorithms/jax_ppo_test/test_lightning/Pendulum_v1_42_15.yaml @@ -5,8 +5,8 @@ val/episode_lengths: shape: [] sum: '2.e+02' val/rewards: - max: '-1.222e+03' - mean: '-1.222e+03' - min: '-1.222e+03' + max: '-9.099e+02' + mean: '-9.099e+02' + min: '-9.099e+02' shape: [] - sum: '-1.222e+03' + sum: '-9.099e+02' diff --git a/.regression_files/project/algorithms/jax_rl_example_test/test_ours_with_trainer/123_Pendulum_v1.yaml b/.regression_files/project/algorithms/jax_ppo_test/test_ours/42__123__Pendulum_v1.yaml similarity index 61% rename from .regression_files/project/algorithms/jax_rl_example_test/test_ours_with_trainer/123_Pendulum_v1.yaml rename to .regression_files/project/algorithms/jax_ppo_test/test_ours/42__123__Pendulum_v1.yaml index d83973a5..113d223f 100644 --- a/.regression_files/project/algorithms/jax_rl_example_test/test_ours_with_trainer/123_Pendulum_v1.yaml +++ b/.regression_files/project/algorithms/jax_ppo_test/test_ours/42__123__Pendulum_v1.yaml @@ -1,16 +1,18 @@ cumulative_reward: - max: '-6.495e+02' - mean: '-1.229e+03' + max: '-7.835e-01' + mean: '-9.323e+02' min: '-1.878e+03' shape: + - 2 - 76 - 128 - sum: '-1.196e+07' + sum: '-1.814e+07' episode_length: max: 200 mean: '2.e+02' min: 200 shape: + - 2 - 76 - 128 - sum: 1945600 + sum: 3891200 diff --git a/.regression_files/project/algorithms/jax_rl_example_test/test_ours/123_Pendulum_v1.yaml b/.regression_files/project/algorithms/jax_ppo_test/test_ours_with_trainer/42__123__Pendulum_v1.yaml similarity index 61% rename from .regression_files/project/algorithms/jax_rl_example_test/test_ours/123_Pendulum_v1.yaml rename to .regression_files/project/algorithms/jax_ppo_test/test_ours_with_trainer/42__123__Pendulum_v1.yaml index d83973a5..113d223f 100644 --- a/.regression_files/project/algorithms/jax_rl_example_test/test_ours/123_Pendulum_v1.yaml +++ b/.regression_files/project/algorithms/jax_ppo_test/test_ours_with_trainer/42__123__Pendulum_v1.yaml @@ -1,16 +1,18 @@ cumulative_reward: - max: '-6.495e+02' - mean: '-1.229e+03' + max: '-7.835e-01' + mean: '-9.323e+02' min: '-1.878e+03' shape: + - 2 - 76 - 128 - sum: '-1.196e+07' + sum: '-1.814e+07' episode_length: max: 200 mean: '2.e+02' min: 200 shape: + - 2 - 76 - 128 - sum: 1945600 + sum: 3891200 diff --git a/.regression_files/project/algorithms/jax_rl_example_test/test_rejax/123_Pendulum_v1.yaml b/.regression_files/project/algorithms/jax_ppo_test/test_rejax/42__123__Pendulum_v1.yaml similarity index 61% rename from .regression_files/project/algorithms/jax_rl_example_test/test_rejax/123_Pendulum_v1.yaml rename to .regression_files/project/algorithms/jax_ppo_test/test_rejax/42__123__Pendulum_v1.yaml index 8b29ccb9..bf24f361 100644 --- a/.regression_files/project/algorithms/jax_rl_example_test/test_rejax/123_Pendulum_v1.yaml +++ b/.regression_files/project/algorithms/jax_ppo_test/test_rejax/42__123__Pendulum_v1.yaml @@ -1,16 +1,18 @@ cumulative_reward: - max: '-4.319e-01' - mean: '-5.755e+02' + max: '-3.978e-01' + mean: '-5.231e+02' min: '-1.872e+03' shape: + - 2 - 76 - 128 - sum: '-5.599e+06' + sum: '-1.018e+07' episode_length: max: 200 mean: '2.e+02' min: 200 shape: + - 2 - 76 - 128 - sum: 1945600 + sum: 3891200 diff --git a/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml new file mode 100644 index 00000000..e1932620 --- /dev/null +++ 
b/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml @@ -0,0 +1,3286 @@ +batch.attention_mask: + device: cuda:0 + max: 1 + mean: '1.e+00' + min: 1 + shape: + - 8 + - 256 + sum: 2048 +batch.input_ids: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +batch.labels: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +grads.network.model.decoder.embed_positions.weight: + device: cuda:0 + max: '2.549e-02' + mean: '2.795e-07' + min: '-2.530e-02' + shape: + - 2050 + - 1024 + sum: '5.867e-01' +grads.network.model.decoder.embed_tokens.weight: + device: cuda:0 + max: '7.65e-01' + mean: '-2.928e-07' + min: '-9.832e-01' + shape: + - 50272 + - 512 + sum: '-7.537e+00' +grads.network.model.decoder.layers.0.fc1.bias: + device: cuda:0 + max: '2.624e-03' + mean: '-2.445e-06' + min: '-8.882e-03' + shape: + - 4096 + sum: '-1.001e-02' +grads.network.model.decoder.layers.0.fc1.weight: + device: cuda:0 + max: '8.724e-02' + mean: '4.963e-09' + min: '-1.222e-01' + shape: + - 4096 + - 1024 + sum: '2.082e-02' +grads.network.model.decoder.layers.0.fc2.bias: + device: cuda:0 + max: '1.031e-02' + mean: '7.276e-12' + min: '-1.265e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.0.fc2.weight: + device: cuda:0 + max: '1.836e-02' + mean: '0.e+00' + min: '-1.480e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.0.final_layer_norm.bias: + device: cuda:0 + max: '1.124e-02' + mean: '2.244e-06' + min: '-1.343e-02' + shape: + - 1024 + sum: '2.298e-03' +grads.network.model.decoder.layers.0.final_layer_norm.weight: + device: cuda:0 + max: '9.238e-03' + mean: '-1.765e-05' + min: '-5.406e-02' + shape: + - 1024 + sum: '-1.807e-02' +grads.network.model.decoder.layers.0.self_attn.k_proj.bias: + device: cuda:0 + max: '1.455e-10' + mean: '1.036e-12' + min: '-1.673e-10' + shape: + - 1024 + sum: '1.061e-09' +grads.network.model.decoder.layers.0.self_attn.k_proj.weight: + device: cuda:0 + max: '1.895e-04' + mean: '6.07e-11' + min: '-1.679e-04' + shape: + - 1024 + - 1024 + sum: '6.365e-05' +grads.network.model.decoder.layers.0.self_attn.out_proj.bias: + device: cuda:0 + max: '2.459e-01' + mean: '-8.149e-10' + min: '-2.594e-01' + shape: + - 1024 + sum: '-8.345e-07' +grads.network.model.decoder.layers.0.self_attn.out_proj.weight: + device: cuda:0 + max: '7.433e-03' + mean: '1.705e-13' + min: '-7.011e-03' + shape: + - 1024 + - 1024 + sum: '1.788e-07' +grads.network.model.decoder.layers.0.self_attn.q_proj.bias: + device: cuda:0 + max: '4.872e-04' + mean: '3.458e-07' + min: '-5.13e-04' + shape: + - 1024 + sum: '3.541e-04' +grads.network.model.decoder.layers.0.self_attn.q_proj.weight: + device: cuda:0 + max: '3.873e-04' + mean: '3.472e-09' + min: '-4.093e-04' + shape: + - 1024 + - 1024 + sum: '3.641e-03' +grads.network.model.decoder.layers.0.self_attn.v_proj.bias: + device: cuda:0 + max: '1.222e-01' + mean: '5.112e-04' + min: '-1.374e-01' + shape: + - 1024 + sum: '5.235e-01' +grads.network.model.decoder.layers.0.self_attn.v_proj.weight: + device: cuda:0 + max: '7.942e-02' + mean: '3.069e-07' + min: '-7.008e-02' + shape: + - 1024 + - 1024 + sum: '3.218e-01' +grads.network.model.decoder.layers.0.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.182e-02' + mean: '-1.809e-05' + min: '-1.26e-02' + shape: + - 1024 + sum: '-1.852e-02' +grads.network.model.decoder.layers.0.self_attn_layer_norm.weight: + device: cuda:0 + max: 
'9.642e-03' + mean: '-9.916e-07' + min: '-4.965e-02' + shape: + - 1024 + sum: '-1.015e-03' +grads.network.model.decoder.layers.1.fc1.bias: + device: cuda:0 + max: '5.562e-03' + mean: '-1.470e-06' + min: '-7.369e-03' + shape: + - 4096 + sum: '-6.023e-03' +grads.network.model.decoder.layers.1.fc1.weight: + device: cuda:0 + max: '6.877e-02' + mean: '2.984e-09' + min: '-9.409e-02' + shape: + - 4096 + - 1024 + sum: '1.251e-02' +grads.network.model.decoder.layers.1.fc2.bias: + device: cuda:0 + max: '1.038e-02' + mean: '1.819e-11' + min: '-1.155e-02' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.1.fc2.weight: + device: cuda:0 + max: '1.431e-02' + mean: '2.558e-13' + min: '-1.138e-02' + shape: + - 1024 + - 4096 + sum: '1.073e-06' +grads.network.model.decoder.layers.1.final_layer_norm.bias: + device: cuda:0 + max: '1.17e-02' + mean: '-9.708e-05' + min: '-1.293e-02' + shape: + - 1024 + sum: '-9.941e-02' +grads.network.model.decoder.layers.1.final_layer_norm.weight: + device: cuda:0 + max: '1.304e-02' + mean: '1.814e-05' + min: '-3.518e-02' + shape: + - 1024 + sum: '1.858e-02' +grads.network.model.decoder.layers.1.self_attn.k_proj.bias: + device: cuda:0 + max: '6.403e-10' + mean: '6.279e-13' + min: '-1.397e-09' + shape: + - 1024 + sum: '6.430e-10' +grads.network.model.decoder.layers.1.self_attn.k_proj.weight: + device: cuda:0 + max: '3.312e-02' + mean: '3.22e-15' + min: '-3.174e-02' + shape: + - 1024 + - 1024 + sum: '3.376e-09' +grads.network.model.decoder.layers.1.self_attn.out_proj.bias: + device: cuda:0 + max: '9.799e-03' + mean: '2.183e-11' + min: '-1.048e-02' + shape: + - 1024 + sum: '2.235e-08' +grads.network.model.decoder.layers.1.self_attn.out_proj.weight: + device: cuda:0 + max: '1.020e-02' + mean: '-1.705e-13' + min: '-1.033e-02' + shape: + - 1024 + - 1024 + sum: '-1.788e-07' +grads.network.model.decoder.layers.1.self_attn.q_proj.bias: + device: cuda:0 + max: '1.236e-03' + mean: '-3.821e-06' + min: '-2.06e-03' + shape: + - 1024 + sum: '-3.913e-03' +grads.network.model.decoder.layers.1.self_attn.q_proj.weight: + device: cuda:0 + max: '1.833e-02' + mean: '-2.680e-08' + min: '-1.194e-02' + shape: + - 1024 + - 1024 + sum: '-2.811e-02' +grads.network.model.decoder.layers.1.self_attn.v_proj.bias: + device: cuda:0 + max: '1.296e-02' + mean: '1.047e-04' + min: '-9.251e-03' + shape: + - 1024 + sum: '1.072e-01' +grads.network.model.decoder.layers.1.self_attn.v_proj.weight: + device: cuda:0 + max: '2.234e-01' + mean: '7.347e-07' + min: '-1.650e-01' + shape: + - 1024 + - 1024 + sum: '7.704e-01' +grads.network.model.decoder.layers.1.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.000e-02' + mean: '-4.235e-05' + min: '-1.078e-02' + shape: + - 1024 + sum: '-4.337e-02' +grads.network.model.decoder.layers.1.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.163e-02' + mean: '5.549e-06' + min: '-3.955e-02' + shape: + - 1024 + sum: '5.682e-03' +grads.network.model.decoder.layers.10.fc1.bias: + device: cuda:0 + max: '1.167e-02' + mean: '-1.093e-05' + min: '-4.407e-03' + shape: + - 4096 + sum: '-4.475e-02' +grads.network.model.decoder.layers.10.fc1.weight: + device: cuda:0 + max: '1.255e-01' + mean: '-1.298e-08' + min: '-2.335e-01' + shape: + - 4096 + - 1024 + sum: '-5.445e-02' +grads.network.model.decoder.layers.10.fc2.bias: + device: cuda:0 + max: '9.324e-03' + mean: '3.638e-12' + min: '-9.376e-03' + shape: + - 1024 + sum: '3.725e-09' +grads.network.model.decoder.layers.10.fc2.weight: + device: cuda:0 + max: '1.888e-02' + mean: '1.137e-13' + min: '-1.95e-02' + shape: + - 
1024 + - 4096 + sum: '4.768e-07' +grads.network.model.decoder.layers.10.final_layer_norm.bias: + device: cuda:0 + max: '1.063e-02' + mean: '1.763e-04' + min: '-1.049e-02' + shape: + - 1024 + sum: '1.805e-01' +grads.network.model.decoder.layers.10.final_layer_norm.weight: + device: cuda:0 + max: '1.245e-02' + mean: '1.566e-05' + min: '-1.95e-02' + shape: + - 1024 + sum: '1.604e-02' +grads.network.model.decoder.layers.10.self_attn.k_proj.bias: + device: cuda:0 + max: '1.863e-09' + mean: '-8.787e-12' + min: '-1.164e-09' + shape: + - 1024 + sum: '-8.998e-09' +grads.network.model.decoder.layers.10.self_attn.k_proj.weight: + device: cuda:0 + max: '1.065e-01' + mean: '1.164e-13' + min: '-1.330e-01' + shape: + - 1024 + - 1024 + sum: '1.220e-07' +grads.network.model.decoder.layers.10.self_attn.out_proj.bias: + device: cuda:0 + max: '8.365e-03' + mean: '1.819e-11' + min: '-8.918e-03' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.10.self_attn.out_proj.weight: + device: cuda:0 + max: '7.876e-03' + mean: '3.126e-13' + min: '-7.644e-03' + shape: + - 1024 + - 1024 + sum: '3.278e-07' +grads.network.model.decoder.layers.10.self_attn.q_proj.bias: + device: cuda:0 + max: '3.907e-03' + mean: '-1.607e-05' + min: '-4.692e-03' + shape: + - 1024 + sum: '-1.645e-02' +grads.network.model.decoder.layers.10.self_attn.q_proj.weight: + device: cuda:0 + max: '3.358e-02' + mean: '1.291e-07' + min: '-4.45e-02' + shape: + - 1024 + - 1024 + sum: '1.354e-01' +grads.network.model.decoder.layers.10.self_attn.v_proj.bias: + device: cuda:0 + max: '9.312e-03' + mean: '-8.616e-05' + min: '-9.148e-03' + shape: + - 1024 + sum: '-8.822e-02' +grads.network.model.decoder.layers.10.self_attn.v_proj.weight: + device: cuda:0 + max: '2.466e-01' + mean: '6.922e-07' + min: '-2.438e-01' + shape: + - 1024 + - 1024 + sum: '7.259e-01' +grads.network.model.decoder.layers.10.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.563e-03' + mean: '-2.205e-05' + min: '-9.231e-03' + shape: + - 1024 + sum: '-2.258e-02' +grads.network.model.decoder.layers.10.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.004e-02' + mean: '8.82e-06' + min: '-2.064e-02' + shape: + - 1024 + sum: '9.032e-03' +grads.network.model.decoder.layers.11.fc1.bias: + device: cuda:0 + max: '4.537e-03' + mean: '-1.97e-05' + min: '-1.077e-02' + shape: + - 4096 + sum: '-8.069e-02' +grads.network.model.decoder.layers.11.fc1.weight: + device: cuda:0 + max: '1.921e-01' + mean: '-8.097e-08' + min: '-1.258e-01' + shape: + - 4096 + - 1024 + sum: '-3.396e-01' +grads.network.model.decoder.layers.11.fc2.bias: + device: cuda:0 + max: '9.747e-03' + mean: '0.e+00' + min: '-1.146e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.11.fc2.weight: + device: cuda:0 + max: '2.297e-02' + mean: '-2.274e-13' + min: '-2.611e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.11.final_layer_norm.bias: + device: cuda:0 + max: '1.074e-02' + mean: '-1.697e-04' + min: '-1.309e-02' + shape: + - 1024 + sum: '-1.738e-01' +grads.network.model.decoder.layers.11.final_layer_norm.weight: + device: cuda:0 + max: '4.611e-02' + mean: '-1.405e-05' + min: '-1.679e-02' + shape: + - 1024 + sum: '-1.439e-02' +grads.network.model.decoder.layers.11.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '3.897e-12' + min: '-5.239e-10' + shape: + - 1024 + sum: '3.990e-09' +grads.network.model.decoder.layers.11.self_attn.k_proj.weight: + device: cuda:0 + max: '3.695e-02' + mean: '-2.855e-13' + min: '-3.176e-02' + shape: + - 
1024 + - 1024 + sum: '-2.994e-07' +grads.network.model.decoder.layers.11.self_attn.out_proj.bias: + device: cuda:0 + max: '1.050e-02' + mean: '1.819e-12' + min: '-1.04e-02' + shape: + - 1024 + sum: '1.863e-09' +grads.network.model.decoder.layers.11.self_attn.out_proj.weight: + device: cuda:0 + max: '4.005e-03' + mean: '-4.619e-14' + min: '-3.44e-03' + shape: + - 1024 + - 1024 + sum: '-4.843e-08' +grads.network.model.decoder.layers.11.self_attn.q_proj.bias: + device: cuda:0 + max: '1.21e-03' + mean: '-1.349e-05' + min: '-2.133e-03' + shape: + - 1024 + sum: '-1.382e-02' +grads.network.model.decoder.layers.11.self_attn.q_proj.weight: + device: cuda:0 + max: '2.495e-02' + mean: '1.265e-07' + min: '-2.483e-02' + shape: + - 1024 + - 1024 + sum: '1.326e-01' +grads.network.model.decoder.layers.11.self_attn.v_proj.bias: + device: cuda:0 + max: '9.094e-03' + mean: '-1.657e-05' + min: '-1.120e-02' + shape: + - 1024 + sum: '-1.697e-02' +grads.network.model.decoder.layers.11.self_attn.v_proj.weight: + device: cuda:0 + max: '2.806e-01' + mean: '1.554e-07' + min: '-2.307e-01' + shape: + - 1024 + - 1024 + sum: '1.629e-01' +grads.network.model.decoder.layers.11.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.090e-02' + mean: '4.103e-05' + min: '-1.074e-02' + shape: + - 1024 + sum: '4.202e-02' +grads.network.model.decoder.layers.11.self_attn_layer_norm.weight: + device: cuda:0 + max: '9.913e-03' + mean: '8.734e-06' + min: '-2.563e-02' + shape: + - 1024 + sum: '8.943e-03' +grads.network.model.decoder.layers.12.fc1.bias: + device: cuda:0 + max: '4.174e-03' + mean: '-9.494e-06' + min: '-5.266e-03' + shape: + - 4096 + sum: '-3.889e-02' +grads.network.model.decoder.layers.12.fc1.weight: + device: cuda:0 + max: '1.308e-01' + mean: '-4.169e-08' + min: '-1.225e-01' + shape: + - 4096 + - 1024 + sum: '-1.749e-01' +grads.network.model.decoder.layers.12.fc2.bias: + device: cuda:0 + max: '9.381e-03' + mean: '0.e+00' + min: '-9.925e-03' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.12.fc2.weight: + device: cuda:0 + max: '1.477e-02' + mean: '-1.137e-13' + min: '-1.799e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.12.final_layer_norm.bias: + device: cuda:0 + max: '1.085e-02' + mean: '-6.289e-05' + min: '-1.164e-02' + shape: + - 1024 + sum: '-6.440e-02' +grads.network.model.decoder.layers.12.final_layer_norm.weight: + device: cuda:0 + max: '2.347e-02' + mean: '1.717e-05' + min: '-3.135e-02' + shape: + - 1024 + sum: '1.758e-02' +grads.network.model.decoder.layers.12.self_attn.k_proj.bias: + device: cuda:0 + max: '6.694e-10' + mean: '8.309e-13' + min: '-4.948e-10' + shape: + - 1024 + sum: '8.508e-10' +grads.network.model.decoder.layers.12.self_attn.k_proj.weight: + device: cuda:0 + max: '7.397e-02' + mean: '-2.175e-13' + min: '-9.768e-02' + shape: + - 1024 + - 1024 + sum: '-2.281e-07' +grads.network.model.decoder.layers.12.self_attn.out_proj.bias: + device: cuda:0 + max: '9.249e-03' + mean: '-7.276e-12' + min: '-9.731e-03' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.12.self_attn.out_proj.weight: + device: cuda:0 + max: '4.412e-03' + mean: '1.421e-13' + min: '-4.588e-03' + shape: + - 1024 + - 1024 + sum: '1.490e-07' +grads.network.model.decoder.layers.12.self_attn.q_proj.bias: + device: cuda:0 + max: '3.407e-03' + mean: '2.445e-05' + min: '-1.779e-03' + shape: + - 1024 + sum: '2.504e-02' +grads.network.model.decoder.layers.12.self_attn.q_proj.weight: + device: cuda:0 + max: '4.225e-02' + mean: '-3.557e-07' + min: '-4.189e-02' 
+ shape: + - 1024 + - 1024 + sum: '-3.729e-01' +grads.network.model.decoder.layers.12.self_attn.v_proj.bias: + device: cuda:0 + max: '8.426e-03' + mean: '2.616e-05' + min: '-1.041e-02' + shape: + - 1024 + sum: '2.679e-02' +grads.network.model.decoder.layers.12.self_attn.v_proj.weight: + device: cuda:0 + max: '2.573e-01' + mean: '-3.806e-07' + min: '-2.223e-01' + shape: + - 1024 + - 1024 + sum: '-3.990e-01' +grads.network.model.decoder.layers.12.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.540e-03' + mean: '1.539e-05' + min: '-1.009e-02' + shape: + - 1024 + sum: '1.576e-02' +grads.network.model.decoder.layers.12.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.112e-02' + mean: '6.956e-06' + min: '-3.292e-02' + shape: + - 1024 + sum: '7.123e-03' +grads.network.model.decoder.layers.13.fc1.bias: + device: cuda:0 + max: '4.255e-03' + mean: '-6.284e-06' + min: '-3.659e-03' + shape: + - 4096 + sum: '-2.574e-02' +grads.network.model.decoder.layers.13.fc1.weight: + device: cuda:0 + max: '9.864e-02' + mean: '-1.925e-08' + min: '-8.668e-02' + shape: + - 4096 + - 1024 + sum: '-8.074e-02' +grads.network.model.decoder.layers.13.fc2.bias: + device: cuda:0 + max: '8.901e-03' + mean: '-9.095e-12' + min: '-9.272e-03' + shape: + - 1024 + sum: '-9.313e-09' +grads.network.model.decoder.layers.13.fc2.weight: + device: cuda:0 + max: '9.958e-03' + mean: '-1.137e-13' + min: '-1.159e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.13.final_layer_norm.bias: + device: cuda:0 + max: '1.098e-02' + mean: '1.136e-04' + min: '-1.088e-02' + shape: + - 1024 + sum: '1.163e-01' +grads.network.model.decoder.layers.13.final_layer_norm.weight: + device: cuda:0 + max: '3.056e-02' + mean: '2.505e-06' + min: '-2.49e-02' + shape: + - 1024 + sum: '2.565e-03' +grads.network.model.decoder.layers.13.self_attn.k_proj.bias: + device: cuda:0 + max: '3.056e-10' + mean: '-3.326e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '-3.406e-09' +grads.network.model.decoder.layers.13.self_attn.k_proj.weight: + device: cuda:0 + max: '3.654e-02' + mean: '2.432e-13' + min: '-4.357e-02' + shape: + - 1024 + - 1024 + sum: '2.551e-07' +grads.network.model.decoder.layers.13.self_attn.out_proj.bias: + device: cuda:0 + max: '7.424e-03' + mean: '-3.638e-12' + min: '-9.317e-03' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.13.self_attn.out_proj.weight: + device: cuda:0 + max: '3.228e-03' + mean: '7.105e-14' + min: '-2.774e-03' + shape: + - 1024 + - 1024 + sum: '7.451e-08' +grads.network.model.decoder.layers.13.self_attn.q_proj.bias: + device: cuda:0 + max: '2.412e-03' + mean: '1.546e-05' + min: '-1.678e-03' + shape: + - 1024 + sum: '1.583e-02' +grads.network.model.decoder.layers.13.self_attn.q_proj.weight: + device: cuda:0 + max: '1.646e-02' + mean: '-2.364e-07' + min: '-1.986e-02' + shape: + - 1024 + - 1024 + sum: '-2.479e-01' +grads.network.model.decoder.layers.13.self_attn.v_proj.bias: + device: cuda:0 + max: '9.358e-03' + mean: '-2.785e-05' + min: '-8.192e-03' + shape: + - 1024 + sum: '-2.851e-02' +grads.network.model.decoder.layers.13.self_attn.v_proj.weight: + device: cuda:0 + max: '2.093e-01' + mean: '4.26e-07' + min: '-2.454e-01' + shape: + - 1024 + - 1024 + sum: '4.467e-01' +grads.network.model.decoder.layers.13.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.755e-03' + mean: '4.027e-05' + min: '-9.616e-03' + shape: + - 1024 + sum: '4.124e-02' +grads.network.model.decoder.layers.13.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.237e-02' + mean: 
'2.634e-06' + min: '-3.056e-02' + shape: + - 1024 + sum: '2.697e-03' +grads.network.model.decoder.layers.14.fc1.bias: + device: cuda:0 + max: '3.368e-03' + mean: '-4.94e-06' + min: '-4.024e-03' + shape: + - 4096 + sum: '-2.023e-02' +grads.network.model.decoder.layers.14.fc1.weight: + device: cuda:0 + max: '1.023e-01' + mean: '-4.683e-09' + min: '-8.753e-02' + shape: + - 4096 + - 1024 + sum: '-1.964e-02' +grads.network.model.decoder.layers.14.fc2.bias: + device: cuda:0 + max: '9.881e-03' + mean: '-2.183e-11' + min: '-9.016e-03' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.14.fc2.weight: + device: cuda:0 + max: '1.668e-02' + mean: '-1.592e-12' + min: '-1.498e-02' + shape: + - 1024 + - 4096 + sum: '-6.676e-06' +grads.network.model.decoder.layers.14.final_layer_norm.bias: + device: cuda:0 + max: '1.219e-02' + mean: '2.743e-05' + min: '-1.083e-02' + shape: + - 1024 + sum: '2.809e-02' +grads.network.model.decoder.layers.14.final_layer_norm.weight: + device: cuda:0 + max: '1.590e-02' + mean: '-4.36e-06' + min: '-3.127e-02' + shape: + - 1024 + sum: '-4.464e-03' +grads.network.model.decoder.layers.14.self_attn.k_proj.bias: + device: cuda:0 + max: '3.929e-10' + mean: '-2.173e-12' + min: '-3.056e-10' + shape: + - 1024 + sum: '-2.226e-09' +grads.network.model.decoder.layers.14.self_attn.k_proj.weight: + device: cuda:0 + max: '5.135e-02' + mean: '-5.795e-14' + min: '-4.326e-02' + shape: + - 1024 + - 1024 + sum: '-6.077e-08' +grads.network.model.decoder.layers.14.self_attn.out_proj.bias: + device: cuda:0 + max: '9.779e-03' + mean: '9.095e-12' + min: '-8.985e-03' + shape: + - 1024 + sum: '9.313e-09' +grads.network.model.decoder.layers.14.self_attn.out_proj.weight: + device: cuda:0 + max: '2.521e-03' + mean: '-2.842e-14' + min: '-2.492e-03' + shape: + - 1024 + - 1024 + sum: '-2.980e-08' +grads.network.model.decoder.layers.14.self_attn.q_proj.bias: + device: cuda:0 + max: '2.483e-03' + mean: '-2.104e-05' + min: '-4.766e-03' + shape: + - 1024 + sum: '-2.155e-02' +grads.network.model.decoder.layers.14.self_attn.q_proj.weight: + device: cuda:0 + max: '3.591e-02' + mean: '4.924e-07' + min: '-2.957e-02' + shape: + - 1024 + - 1024 + sum: '5.163e-01' +grads.network.model.decoder.layers.14.self_attn.v_proj.bias: + device: cuda:0 + max: '8.477e-03' + mean: '1.055e-04' + min: '-8.184e-03' + shape: + - 1024 + sum: '1.081e-01' +grads.network.model.decoder.layers.14.self_attn.v_proj.weight: + device: cuda:0 + max: '2.027e-01' + mean: '-2.47e-06' + min: '-2.218e-01' + shape: + - 1024 + - 1024 + sum: '-2.59e+00' +grads.network.model.decoder.layers.14.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.029e-02' + mean: '4.850e-05' + min: '-9.323e-03' + shape: + - 1024 + sum: '4.967e-02' +grads.network.model.decoder.layers.14.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.910e-02' + mean: '5.651e-06' + min: '-3.208e-02' + shape: + - 1024 + sum: '5.786e-03' +grads.network.model.decoder.layers.15.fc1.bias: + device: cuda:0 + max: '5.394e-03' + mean: '-1.012e-05' + min: '-6.176e-03' + shape: + - 4096 + sum: '-4.146e-02' +grads.network.model.decoder.layers.15.fc1.weight: + device: cuda:0 + max: '8.324e-02' + mean: '-1.046e-08' + min: '-1.047e-01' + shape: + - 4096 + - 1024 + sum: '-4.386e-02' +grads.network.model.decoder.layers.15.fc2.bias: + device: cuda:0 + max: '9.866e-03' + mean: '-7.276e-12' + min: '-1.172e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.15.fc2.weight: + device: cuda:0 + max: '1.37e-02' + mean: '-5.684e-13' + min: '-1.439e-02' + shape: 
+ - 1024 + - 4096 + sum: '-2.384e-06' +grads.network.model.decoder.layers.15.final_layer_norm.bias: + device: cuda:0 + max: '1.231e-02' + mean: '-1.332e-04' + min: '-1.468e-02' + shape: + - 1024 + sum: '-1.364e-01' +grads.network.model.decoder.layers.15.final_layer_norm.weight: + device: cuda:0 + max: '3.634e-02' + mean: '1.128e-05' + min: '-3.444e-02' + shape: + - 1024 + sum: '1.155e-02' +grads.network.model.decoder.layers.15.self_attn.k_proj.bias: + device: cuda:0 + max: '1.164e-09' + mean: '3.457e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '3.54e-09' +grads.network.model.decoder.layers.15.self_attn.k_proj.weight: + device: cuda:0 + max: '3.154e-02' + mean: '4.652e-14' + min: '-2.124e-02' + shape: + - 1024 + - 1024 + sum: '4.878e-08' +grads.network.model.decoder.layers.15.self_attn.out_proj.bias: + device: cuda:0 + max: '9.871e-03' + mean: '-1.455e-11' + min: '-9.811e-03' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.15.self_attn.out_proj.weight: + device: cuda:0 + max: '4.353e-03' + mean: '1.421e-14' + min: '-4.717e-03' + shape: + - 1024 + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.15.self_attn.q_proj.bias: + device: cuda:0 + max: '1.886e-03' + mean: '2.190e-05' + min: '-2.335e-03' + shape: + - 1024 + sum: '2.243e-02' +grads.network.model.decoder.layers.15.self_attn.q_proj.weight: + device: cuda:0 + max: '2.037e-02' + mean: '-4.754e-07' + min: '-2.289e-02' + shape: + - 1024 + - 1024 + sum: '-4.985e-01' +grads.network.model.decoder.layers.15.self_attn.v_proj.bias: + device: cuda:0 + max: '7.805e-03' + mean: '-4.434e-05' + min: '-9.824e-03' + shape: + - 1024 + sum: '-4.541e-02' +grads.network.model.decoder.layers.15.self_attn.v_proj.weight: + device: cuda:0 + max: '1.984e-01' + mean: '9.627e-07' + min: '-1.703e-01' + shape: + - 1024 + - 1024 + sum: '1.009e+00' +grads.network.model.decoder.layers.15.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.079e-02' + mean: '1.138e-04' + min: '-1.047e-02' + shape: + - 1024 + sum: '1.165e-01' +grads.network.model.decoder.layers.15.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.985e-02' + mean: '-3.775e-06' + min: '-3.666e-02' + shape: + - 1024 + sum: '-3.866e-03' +grads.network.model.decoder.layers.16.fc1.bias: + device: cuda:0 + max: '4.077e-03' + mean: '2.515e-06' + min: '-4.591e-03' + shape: + - 4096 + sum: '1.030e-02' +grads.network.model.decoder.layers.16.fc1.weight: + device: cuda:0 + max: '1.095e-01' + mean: '2.903e-09' + min: '-1.061e-01' + shape: + - 4096 + - 1024 + sum: '1.218e-02' +grads.network.model.decoder.layers.16.fc2.bias: + device: cuda:0 + max: '1.072e-02' + mean: '0.e+00' + min: '-1.028e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.16.fc2.weight: + device: cuda:0 + max: '2.759e-02' + mean: '0.e+00' + min: '-2.188e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.16.final_layer_norm.bias: + device: cuda:0 + max: '1.385e-02' + mean: '3.693e-04' + min: '-1.169e-02' + shape: + - 1024 + sum: '3.781e-01' +grads.network.model.decoder.layers.16.final_layer_norm.weight: + device: cuda:0 + max: '2.044e-02' + mean: '-2.249e-06' + min: '-2.405e-02' + shape: + - 1024 + sum: '-2.303e-03' +grads.network.model.decoder.layers.16.self_attn.k_proj.bias: + device: cuda:0 + max: '4.657e-10' + mean: '-1.148e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '-1.176e-09' +grads.network.model.decoder.layers.16.self_attn.k_proj.weight: + device: cuda:0 + max: '2.442e-02' + mean: '7.527e-14' + min: '-2.925e-02' + shape: + - 1024 
+ - 1024 + sum: '7.893e-08' +grads.network.model.decoder.layers.16.self_attn.out_proj.bias: + device: cuda:0 + max: '8.875e-03' + mean: '0.e+00' + min: '-9.845e-03' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.16.self_attn.out_proj.weight: + device: cuda:0 + max: '2.749e-03' + mean: '-1.563e-13' + min: '-2.783e-03' + shape: + - 1024 + - 1024 + sum: '-1.639e-07' +grads.network.model.decoder.layers.16.self_attn.q_proj.bias: + device: cuda:0 + max: '1.541e-03' + mean: '-7.89e-06' + min: '-2.125e-03' + shape: + - 1024 + sum: '-8.079e-03' +grads.network.model.decoder.layers.16.self_attn.q_proj.weight: + device: cuda:0 + max: '2.979e-02' + mean: '1.649e-07' + min: '-3.029e-02' + shape: + - 1024 + - 1024 + sum: '1.729e-01' +grads.network.model.decoder.layers.16.self_attn.v_proj.bias: + device: cuda:0 + max: '9.657e-03' + mean: '-1.308e-04' + min: '-9.640e-03' + shape: + - 1024 + sum: '-1.339e-01' +grads.network.model.decoder.layers.16.self_attn.v_proj.weight: + device: cuda:0 + max: '2.179e-01' + mean: '2.732e-06' + min: '-2.213e-01' + shape: + - 1024 + - 1024 + sum: '2.865e+00' +grads.network.model.decoder.layers.16.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.162e-03' + mean: '-9.535e-05' + min: '-1.059e-02' + shape: + - 1024 + sum: '-9.764e-02' +grads.network.model.decoder.layers.16.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.578e-02' + mean: '9.235e-06' + min: '-2.987e-02' + shape: + - 1024 + sum: '9.457e-03' +grads.network.model.decoder.layers.17.fc1.bias: + device: cuda:0 + max: '6.044e-03' + mean: '2.890e-06' + min: '-6.564e-03' + shape: + - 4096 + sum: '1.184e-02' +grads.network.model.decoder.layers.17.fc1.weight: + device: cuda:0 + max: '1.345e-01' + mean: '5.029e-10' + min: '-1.541e-01' + shape: + - 4096 + - 1024 + sum: '2.109e-03' +grads.network.model.decoder.layers.17.fc2.bias: + device: cuda:0 + max: '1.305e-02' + mean: '0.e+00' + min: '-1.607e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.17.fc2.weight: + device: cuda:0 + max: '2.616e-02' + mean: '0.e+00' + min: '-3.049e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.17.final_layer_norm.bias: + device: cuda:0 + max: '1.535e-02' + mean: '-2.257e-04' + min: '-1.923e-02' + shape: + - 1024 + sum: '-2.311e-01' +grads.network.model.decoder.layers.17.final_layer_norm.weight: + device: cuda:0 + max: '3.850e-02' + mean: '2.985e-05' + min: '-2.193e-02' + shape: + - 1024 + sum: '3.056e-02' +grads.network.model.decoder.layers.17.self_attn.k_proj.bias: + device: cuda:0 + max: '3.201e-10' + mean: '1.170e-12' + min: '-2.183e-10' + shape: + - 1024 + sum: '1.198e-09' +grads.network.model.decoder.layers.17.self_attn.k_proj.weight: + device: cuda:0 + max: '1.88e-02' + mean: '1.493e-13' + min: '-1.416e-02' + shape: + - 1024 + - 1024 + sum: '1.566e-07' +grads.network.model.decoder.layers.17.self_attn.out_proj.bias: + device: cuda:0 + max: '1.277e-02' + mean: '-1.455e-11' + min: '-1.398e-02' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.17.self_attn.out_proj.weight: + device: cuda:0 + max: '3.332e-03' + mean: '9.592e-14' + min: '-4.020e-03' + shape: + - 1024 + - 1024 + sum: '1.006e-07' +grads.network.model.decoder.layers.17.self_attn.q_proj.bias: + device: cuda:0 + max: '8.169e-04' + mean: '1.575e-07' + min: '-1.763e-03' + shape: + - 1024 + sum: '1.613e-04' +grads.network.model.decoder.layers.17.self_attn.q_proj.weight: + device: cuda:0 + max: '2.347e-02' + mean: '-2.684e-09' + min: '-1.066e-02' + shape: + - 1024 + - 
1024 + sum: '-2.815e-03' +grads.network.model.decoder.layers.17.self_attn.v_proj.bias: + device: cuda:0 + max: '1.098e-02' + mean: '-1.444e-05' + min: '-1.304e-02' + shape: + - 1024 + sum: '-1.479e-02' +grads.network.model.decoder.layers.17.self_attn.v_proj.weight: + device: cuda:0 + max: '3.683e-01' + mean: '2.462e-07' + min: '-3.150e-01' + shape: + - 1024 + - 1024 + sum: '2.581e-01' +grads.network.model.decoder.layers.17.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.358e-02' + mean: '-5.711e-06' + min: '-1.483e-02' + shape: + - 1024 + sum: '-5.848e-03' +grads.network.model.decoder.layers.17.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.098e-02' + mean: '3.371e-06' + min: '-1.99e-02' + shape: + - 1024 + sum: '3.452e-03' +grads.network.model.decoder.layers.18.fc1.bias: + device: cuda:0 + max: '1.147e-02' + mean: '-5.311e-06' + min: '-7.232e-03' + shape: + - 4096 + sum: '-2.175e-02' +grads.network.model.decoder.layers.18.fc1.weight: + device: cuda:0 + max: '1.619e-01' + mean: '-9.185e-09' + min: '-3.223e-01' + shape: + - 4096 + - 1024 + sum: '-3.853e-02' +grads.network.model.decoder.layers.18.fc2.bias: + device: cuda:0 + max: '1.429e-02' + mean: '0.e+00' + min: '-1.499e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.18.fc2.weight: + device: cuda:0 + max: '2.821e-02' + mean: '-2.274e-13' + min: '-2.067e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.18.final_layer_norm.bias: + device: cuda:0 + max: '1.670e-02' + mean: '2.067e-04' + min: '-1.701e-02' + shape: + - 1024 + sum: '2.117e-01' +grads.network.model.decoder.layers.18.final_layer_norm.weight: + device: cuda:0 + max: '1.673e-02' + mean: '-3.888e-05' + min: '-1.522e-02' + shape: + - 1024 + sum: '-3.981e-02' +grads.network.model.decoder.layers.18.self_attn.k_proj.bias: + device: cuda:0 + max: '8.731e-10' + mean: '2.129e-12' + min: '-4.075e-10' + shape: + - 1024 + sum: '2.18e-09' +grads.network.model.decoder.layers.18.self_attn.k_proj.weight: + device: cuda:0 + max: '4.180e-02' + mean: '1.821e-14' + min: '-5.685e-02' + shape: + - 1024 + - 1024 + sum: '1.909e-08' +grads.network.model.decoder.layers.18.self_attn.out_proj.bias: + device: cuda:0 + max: '1.283e-02' + mean: '7.276e-12' + min: '-1.266e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.18.self_attn.out_proj.weight: + device: cuda:0 + max: '2.322e-03' + mean: '2.842e-14' + min: '-2.526e-03' + shape: + - 1024 + - 1024 + sum: '2.980e-08' +grads.network.model.decoder.layers.18.self_attn.q_proj.bias: + device: cuda:0 + max: '5.705e-03' + mean: '-1.891e-05' + min: '-5.284e-03' + shape: + - 1024 + sum: '-1.937e-02' +grads.network.model.decoder.layers.18.self_attn.q_proj.weight: + device: cuda:0 + max: '7.843e-02' + mean: '2.579e-07' + min: '-8.680e-02' + shape: + - 1024 + - 1024 + sum: '2.704e-01' +grads.network.model.decoder.layers.18.self_attn.v_proj.bias: + device: cuda:0 + max: '1.423e-02' + mean: '1.193e-04' + min: '-1.538e-02' + shape: + - 1024 + sum: '1.222e-01' +grads.network.model.decoder.layers.18.self_attn.v_proj.weight: + device: cuda:0 + max: '4.271e-01' + mean: '-1.627e-06' + min: '-3.934e-01' + shape: + - 1024 + - 1024 + sum: '-1.706e+00' +grads.network.model.decoder.layers.18.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.349e-02' + mean: '1.753e-06' + min: '-1.332e-02' + shape: + - 1024 + sum: '1.795e-03' +grads.network.model.decoder.layers.18.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.638e-02' + mean: '1.578e-06' + min: '-1.96e-02' + shape: 
+ - 1024 + sum: '1.616e-03' +grads.network.model.decoder.layers.19.fc1.bias: + device: cuda:0 + max: '1.043e-02' + mean: '3.285e-06' + min: '-8.926e-03' + shape: + - 4096 + sum: '1.346e-02' +grads.network.model.decoder.layers.19.fc1.weight: + device: cuda:0 + max: '2.514e-01' + mean: '1.092e-08' + min: '-2.619e-01' + shape: + - 4096 + - 1024 + sum: '4.581e-02' +grads.network.model.decoder.layers.19.fc2.bias: + device: cuda:0 + max: '1.579e-02' + mean: '7.276e-12' + min: '-1.67e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.19.fc2.weight: + device: cuda:0 + max: '2.852e-02' + mean: '0.e+00' + min: '-2.674e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.19.final_layer_norm.bias: + device: cuda:0 + max: '1.804e-02' + mean: '8.083e-05' + min: '-1.924e-02' + shape: + - 1024 + sum: '8.276e-02' +grads.network.model.decoder.layers.19.final_layer_norm.weight: + device: cuda:0 + max: '2.331e-02' + mean: '-1.504e-05' + min: '-1.230e-02' + shape: + - 1024 + sum: '-1.54e-02' +grads.network.model.decoder.layers.19.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '-1.247e-12' + min: '-4.948e-10' + shape: + - 1024 + sum: '-1.277e-09' +grads.network.model.decoder.layers.19.self_attn.k_proj.weight: + device: cuda:0 + max: '4.950e-02' + mean: '1.668e-13' + min: '-3.336e-02' + shape: + - 1024 + - 1024 + sum: '1.749e-07' +grads.network.model.decoder.layers.19.self_attn.out_proj.bias: + device: cuda:0 + max: '1.443e-02' + mean: '4.366e-11' + min: '-1.464e-02' + shape: + - 1024 + sum: '4.470e-08' +grads.network.model.decoder.layers.19.self_attn.out_proj.weight: + device: cuda:0 + max: '5.047e-03' + mean: '1.137e-13' + min: '-4.323e-03' + shape: + - 1024 + - 1024 + sum: '1.192e-07' +grads.network.model.decoder.layers.19.self_attn.q_proj.bias: + device: cuda:0 + max: '2.846e-03' + mean: '-5.669e-06' + min: '-2.716e-03' + shape: + - 1024 + sum: '-5.805e-03' +grads.network.model.decoder.layers.19.self_attn.q_proj.weight: + device: cuda:0 + max: '5.232e-02' + mean: '7.022e-08' + min: '-5.666e-02' + shape: + - 1024 + - 1024 + sum: '7.363e-02' +grads.network.model.decoder.layers.19.self_attn.v_proj.bias: + device: cuda:0 + max: '1.353e-02' + mean: '-1.046e-04' + min: '-1.307e-02' + shape: + - 1024 + sum: '-1.071e-01' +grads.network.model.decoder.layers.19.self_attn.v_proj.weight: + device: cuda:0 + max: '3.506e-01' + mean: '1.296e-06' + min: '-3.869e-01' + shape: + - 1024 + - 1024 + sum: '1.359e+00' +grads.network.model.decoder.layers.19.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.543e-02' + mean: '1.895e-05' + min: '-1.569e-02' + shape: + - 1024 + sum: '1.941e-02' +grads.network.model.decoder.layers.19.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.44e-02' + mean: '5.186e-07' + min: '-1.104e-02' + shape: + - 1024 + sum: '5.310e-04' +grads.network.model.decoder.layers.2.fc1.bias: + device: cuda:0 + max: '5.921e-03' + mean: '8.856e-06' + min: '-9.619e-03' + shape: + - 4096 + sum: '3.627e-02' +grads.network.model.decoder.layers.2.fc1.weight: + device: cuda:0 + max: '1.109e-01' + mean: '-1.692e-08' + min: '-1.033e-01' + shape: + - 4096 + - 1024 + sum: '-7.098e-02' +grads.network.model.decoder.layers.2.fc2.bias: + device: cuda:0 + max: '8.814e-03' + mean: '1.455e-11' + min: '-9.890e-03' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.2.fc2.weight: + device: cuda:0 + max: '8.03e-03' + mean: '1.705e-13' + min: '-7.305e-03' + shape: + - 1024 + - 4096 + sum: '7.153e-07' 
+grads.network.model.decoder.layers.2.final_layer_norm.bias: + device: cuda:0 + max: '1.062e-02' + mean: '2.142e-05' + min: '-9.885e-03' + shape: + - 1024 + sum: '2.193e-02' +grads.network.model.decoder.layers.2.final_layer_norm.weight: + device: cuda:0 + max: '1.06e-02' + mean: '1.349e-05' + min: '-3.724e-02' + shape: + - 1024 + sum: '1.382e-02' +grads.network.model.decoder.layers.2.self_attn.k_proj.bias: + device: cuda:0 + max: '6.985e-10' + mean: '3.819e-13' + min: '-3.492e-10' + shape: + - 1024 + sum: '3.911e-10' +grads.network.model.decoder.layers.2.self_attn.k_proj.weight: + device: cuda:0 + max: '1.658e-02' + mean: '-6.373e-14' + min: '-1.493e-02' + shape: + - 1024 + - 1024 + sum: '-6.682e-08' +grads.network.model.decoder.layers.2.self_attn.out_proj.bias: + device: cuda:0 + max: '9.061e-03' + mean: '1.455e-11' + min: '-9.315e-03' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.2.self_attn.out_proj.weight: + device: cuda:0 + max: '9.092e-03' + mean: '-1.421e-14' + min: '-8.389e-03' + shape: + - 1024 + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.2.self_attn.q_proj.bias: + device: cuda:0 + max: '1.064e-03' + mean: '4.480e-06' + min: '-1.057e-03' + shape: + - 1024 + sum: '4.588e-03' +grads.network.model.decoder.layers.2.self_attn.q_proj.weight: + device: cuda:0 + max: '9.205e-03' + mean: '3.874e-08' + min: '-1.268e-02' + shape: + - 1024 + - 1024 + sum: '4.063e-02' +grads.network.model.decoder.layers.2.self_attn.v_proj.bias: + device: cuda:0 + max: '8.063e-03' + mean: '3.71e-05' + min: '-6.821e-03' + shape: + - 1024 + sum: '3.799e-02' +grads.network.model.decoder.layers.2.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '3.208e-07' + min: '-1.047e-01' + shape: + - 1024 + - 1024 + sum: '3.364e-01' +grads.network.model.decoder.layers.2.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.170e-03' + mean: '-3.405e-05' + min: '-9.528e-03' + shape: + - 1024 + sum: '-3.486e-02' +grads.network.model.decoder.layers.2.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.376e-02' + mean: '3.953e-06' + min: '-3.395e-02' + shape: + - 1024 + sum: '4.048e-03' +grads.network.model.decoder.layers.20.fc1.bias: + device: cuda:0 + max: '7.671e-03' + mean: '-3.533e-07' + min: '-1.159e-02' + shape: + - 4096 + sum: '-1.447e-03' +grads.network.model.decoder.layers.20.fc1.weight: + device: cuda:0 + max: '3.498e-01' + mean: '-1.061e-09' + min: '-2.271e-01' + shape: + - 4096 + - 1024 + sum: '-4.449e-03' +grads.network.model.decoder.layers.20.fc2.bias: + device: cuda:0 + max: '1.901e-02' + mean: '-1.455e-11' + min: '-1.83e-02' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.20.fc2.weight: + device: cuda:0 + max: '8.356e-02' + mean: '5.684e-14' + min: '-8.36e-02' + shape: + - 1024 + - 4096 + sum: '2.384e-07' +grads.network.model.decoder.layers.20.final_layer_norm.bias: + device: cuda:0 + max: '2.215e-02' + mean: '2.282e-04' + min: '-2.103e-02' + shape: + - 1024 + sum: '2.337e-01' +grads.network.model.decoder.layers.20.final_layer_norm.weight: + device: cuda:0 + max: '2.260e-02' + mean: '-2.262e-05' + min: '-1.660e-02' + shape: + - 1024 + sum: '-2.316e-02' +grads.network.model.decoder.layers.20.self_attn.k_proj.bias: + device: cuda:0 + max: '3.492e-10' + mean: '1.942e-12' + min: '-3.347e-10' + shape: + - 1024 + sum: '1.989e-09' +grads.network.model.decoder.layers.20.self_attn.k_proj.weight: + device: cuda:0 + max: '3.529e-02' + mean: '-4.73e-14' + min: '-3.390e-02' + shape: + - 1024 + - 1024 + sum: '-4.959e-08' 
+grads.network.model.decoder.layers.20.self_attn.out_proj.bias: + device: cuda:0 + max: '1.786e-02' + mean: '1.455e-11' + min: '-1.611e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.20.self_attn.out_proj.weight: + device: cuda:0 + max: '8.450e-03' + mean: '-1.243e-14' + min: '-9.957e-03' + shape: + - 1024 + - 1024 + sum: '-1.304e-08' +grads.network.model.decoder.layers.20.self_attn.q_proj.bias: + device: cuda:0 + max: '1.168e-03' + mean: '1.373e-05' + min: '-1.461e-03' + shape: + - 1024 + sum: '1.406e-02' +grads.network.model.decoder.layers.20.self_attn.q_proj.weight: + device: cuda:0 + max: '3.718e-02' + mean: '-1.270e-07' + min: '-3.829e-02' + shape: + - 1024 + - 1024 + sum: '-1.332e-01' +grads.network.model.decoder.layers.20.self_attn.v_proj.bias: + device: cuda:0 + max: '1.316e-02' + mean: '1.595e-04' + min: '-1.22e-02' + shape: + - 1024 + sum: '1.634e-01' +grads.network.model.decoder.layers.20.self_attn.v_proj.weight: + device: cuda:0 + max: '3.578e-01' + mean: '-1.476e-06' + min: '-3.892e-01' + shape: + - 1024 + - 1024 + sum: '-1.548e+00' +grads.network.model.decoder.layers.20.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.886e-02' + mean: '-2.963e-04' + min: '-1.759e-02' + shape: + - 1024 + sum: '-3.034e-01' +grads.network.model.decoder.layers.20.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.024e-02' + mean: '9.812e-07' + min: '-1.449e-02' + shape: + - 1024 + sum: '1.005e-03' +grads.network.model.decoder.layers.21.fc1.bias: + device: cuda:0 + max: '1.159e-02' + mean: '-7.116e-06' + min: '-1.195e-02' + shape: + - 4096 + sum: '-2.915e-02' +grads.network.model.decoder.layers.21.fc1.weight: + device: cuda:0 + max: '3.364e-01' + mean: '-2.245e-08' + min: '-3.275e-01' + shape: + - 4096 + - 1024 + sum: '-9.418e-02' +grads.network.model.decoder.layers.21.fc2.bias: + device: cuda:0 + max: '2.210e-02' + mean: '1.455e-11' + min: '-2.116e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.21.fc2.weight: + device: cuda:0 + max: '1.082e-01' + mean: '-5.684e-14' + min: '-9.473e-02' + shape: + - 1024 + - 4096 + sum: '-2.384e-07' +grads.network.model.decoder.layers.21.final_layer_norm.bias: + device: cuda:0 + max: '2.494e-02' + mean: '2.162e-05' + min: '-2.386e-02' + shape: + - 1024 + sum: '2.214e-02' +grads.network.model.decoder.layers.21.final_layer_norm.weight: + device: cuda:0 + max: '2.376e-02' + mean: '7.015e-06' + min: '-1.133e-02' + shape: + - 1024 + sum: '7.184e-03' +grads.network.model.decoder.layers.21.self_attn.k_proj.bias: + device: cuda:0 + max: '4.002e-10' + mean: '-1.572e-12' + min: '-3.638e-10' + shape: + - 1024 + sum: '-1.61e-09' +grads.network.model.decoder.layers.21.self_attn.k_proj.weight: + device: cuda:0 + max: '2.533e-02' + mean: '2.293e-13' + min: '-3.203e-02' + shape: + - 1024 + - 1024 + sum: '2.405e-07' +grads.network.model.decoder.layers.21.self_attn.out_proj.bias: + device: cuda:0 + max: '1.854e-02' + mean: '0.e+00' + min: '-1.843e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.21.self_attn.out_proj.weight: + device: cuda:0 + max: '1.236e-02' + mean: '1.137e-13' + min: '-1.02e-02' + shape: + - 1024 + - 1024 + sum: '1.192e-07' +grads.network.model.decoder.layers.21.self_attn.q_proj.bias: + device: cuda:0 + max: '1.768e-03' + mean: '1.468e-05' + min: '-1.166e-03' + shape: + - 1024 + sum: '1.503e-02' +grads.network.model.decoder.layers.21.self_attn.q_proj.weight: + device: cuda:0 + max: '1.766e-02' + mean: '-1.343e-07' + min: '-2.628e-02' + shape: + - 1024 + - 1024 + sum: 
'-1.408e-01' +grads.network.model.decoder.layers.21.self_attn.v_proj.bias: + device: cuda:0 + max: '1.447e-02' + mean: '1.302e-05' + min: '-1.778e-02' + shape: + - 1024 + sum: '1.333e-02' +grads.network.model.decoder.layers.21.self_attn.v_proj.weight: + device: cuda:0 + max: '4.942e-01' + mean: '-1.191e-07' + min: '-4.252e-01' + shape: + - 1024 + - 1024 + sum: '-1.249e-01' +grads.network.model.decoder.layers.21.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.995e-02' + mean: '1.246e-05' + min: '-1.996e-02' + shape: + - 1024 + sum: '1.276e-02' +grads.network.model.decoder.layers.21.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.301e-02' + mean: '1.724e-06' + min: '-1.395e-02' + shape: + - 1024 + sum: '1.766e-03' +grads.network.model.decoder.layers.22.fc1.bias: + device: cuda:0 + max: '1.418e-02' + mean: '1.925e-05' + min: '-3.796e-02' + shape: + - 4096 + sum: '7.886e-02' +grads.network.model.decoder.layers.22.fc1.weight: + device: cuda:0 + max: '4.455e-01' + mean: '1.533e-08' + min: '-3.281e-01' + shape: + - 4096 + - 1024 + sum: '6.429e-02' +grads.network.model.decoder.layers.22.fc2.bias: + device: cuda:0 + max: '2.107e-02' + mean: '-2.183e-11' + min: '-1.798e-02' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.22.fc2.weight: + device: cuda:0 + max: '3.631e-02' + mean: '-1.137e-13' + min: '-5.145e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.22.final_layer_norm.bias: + device: cuda:0 + max: '2.261e-02' + mean: '-3.098e-04' + min: '-1.996e-02' + shape: + - 1024 + sum: '-3.173e-01' +grads.network.model.decoder.layers.22.final_layer_norm.weight: + device: cuda:0 + max: '1.112e-01' + mean: '1.792e-05' + min: '-7.273e-03' + shape: + - 1024 + sum: '1.835e-02' +grads.network.model.decoder.layers.22.self_attn.k_proj.bias: + device: cuda:0 + max: '2.838e-10' + mean: '1.338e-12' + min: '-2.328e-10' + shape: + - 1024 + sum: '1.37e-09' +grads.network.model.decoder.layers.22.self_attn.k_proj.weight: + device: cuda:0 + max: '1.521e-02' + mean: '-6.001e-14' + min: '-1.506e-02' + shape: + - 1024 + - 1024 + sum: '-6.292e-08' +grads.network.model.decoder.layers.22.self_attn.out_proj.bias: + device: cuda:0 + max: '1.797e-02' + mean: '2.910e-11' + min: '-1.645e-02' + shape: + - 1024 + sum: '2.980e-08' +grads.network.model.decoder.layers.22.self_attn.out_proj.weight: + device: cuda:0 + max: '1.489e-02' + mean: '-2.132e-13' + min: '-1.383e-02' + shape: + - 1024 + - 1024 + sum: '-2.235e-07' +grads.network.model.decoder.layers.22.self_attn.q_proj.bias: + device: cuda:0 + max: '1.432e-03' + mean: '-1.077e-05' + min: '-1.380e-03' + shape: + - 1024 + sum: '-1.103e-02' +grads.network.model.decoder.layers.22.self_attn.q_proj.weight: + device: cuda:0 + max: '1.757e-02' + mean: '6.216e-08' + min: '-1.876e-02' + shape: + - 1024 + - 1024 + sum: '6.518e-02' +grads.network.model.decoder.layers.22.self_attn.v_proj.bias: + device: cuda:0 + max: '1.04e-02' + mean: '9.040e-05' + min: '-1.207e-02' + shape: + - 1024 + sum: '9.257e-02' +grads.network.model.decoder.layers.22.self_attn.v_proj.weight: + device: cuda:0 + max: '3.492e-01' + mean: '-5.219e-07' + min: '-2.943e-01' + shape: + - 1024 + - 1024 + sum: '-5.472e-01' +grads.network.model.decoder.layers.22.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.879e-02' + mean: '-5.430e-05' + min: '-1.734e-02' + shape: + - 1024 + sum: '-5.561e-02' +grads.network.model.decoder.layers.22.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.860e-02' + mean: '-1.348e-05' + min: '-3.154e-02' + shape: + 
- 1024 + sum: '-1.380e-02' +grads.network.model.decoder.layers.23.fc1.bias: + device: cuda:0 + max: '1.947e-02' + mean: '2.517e-05' + min: '-1.008e-02' + shape: + - 4096 + sum: '1.031e-01' +grads.network.model.decoder.layers.23.fc1.weight: + device: cuda:0 + max: '1.458e-01' + mean: '4.279e-08' + min: '-2.653e-01' + shape: + - 4096 + - 1024 + sum: '1.795e-01' +grads.network.model.decoder.layers.23.fc2.bias: + device: cuda:0 + max: '9.512e-03' + mean: '1.819e-12' + min: '-9.348e-03' + shape: + - 1024 + sum: '1.863e-09' +grads.network.model.decoder.layers.23.fc2.weight: + device: cuda:0 + max: '2.092e-02' + mean: '-4.547e-13' + min: '-1.892e-02' + shape: + - 1024 + - 4096 + sum: '-1.907e-06' +grads.network.model.decoder.layers.23.final_layer_norm.bias: + device: cuda:0 + max: '1.005e-02' + mean: '-9.368e-05' + min: '-9.654e-03' + shape: + - 1024 + sum: '-9.593e-02' +grads.network.model.decoder.layers.23.final_layer_norm.weight: + device: cuda:0 + max: '9.125e-03' + mean: '2.809e-04' + min: '-8.498e-03' + shape: + - 1024 + sum: '2.876e-01' +grads.network.model.decoder.layers.23.self_attn.k_proj.bias: + device: cuda:0 + max: '1.048e-09' + mean: '-2.047e-13' + min: '-1.513e-09' + shape: + - 1024 + sum: '-2.096e-10' +grads.network.model.decoder.layers.23.self_attn.k_proj.weight: + device: cuda:0 + max: '7.757e-02' + mean: '-1.006e-13' + min: '-1.167e-01' + shape: + - 1024 + - 1024 + sum: '-1.055e-07' +grads.network.model.decoder.layers.23.self_attn.out_proj.bias: + device: cuda:0 + max: '9.025e-03' + mean: '-5.457e-12' + min: '-8.085e-03' + shape: + - 1024 + sum: '-5.588e-09' +grads.network.model.decoder.layers.23.self_attn.out_proj.weight: + device: cuda:0 + max: '4.444e-03' + mean: '-6.395e-14' + min: '-4.31e-03' + shape: + - 1024 + - 1024 + sum: '-6.706e-08' +grads.network.model.decoder.layers.23.self_attn.q_proj.bias: + device: cuda:0 + max: '6.065e-03' + mean: '3.442e-05' + min: '-5.142e-03' + shape: + - 1024 + sum: '3.525e-02' +grads.network.model.decoder.layers.23.self_attn.q_proj.weight: + device: cuda:0 + max: '7.615e-02' + mean: '-1.647e-07' + min: '-8.673e-02' + shape: + - 1024 + - 1024 + sum: '-1.727e-01' +grads.network.model.decoder.layers.23.self_attn.v_proj.bias: + device: cuda:0 + max: '1.326e-02' + mean: '-5.18e-05' + min: '-1.957e-02' + shape: + - 1024 + sum: '-5.304e-02' +grads.network.model.decoder.layers.23.self_attn.v_proj.weight: + device: cuda:0 + max: '5.156e-01' + mean: '2.478e-07' + min: '-3.333e-01' + shape: + - 1024 + - 1024 + sum: '2.599e-01' +grads.network.model.decoder.layers.23.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.140e-03' + mean: '1.168e-04' + min: '-7.772e-03' + shape: + - 1024 + sum: '1.196e-01' +grads.network.model.decoder.layers.23.self_attn_layer_norm.weight: + device: cuda:0 + max: '5.779e-03' + mean: '4.173e-06' + min: '-1.385e-02' + shape: + - 1024 + sum: '4.273e-03' +grads.network.model.decoder.layers.3.fc1.bias: + device: cuda:0 + max: '5.954e-03' + mean: '1.316e-05' + min: '-8.344e-03' + shape: + - 4096 + sum: '5.389e-02' +grads.network.model.decoder.layers.3.fc1.weight: + device: cuda:0 + max: '1.064e-01' + mean: '-6.116e-09' + min: '-9.593e-02' + shape: + - 4096 + - 1024 + sum: '-2.565e-02' +grads.network.model.decoder.layers.3.fc2.bias: + device: cuda:0 + max: '8.140e-03' + mean: '-3.638e-12' + min: '-1.140e-02' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.3.fc2.weight: + device: cuda:0 + max: '1.384e-02' + mean: '4.547e-13' + min: '-1.706e-02' + shape: + - 1024 + - 4096 + sum: '1.907e-06' 
+grads.network.model.decoder.layers.3.final_layer_norm.bias: + device: cuda:0 + max: '9.449e-03' + mean: '2.546e-05' + min: '-1.205e-02' + shape: + - 1024 + sum: '2.607e-02' +grads.network.model.decoder.layers.3.final_layer_norm.weight: + device: cuda:0 + max: '2.066e-02' + mean: '-4.079e-05' + min: '-3.198e-02' + shape: + - 1024 + sum: '-4.177e-02' +grads.network.model.decoder.layers.3.self_attn.k_proj.bias: + device: cuda:0 + max: '3.056e-10' + mean: '-1.023e-12' + min: '-2.983e-10' + shape: + - 1024 + sum: '-1.047e-09' +grads.network.model.decoder.layers.3.self_attn.k_proj.weight: + device: cuda:0 + max: '1.167e-02' + mean: '-1.421e-14' + min: '-1.363e-02' + shape: + - 1024 + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.3.self_attn.out_proj.bias: + device: cuda:0 + max: '7.554e-03' + mean: '1.819e-11' + min: '-1.130e-02' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.3.self_attn.out_proj.weight: + device: cuda:0 + max: '1.395e-02' + mean: '7.105e-14' + min: '-9.944e-03' + shape: + - 1024 + - 1024 + sum: '7.451e-08' +grads.network.model.decoder.layers.3.self_attn.q_proj.bias: + device: cuda:0 + max: '1.262e-03' + mean: '1.523e-05' + min: '-1.661e-03' + shape: + - 1024 + sum: '1.560e-02' +grads.network.model.decoder.layers.3.self_attn.q_proj.weight: + device: cuda:0 + max: '1.264e-02' + mean: '1.393e-07' + min: '-1.569e-02' + shape: + - 1024 + - 1024 + sum: '1.461e-01' +grads.network.model.decoder.layers.3.self_attn.v_proj.bias: + device: cuda:0 + max: '6.315e-03' + mean: '3.350e-05' + min: '-1.044e-02' + shape: + - 1024 + sum: '3.431e-02' +grads.network.model.decoder.layers.3.self_attn.v_proj.weight: + device: cuda:0 + max: '1.511e-01' + mean: '3.064e-07' + min: '-1.489e-01' + shape: + - 1024 + - 1024 + sum: '3.212e-01' +grads.network.model.decoder.layers.3.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.629e-03' + mean: '2.019e-05' + min: '-1.149e-02' + shape: + - 1024 + sum: '2.068e-02' +grads.network.model.decoder.layers.3.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.384e-02' + mean: '1.535e-06' + min: '-3.271e-02' + shape: + - 1024 + sum: '1.572e-03' +grads.network.model.decoder.layers.4.fc1.bias: + device: cuda:0 + max: '8.716e-03' + mean: '-6.134e-06' + min: '-3.885e-03' + shape: + - 4096 + sum: '-2.513e-02' +grads.network.model.decoder.layers.4.fc1.weight: + device: cuda:0 + max: '9.354e-02' + mean: '-1.18e-09' + min: '-1.037e-01' + shape: + - 4096 + - 1024 + sum: '-4.948e-03' +grads.network.model.decoder.layers.4.fc2.bias: + device: cuda:0 + max: '7.127e-03' + mean: '-1.455e-11' + min: '-8.873e-03' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.4.fc2.weight: + device: cuda:0 + max: '1.011e-02' + mean: '-2.274e-13' + min: '-1.157e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.4.final_layer_norm.bias: + device: cuda:0 + max: '7.855e-03' + mean: '-2.88e-05' + min: '-9.680e-03' + shape: + - 1024 + sum: '-2.949e-02' +grads.network.model.decoder.layers.4.final_layer_norm.weight: + device: cuda:0 + max: '1.503e-02' + mean: '1.502e-06' + min: '-1.015e-02' + shape: + - 1024 + sum: '1.538e-03' +grads.network.model.decoder.layers.4.self_attn.k_proj.bias: + device: cuda:0 + max: '4.511e-10' + mean: '-4.124e-12' + min: '-2.838e-10' + shape: + - 1024 + sum: '-4.223e-09' +grads.network.model.decoder.layers.4.self_attn.k_proj.weight: + device: cuda:0 + max: '2.309e-02' + mean: '-2.882e-13' + min: '-2.746e-02' + shape: + - 1024 + - 1024 + sum: '-3.022e-07' 
+grads.network.model.decoder.layers.4.self_attn.out_proj.bias: + device: cuda:0 + max: '7.763e-03' + mean: '-7.276e-12' + min: '-1.027e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.4.self_attn.out_proj.weight: + device: cuda:0 + max: '1.258e-02' + mean: '-5.684e-14' + min: '-8.443e-03' + shape: + - 1024 + - 1024 + sum: '-5.960e-08' +grads.network.model.decoder.layers.4.self_attn.q_proj.bias: + device: cuda:0 + max: '1.406e-03' + mean: '8.718e-06' + min: '-1.263e-03' + shape: + - 1024 + sum: '8.927e-03' +grads.network.model.decoder.layers.4.self_attn.q_proj.weight: + device: cuda:0 + max: '1.614e-02' + mean: '5.714e-08' + min: '-1.253e-02' + shape: + - 1024 + - 1024 + sum: '5.992e-02' +grads.network.model.decoder.layers.4.self_attn.v_proj.bias: + device: cuda:0 + max: '7.103e-03' + mean: '4.113e-05' + min: '-7.943e-03' + shape: + - 1024 + sum: '4.212e-02' +grads.network.model.decoder.layers.4.self_attn.v_proj.weight: + device: cuda:0 + max: '1.551e-01' + mean: '2.696e-07' + min: '-1.392e-01' + shape: + - 1024 + - 1024 + sum: '2.827e-01' +grads.network.model.decoder.layers.4.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.028e-03' + mean: '7.166e-06' + min: '-1.046e-02' + shape: + - 1024 + sum: '7.338e-03' +grads.network.model.decoder.layers.4.self_attn_layer_norm.weight: + device: cuda:0 + max: '8.643e-03' + mean: '-1.091e-05' + min: '-2.483e-02' + shape: + - 1024 + sum: '-1.117e-02' +grads.network.model.decoder.layers.5.fc1.bias: + device: cuda:0 + max: '4.748e-03' + mean: '4.587e-06' + min: '-5.883e-03' + shape: + - 4096 + sum: '1.879e-02' +grads.network.model.decoder.layers.5.fc1.weight: + device: cuda:0 + max: '9.723e-02' + mean: '-2.199e-09' + min: '-1.125e-01' + shape: + - 4096 + - 1024 + sum: '-9.221e-03' +grads.network.model.decoder.layers.5.fc2.bias: + device: cuda:0 + max: '7.651e-03' + mean: '2.183e-11' + min: '-1.023e-02' + shape: + - 1024 + sum: '2.235e-08' +grads.network.model.decoder.layers.5.fc2.weight: + device: cuda:0 + max: '1.427e-02' + mean: '4.547e-13' + min: '-1.743e-02' + shape: + - 1024 + - 4096 + sum: '1.907e-06' +grads.network.model.decoder.layers.5.final_layer_norm.bias: + device: cuda:0 + max: '8.459e-03' + mean: '-6.824e-05' + min: '-1.104e-02' + shape: + - 1024 + sum: '-6.988e-02' +grads.network.model.decoder.layers.5.final_layer_norm.weight: + device: cuda:0 + max: '2.276e-02' + mean: '1.546e-05' + min: '-1.198e-02' + shape: + - 1024 + sum: '1.583e-02' +grads.network.model.decoder.layers.5.self_attn.k_proj.bias: + device: cuda:0 + max: '4.366e-10' + mean: '2.527e-12' + min: '-3.929e-10' + shape: + - 1024 + sum: '2.588e-09' +grads.network.model.decoder.layers.5.self_attn.k_proj.weight: + device: cuda:0 + max: '2.063e-02' + mean: '6.717e-14' + min: '-1.871e-02' + shape: + - 1024 + - 1024 + sum: '7.043e-08' +grads.network.model.decoder.layers.5.self_attn.out_proj.bias: + device: cuda:0 + max: '7.647e-03' + mean: '1.455e-11' + min: '-1.1e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.5.self_attn.out_proj.weight: + device: cuda:0 + max: '1.146e-02' + mean: '-1.137e-13' + min: '-7.558e-03' + shape: + - 1024 + - 1024 + sum: '-1.192e-07' +grads.network.model.decoder.layers.5.self_attn.q_proj.bias: + device: cuda:0 + max: '1.232e-03' + mean: '5.46e-06' + min: '-1.171e-03' + shape: + - 1024 + sum: '5.591e-03' +grads.network.model.decoder.layers.5.self_attn.q_proj.weight: + device: cuda:0 + max: '1.892e-02' + mean: '1.393e-08' + min: '-1.640e-02' + shape: + - 1024 + - 1024 + sum: '1.461e-02' 
+grads.network.model.decoder.layers.5.self_attn.v_proj.bias: + device: cuda:0 + max: '7.63e-03' + mean: '2.826e-05' + min: '-6.905e-03' + shape: + - 1024 + sum: '2.894e-02' +grads.network.model.decoder.layers.5.self_attn.v_proj.weight: + device: cuda:0 + max: '1.549e-01' + mean: '7.210e-08' + min: '-1.564e-01' + shape: + - 1024 + - 1024 + sum: '7.561e-02' +grads.network.model.decoder.layers.5.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.75e-03' + mean: '-6.064e-05' + min: '-1.140e-02' + shape: + - 1024 + sum: '-6.21e-02' +grads.network.model.decoder.layers.5.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.310e-02' + mean: '-7.533e-06' + min: '-1.207e-02' + shape: + - 1024 + sum: '-7.714e-03' +grads.network.model.decoder.layers.6.fc1.bias: + device: cuda:0 + max: '8.689e-03' + mean: '-1.853e-05' + min: '-5.812e-03' + shape: + - 4096 + sum: '-7.588e-02' +grads.network.model.decoder.layers.6.fc1.weight: + device: cuda:0 + max: '1.247e-01' + mean: '2.587e-11' + min: '-1.671e-01' + shape: + - 4096 + - 1024 + sum: '1.085e-04' +grads.network.model.decoder.layers.6.fc2.bias: + device: cuda:0 + max: '8.694e-03' + mean: '-3.638e-12' + min: '-8.964e-03' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.6.fc2.weight: + device: cuda:0 + max: '2.818e-02' + mean: '-1.99e-13' + min: '-2.423e-02' + shape: + - 1024 + - 4096 + sum: '-8.345e-07' +grads.network.model.decoder.layers.6.final_layer_norm.bias: + device: cuda:0 + max: '9.466e-03' + mean: '1.768e-05' + min: '-9.583e-03' + shape: + - 1024 + sum: '1.811e-02' +grads.network.model.decoder.layers.6.final_layer_norm.weight: + device: cuda:0 + max: '3.202e-02' + mean: '1.739e-05' + min: '-1.373e-02' + shape: + - 1024 + sum: '1.780e-02' +grads.network.model.decoder.layers.6.self_attn.k_proj.bias: + device: cuda:0 + max: '1.048e-09' + mean: '2.847e-12' + min: '-5.821e-10' + shape: + - 1024 + sum: '2.915e-09' +grads.network.model.decoder.layers.6.self_attn.k_proj.weight: + device: cuda:0 + max: '7.468e-02' + mean: '3.264e-14' + min: '-7.459e-02' + shape: + - 1024 + - 1024 + sum: '3.423e-08' +grads.network.model.decoder.layers.6.self_attn.out_proj.bias: + device: cuda:0 + max: '9.673e-03' + mean: '-7.276e-12' + min: '-9.632e-03' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.6.self_attn.out_proj.weight: + device: cuda:0 + max: '1.069e-02' + mean: '-2.558e-13' + min: '-1.237e-02' + shape: + - 1024 + - 1024 + sum: '-2.682e-07' +grads.network.model.decoder.layers.6.self_attn.q_proj.bias: + device: cuda:0 + max: '1.893e-03' + mean: '-1.271e-05' + min: '-3.243e-03' + shape: + - 1024 + sum: '-1.302e-02' +grads.network.model.decoder.layers.6.self_attn.q_proj.weight: + device: cuda:0 + max: '4.317e-02' + mean: '-5.287e-09' + min: '-5.174e-02' + shape: + - 1024 + - 1024 + sum: '-5.543e-03' +grads.network.model.decoder.layers.6.self_attn.v_proj.bias: + device: cuda:0 + max: '6.756e-03' + mean: '8.55e-05' + min: '-5.219e-03' + shape: + - 1024 + sum: '8.755e-02' +grads.network.model.decoder.layers.6.self_attn.v_proj.weight: + device: cuda:0 + max: '1.221e-01' + mean: '3.555e-08' + min: '-1.883e-01' + shape: + - 1024 + - 1024 + sum: '3.728e-02' +grads.network.model.decoder.layers.6.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.004e-02' + mean: '2.542e-06' + min: '-9.872e-03' + shape: + - 1024 + sum: '2.603e-03' +grads.network.model.decoder.layers.6.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.376e-02' + mean: '-1.475e-05' + min: '-1.311e-02' + shape: + - 1024 + sum: '-1.511e-02' 
+grads.network.model.decoder.layers.7.fc1.bias: + device: cuda:0 + max: '1.040e-02' + mean: '-1.111e-05' + min: '-5.846e-03' + shape: + - 4096 + sum: '-4.551e-02' +grads.network.model.decoder.layers.7.fc1.weight: + device: cuda:0 + max: '1.282e-01' + mean: '-2.034e-09' + min: '-2.541e-01' + shape: + - 4096 + - 1024 + sum: '-8.530e-03' +grads.network.model.decoder.layers.7.fc2.bias: + device: cuda:0 + max: '8.647e-03' + mean: '-1.819e-12' + min: '-1.108e-02' + shape: + - 1024 + sum: '-1.863e-09' +grads.network.model.decoder.layers.7.fc2.weight: + device: cuda:0 + max: '2.036e-02' + mean: '-2.274e-13' + min: '-2.125e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.7.final_layer_norm.bias: + device: cuda:0 + max: '9.436e-03' + mean: '1.051e-04' + min: '-1.201e-02' + shape: + - 1024 + sum: '1.076e-01' +grads.network.model.decoder.layers.7.final_layer_norm.weight: + device: cuda:0 + max: '2.502e-02' + mean: '-2.608e-06' + min: '-1.341e-02' + shape: + - 1024 + sum: '-2.670e-03' +grads.network.model.decoder.layers.7.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '1.863e-13' + min: '-3.492e-10' + shape: + - 1024 + sum: '1.908e-10' +grads.network.model.decoder.layers.7.self_attn.k_proj.weight: + device: cuda:0 + max: '3.309e-02' + mean: '6.817e-14' + min: '-4.19e-02' + shape: + - 1024 + - 1024 + sum: '7.148e-08' +grads.network.model.decoder.layers.7.self_attn.out_proj.bias: + device: cuda:0 + max: '7.477e-03' + mean: '-5.457e-12' + min: '-9.228e-03' + shape: + - 1024 + sum: '-5.588e-09' +grads.network.model.decoder.layers.7.self_attn.out_proj.weight: + device: cuda:0 + max: '1.003e-02' + mean: '-1.563e-13' + min: '-7.771e-03' + shape: + - 1024 + - 1024 + sum: '-1.639e-07' +grads.network.model.decoder.layers.7.self_attn.q_proj.bias: + device: cuda:0 + max: '2.209e-03' + mean: '-4.411e-06' + min: '-1.604e-03' + shape: + - 1024 + sum: '-4.517e-03' +grads.network.model.decoder.layers.7.self_attn.q_proj.weight: + device: cuda:0 + max: '3.379e-02' + mean: '5.986e-10' + min: '-2.946e-02' + shape: + - 1024 + - 1024 + sum: '6.277e-04' +grads.network.model.decoder.layers.7.self_attn.v_proj.bias: + device: cuda:0 + max: '6.926e-03' + mean: '5.966e-05' + min: '-6.282e-03' + shape: + - 1024 + sum: '6.109e-02' +grads.network.model.decoder.layers.7.self_attn.v_proj.weight: + device: cuda:0 + max: '1.424e-01' + mean: '-8.094e-09' + min: '-1.385e-01' + shape: + - 1024 + - 1024 + sum: '-8.487e-03' +grads.network.model.decoder.layers.7.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.795e-03' + mean: '8.083e-05' + min: '-9.428e-03' + shape: + - 1024 + sum: '8.277e-02' +grads.network.model.decoder.layers.7.self_attn_layer_norm.weight: + device: cuda:0 + max: '3.435e-02' + mean: '-2.633e-06' + min: '-1.194e-02' + shape: + - 1024 + sum: '-2.696e-03' +grads.network.model.decoder.layers.8.fc1.bias: + device: cuda:0 + max: '9.447e-03' + mean: '-1.000e-05' + min: '-1.029e-02' + shape: + - 4096 + sum: '-4.096e-02' +grads.network.model.decoder.layers.8.fc1.weight: + device: cuda:0 + max: '1.788e-01' + mean: '-1.028e-08' + min: '-1.565e-01' + shape: + - 4096 + - 1024 + sum: '-4.31e-02' +grads.network.model.decoder.layers.8.fc2.bias: + device: cuda:0 + max: '9.312e-03' + mean: '1.819e-11' + min: '-9.654e-03' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.8.fc2.weight: + device: cuda:0 + max: '2.393e-02' + mean: '6.821e-13' + min: '-1.897e-02' + shape: + - 1024 + - 4096 + sum: '2.861e-06' 
+grads.network.model.decoder.layers.8.final_layer_norm.bias: + device: cuda:0 + max: '1.033e-02' + mean: '-9.404e-05' + min: '-1.074e-02' + shape: + - 1024 + sum: '-9.63e-02' +grads.network.model.decoder.layers.8.final_layer_norm.weight: + device: cuda:0 + max: '8.312e-03' + mean: '-3.398e-05' + min: '-2.52e-02' + shape: + - 1024 + sum: '-3.479e-02' +grads.network.model.decoder.layers.8.self_attn.k_proj.bias: + device: cuda:0 + max: '4.657e-10' + mean: '1.157e-12' + min: '-7.567e-10' + shape: + - 1024 + sum: '1.185e-09' +grads.network.model.decoder.layers.8.self_attn.k_proj.weight: + device: cuda:0 + max: '2.660e-02' + mean: '-1.255e-14' + min: '-2.215e-02' + shape: + - 1024 + - 1024 + sum: '-1.315e-08' +grads.network.model.decoder.layers.8.self_attn.out_proj.bias: + device: cuda:0 + max: '8.574e-03' + mean: '-1.091e-11' + min: '-1.133e-02' + shape: + - 1024 + sum: '-1.118e-08' +grads.network.model.decoder.layers.8.self_attn.out_proj.weight: + device: cuda:0 + max: '5.791e-03' + mean: '1.776e-13' + min: '-7.842e-03' + shape: + - 1024 + - 1024 + sum: '1.863e-07' +grads.network.model.decoder.layers.8.self_attn.q_proj.bias: + device: cuda:0 + max: '2.176e-03' + mean: '1.136e-05' + min: '-1.464e-03' + shape: + - 1024 + sum: '1.164e-02' +grads.network.model.decoder.layers.8.self_attn.q_proj.weight: + device: cuda:0 + max: '2.919e-02' + mean: '-1.766e-08' + min: '-3.662e-02' + shape: + - 1024 + - 1024 + sum: '-1.852e-02' +grads.network.model.decoder.layers.8.self_attn.v_proj.bias: + device: cuda:0 + max: '7.759e-03' + mean: '5.574e-05' + min: '-1.002e-02' + shape: + - 1024 + sum: '5.708e-02' +grads.network.model.decoder.layers.8.self_attn.v_proj.weight: + device: cuda:0 + max: '2.583e-01' + mean: '-8.663e-08' + min: '-1.763e-01' + shape: + - 1024 + - 1024 + sum: '-9.083e-02' +grads.network.model.decoder.layers.8.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.934e-03' + mean: '3.720e-05' + min: '-1.170e-02' + shape: + - 1024 + sum: '3.81e-02' +grads.network.model.decoder.layers.8.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.159e-02' + mean: '-3.363e-06' + min: '-1.334e-02' + shape: + - 1024 + sum: '-3.444e-03' +grads.network.model.decoder.layers.9.fc1.bias: + device: cuda:0 + max: '1.084e-02' + mean: '-1.724e-05' + min: '-8.211e-03' + shape: + - 4096 + sum: '-7.062e-02' +grads.network.model.decoder.layers.9.fc1.weight: + device: cuda:0 + max: '1.987e-01' + mean: '-1.661e-08' + min: '-2.721e-01' + shape: + - 4096 + - 1024 + sum: '-6.966e-02' +grads.network.model.decoder.layers.9.fc2.bias: + device: cuda:0 + max: '1.032e-02' + mean: '-7.276e-12' + min: '-1.013e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.9.fc2.weight: + device: cuda:0 + max: '2.487e-02' + mean: '-5.684e-13' + min: '-2.754e-02' + shape: + - 1024 + - 4096 + sum: '-2.384e-06' +grads.network.model.decoder.layers.9.final_layer_norm.bias: + device: cuda:0 + max: '1.148e-02' + mean: '-7.486e-05' + min: '-1.105e-02' + shape: + - 1024 + sum: '-7.665e-02' +grads.network.model.decoder.layers.9.final_layer_norm.weight: + device: cuda:0 + max: '5.081e-02' + mean: '3.829e-06' + min: '-1.181e-02' + shape: + - 1024 + sum: '3.921e-03' +grads.network.model.decoder.layers.9.self_attn.k_proj.bias: + device: cuda:0 + max: '1.397e-09' + mean: '-3.783e-12' + min: '-2.095e-09' + shape: + - 1024 + sum: '-3.874e-09' +grads.network.model.decoder.layers.9.self_attn.k_proj.weight: + device: cuda:0 + max: '1.288e-01' + mean: '2.314e-13' + min: '-1.159e-01' + shape: + - 1024 + - 1024 + sum: '2.427e-07' 
+grads.network.model.decoder.layers.9.self_attn.out_proj.bias: + device: cuda:0 + max: '9.677e-03' + mean: '-2.183e-11' + min: '-9.679e-03' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.9.self_attn.out_proj.weight: + device: cuda:0 + max: '8.051e-03' + mean: '2.558e-13' + min: '-8.809e-03' + shape: + - 1024 + - 1024 + sum: '2.682e-07' +grads.network.model.decoder.layers.9.self_attn.q_proj.bias: + device: cuda:0 + max: '3.228e-03' + mean: '-6.335e-06' + min: '-4.683e-03' + shape: + - 1024 + sum: '-6.487e-03' +grads.network.model.decoder.layers.9.self_attn.q_proj.weight: + device: cuda:0 + max: '8.449e-02' + mean: '2.055e-08' + min: '-6.571e-02' + shape: + - 1024 + - 1024 + sum: '2.155e-02' +grads.network.model.decoder.layers.9.self_attn.v_proj.bias: + device: cuda:0 + max: '1.115e-02' + mean: '-3.493e-05' + min: '-9.448e-03' + shape: + - 1024 + sum: '-3.577e-02' +grads.network.model.decoder.layers.9.self_attn.v_proj.weight: + device: cuda:0 + max: '2.284e-01' + mean: '1.133e-07' + min: '-2.614e-01' + shape: + - 1024 + - 1024 + sum: '1.188e-01' +grads.network.model.decoder.layers.9.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.015e-02' + mean: '4.447e-05' + min: '-1.010e-02' + shape: + - 1024 + sum: '4.553e-02' +grads.network.model.decoder.layers.9.self_attn_layer_norm.weight: + device: cuda:0 + max: '9.655e-03' + mean: '2.292e-06' + min: '-2.027e-02' + shape: + - 1024 + sum: '2.347e-03' +grads.network.model.decoder.project_in.weight: + device: cuda:0 + max: '2.645e-02' + mean: '-3.396e-07' + min: '-2.839e-02' + shape: + - 1024 + - 512 + sum: '-1.780e-01' +grads.network.model.decoder.project_out.weight: + device: cuda:0 + max: '9.968e-02' + mean: '-3.139e-07' + min: '-1.016e-01' + shape: + - 512 + - 1024 + sum: '-1.646e-01' +outputs.loss: + device: cuda:0 + max: '4.05e+00' + mean: '4.05e+00' + min: '4.05e+00' + shape: [] + sum: '4.05e+00' diff --git a/.regression_files/project/algorithms/llm_finetuning_test/test_forward_pass_is_reproducible/cuda/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_forward_pass_is_reproducible/cuda/llm_finetuning.yaml new file mode 100644 index 00000000..41f33102 --- /dev/null +++ b/.regression_files/project/algorithms/llm_finetuning_test/test_forward_pass_is_reproducible/cuda/llm_finetuning.yaml @@ -0,0 +1,572 @@ +input.attention_mask: + device: cuda:0 + max: 1 + mean: '1.e+00' + min: 1 + shape: + - 8 + - 256 + sum: 2048 +input.input_ids: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +input.labels: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +out.logits: + device: cuda:0 + max: '3.537e+01' + mean: '-4.715e+00' + min: '-3.336e+01' + shape: + - 8 + - 256 + - 50272 + sum: '-4.855e+08' +out.loss: + device: cuda:0 + max: '4.05e+00' + mean: '4.05e+00' + min: '4.05e+00' + shape: [] + sum: '4.05e+00' +out.past_key_values.0.0: + device: cuda:0 + max: '1.824e+00' + mean: '-3.677e-03' + min: '-2.004e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-7.711e+03' +out.past_key_values.0.1: + device: cuda:0 + max: '1.91e-01' + mean: '6.668e-05' + min: '-1.719e-01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.398e+02' +out.past_key_values.1.0: + device: cuda:0 + max: '1.150e+01' + mean: '5.521e-03' + min: '-1.144e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.158e+04' +out.past_key_values.1.1: + device: cuda:0 + max: '4.35e+00' + mean: '2.593e-03' + min: '-4.527e+00' + shape: + - 8 + - 16 + - 256 + 
- 64 + sum: '5.439e+03' +out.past_key_values.10.0: + device: cuda:0 + max: '9.741e+00' + mean: '5.765e-02' + min: '-1.030e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.209e+05' +out.past_key_values.10.1: + device: cuda:0 + max: '5.526e+00' + mean: '1.023e-02' + min: '-5.248e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.145e+04' +out.past_key_values.11.0: + device: cuda:0 + max: '9.2e+00' + mean: '4.524e-02' + min: '-8.32e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '9.488e+04' +out.past_key_values.11.1: + device: cuda:0 + max: '4.676e+00' + mean: '7.994e-03' + min: '-4.337e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.676e+04' +out.past_key_values.12.0: + device: cuda:0 + max: '8.099e+00' + mean: '-4.339e-03' + min: '-8.358e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-9.101e+03' +out.past_key_values.12.1: + device: cuda:0 + max: '5.357e+00' + mean: '7.804e-03' + min: '-5.152e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.637e+04' +out.past_key_values.13.0: + device: cuda:0 + max: '8.449e+00' + mean: '-9.491e-03' + min: '-8.29e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-1.990e+04' +out.past_key_values.13.1: + device: cuda:0 + max: '4.555e+00' + mean: '3.872e-03' + min: '-5.178e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '8.120e+03' +out.past_key_values.14.0: + device: cuda:0 + max: '7.696e+00' + mean: '-4.042e-02' + min: '-8.394e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-8.477e+04' +out.past_key_values.14.1: + device: cuda:0 + max: '5.031e+00' + mean: '3.803e-03' + min: '-5.123e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '7.976e+03' +out.past_key_values.15.0: + device: cuda:0 + max: '8.108e+00' + mean: '2.572e-02' + min: '-1.000e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '5.394e+04' +out.past_key_values.15.1: + device: cuda:0 + max: '4.85e+00' + mean: '-8.774e-03' + min: '-4.855e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-1.840e+04' +out.past_key_values.16.0: + device: cuda:0 + max: '8.927e+00' + mean: '-1.676e-02' + min: '-8.144e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-3.515e+04' +out.past_key_values.16.1: + device: cuda:0 + max: '4.793e+00' + mean: '-1.081e-02' + min: '-5.854e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-2.268e+04' +out.past_key_values.17.0: + device: cuda:0 + max: '1.004e+01' + mean: '2.810e-02' + min: '-9.726e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '5.893e+04' +out.past_key_values.17.1: + device: cuda:0 + max: '5.284e+00' + mean: '5.285e-03' + min: '-5.681e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.108e+04' +out.past_key_values.18.0: + device: cuda:0 + max: '8.982e+00' + mean: '5.052e-02' + min: '-8.762e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.059e+05' +out.past_key_values.18.1: + device: cuda:0 + max: '4.748e+00' + mean: '-1.694e-03' + min: '-4.891e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-3.554e+03' +out.past_key_values.19.0: + device: cuda:0 + max: '9.813e+00' + mean: '1.273e-02' + min: '-9.707e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.670e+04' +out.past_key_values.19.1: + device: cuda:0 + max: '4.619e+00' + mean: '-1.924e-02' + min: '-4.700e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-4.036e+04' +out.past_key_values.2.0: + device: cuda:0 + max: '1.074e+01' + mean: '6.862e-02' + min: '-1.063e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.439e+05' +out.past_key_values.2.1: + device: cuda:0 + max: '4.396e+00' + mean: '2.223e-03' + min: '-4.462e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '4.662e+03' 
+out.past_key_values.20.0: + device: cuda:0 + max: '1.106e+01' + mean: '5.73e-02' + min: '-1.099e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.202e+05' +out.past_key_values.20.1: + device: cuda:0 + max: '4.813e+00' + mean: '6.246e-03' + min: '-5.477e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.31e+04' +out.past_key_values.21.0: + device: cuda:0 + max: '1.079e+01' + mean: '4.522e-02' + min: '-1.039e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '9.484e+04' +out.past_key_values.21.1: + device: cuda:0 + max: '4.631e+00' + mean: '1.379e-02' + min: '-4.818e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.891e+04' +out.past_key_values.22.0: + device: cuda:0 + max: '1.065e+01' + mean: '4.017e-02' + min: '-1.125e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '8.425e+04' +out.past_key_values.22.1: + device: cuda:0 + max: '5.105e+00' + mean: '5.328e-03' + min: '-4.445e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.117e+04' +out.past_key_values.23.0: + device: cuda:0 + max: '9.464e+00' + mean: '1.056e-02' + min: '-8.453e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.214e+04' +out.past_key_values.23.1: + device: cuda:0 + max: '4.379e+00' + mean: '-1.464e-03' + min: '-4.951e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-3.069e+03' +out.past_key_values.3.0: + device: cuda:0 + max: '1.142e+01' + mean: '4.512e-02' + min: '-1.147e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '9.462e+04' +out.past_key_values.3.1: + device: cuda:0 + max: '4.416e+00' + mean: '-3.978e-04' + min: '-4.476e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-8.342e+02' +out.past_key_values.4.0: + device: cuda:0 + max: '1.193e+01' + mean: '-3.041e-02' + min: '-1.091e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-6.377e+04' +out.past_key_values.4.1: + device: cuda:0 + max: '4.839e+00' + mean: '-4.185e-04' + min: '-5.120e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-8.776e+02' +out.past_key_values.5.0: + device: cuda:0 + max: '1.230e+01' + mean: '4.608e-02' + min: '-1.164e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '9.664e+04' +out.past_key_values.5.1: + device: cuda:0 + max: '5.191e+00' + mean: '1.398e-03' + min: '-4.402e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '2.932e+03' +out.past_key_values.6.0: + device: cuda:0 + max: '1.248e+01' + mean: '6.588e-03' + min: '-1.322e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.382e+04' +out.past_key_values.6.1: + device: cuda:0 + max: '4.148e+00' + mean: '5.169e-03' + min: '-4.295e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.084e+04' +out.past_key_values.7.0: + device: cuda:0 + max: '1.326e+01' + mean: '-1.400e-02' + min: '-1.272e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-2.936e+04' +out.past_key_values.7.1: + device: cuda:0 + max: '4.043e+00' + mean: '5.246e-03' + min: '-3.823e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '1.100e+04' +out.past_key_values.8.0: + device: cuda:0 + max: '1.329e+01' + mean: '1.543e-02' + min: '-1.222e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '3.235e+04' +out.past_key_values.8.1: + device: cuda:0 + max: '4.179e+00' + mean: '-1.275e-03' + min: '-4.191e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-2.674e+03' +out.past_key_values.9.0: + device: cuda:0 + max: '1.514e+01' + mean: '-1.051e-01' + min: '-1.701e+01' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '-2.204e+05' +out.past_key_values.9.1: + device: cuda:0 + max: '4.456e+00' + mean: '3.825e-04' + min: '-4.440e+00' + shape: + - 8 + - 16 + - 256 + - 64 + sum: '8.022e+02' diff --git 
a/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml new file mode 100644 index 00000000..9e7c6ffb --- /dev/null +++ b/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml @@ -0,0 +1,3261 @@ +network.lm_head.weight: + device: cuda:0 + max: '2.372e-01' + mean: '-1.208e-03' + min: '-2.5e-01' + shape: + - 50272 + - 512 + sum: '-3.109e+04' +network.model.decoder.embed_positions.weight: + device: cuda:0 + max: '1.327e-01' + mean: '1.768e-05' + min: '-1.379e-01' + shape: + - 2050 + - 1024 + sum: '3.711e+01' +network.model.decoder.embed_tokens.weight: + device: cuda:0 + max: '2.372e-01' + mean: '-1.208e-03' + min: '-2.5e-01' + shape: + - 50272 + - 512 + sum: '-3.109e+04' +network.model.decoder.layers.0.fc1.bias: + device: cuda:0 + max: '1.249e-01' + mean: '-2.961e-02' + min: '-1.085e-01' + shape: + - 4096 + sum: '-1.213e+02' +network.model.decoder.layers.0.fc1.weight: + device: cuda:0 + max: '1.25e-01' + mean: '1.667e-04' + min: '-1.251e-01' + shape: + - 4096 + - 1024 + sum: '6.992e+02' +network.model.decoder.layers.0.fc2.bias: + device: cuda:0 + max: '7.88e-02' + mean: '-8.293e-05' + min: '-9.351e-02' + shape: + - 1024 + sum: '-8.492e-02' +network.model.decoder.layers.0.fc2.weight: + device: cuda:0 + max: '1.331e-01' + mean: '5.357e-06' + min: '-1.448e-01' + shape: + - 1024 + - 4096 + sum: '2.247e+01' +network.model.decoder.layers.0.final_layer_norm.bias: + device: cuda:0 + max: '1.256e-01' + mean: '7.015e-03' + min: '-1.204e-01' + shape: + - 1024 + sum: '7.183e+00' +network.model.decoder.layers.0.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.0.self_attn.k_proj.bias: + device: cuda:0 + max: '3.125e-02' + mean: '3.414e-04' + min: '-3.123e-02' + shape: + - 1024 + sum: '3.496e-01' +network.model.decoder.layers.0.self_attn.k_proj.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-4.626e-05' + min: '-1.256e-01' + shape: + - 1024 + - 1024 + sum: '-4.850e+01' +network.model.decoder.layers.0.self_attn.out_proj.bias: + device: cuda:0 + max: '1.579e-02' + mean: '-2.766e-05' + min: '-1.138e-02' + shape: + - 1024 + sum: '-2.833e-02' +network.model.decoder.layers.0.self_attn.out_proj.weight: + device: cuda:0 + max: '1.283e-01' + mean: '-6.181e-06' + min: '-1.295e-01' + shape: + - 1024 + - 1024 + sum: '-6.481e+00' +network.model.decoder.layers.0.self_attn.q_proj.bias: + device: cuda:0 + max: '1.282e-01' + mean: '1.180e-03' + min: '-1.271e-01' + shape: + - 1024 + sum: '1.208e+00' +network.model.decoder.layers.0.self_attn.q_proj.weight: + device: cuda:0 + max: '1.267e-01' + mean: '-5.663e-05' + min: '-1.267e-01' + shape: + - 1024 + - 1024 + sum: '-5.938e+01' +network.model.decoder.layers.0.self_attn.v_proj.bias: + device: cuda:0 + max: '2.769e-02' + mean: '-2.715e-05' + min: '-2.669e-02' + shape: + - 1024 + sum: '-2.780e-02' +network.model.decoder.layers.0.self_attn.v_proj.weight: + device: cuda:0 + max: '8.795e-02' + mean: '1.917e-06' + min: '-8.508e-02' + shape: + - 1024 + - 1024 + sum: '2.011e+00' +network.model.decoder.layers.0.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.271e-01' + mean: '-2.03e-03' + min: '-1.248e-01' + shape: + - 1024 + sum: '-2.079e+00' +network.model.decoder.layers.0.self_attn_layer_norm.weight: + device: cuda:0 + 
max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.1.fc1.bias: + device: cuda:0 + max: '1.236e-01' + mean: '-2.428e-02' + min: '-8.075e-02' + shape: + - 4096 + sum: '-9.946e+01' +network.model.decoder.layers.1.fc1.weight: + device: cuda:0 + max: '1.254e-01' + mean: '1.85e-04' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '7.759e+02' +network.model.decoder.layers.1.fc2.bias: + device: cuda:0 + max: '8.911e-02' + mean: '2.946e-04' + min: '-8.362e-02' + shape: + - 1024 + sum: '3.017e-01' +network.model.decoder.layers.1.fc2.weight: + device: cuda:0 + max: '1.321e-01' + mean: '-2.468e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.035e+01' +network.model.decoder.layers.1.final_layer_norm.bias: + device: cuda:0 + max: '1.256e-01' + mean: '8.647e-03' + min: '-1.198e-01' + shape: + - 1024 + sum: '8.855e+00' +network.model.decoder.layers.1.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.1.self_attn.k_proj.bias: + device: cuda:0 + max: '7.153e-02' + mean: '7.902e-03' + min: '-7.874e-02' + shape: + - 1024 + sum: '8.092e+00' +network.model.decoder.layers.1.self_attn.k_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.284e-05' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: '-1.346e+01' +network.model.decoder.layers.1.self_attn.out_proj.bias: + device: cuda:0 + max: '8.606e-02' + mean: '-1.118e-04' + min: '-7.031e-02' + shape: + - 1024 + sum: '-1.144e-01' +network.model.decoder.layers.1.self_attn.out_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '1.676e-06' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: '1.758e+00' +network.model.decoder.layers.1.self_attn.q_proj.bias: + device: cuda:0 + max: '1.254e-01' + mean: '-1.557e-03' + min: '-1.252e-01' + shape: + - 1024 + sum: '-1.595e+00' +network.model.decoder.layers.1.self_attn.q_proj.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-3.561e-05' + min: '-1.26e-01' + shape: + - 1024 + - 1024 + sum: '-3.734e+01' +network.model.decoder.layers.1.self_attn.v_proj.bias: + device: cuda:0 + max: '5.002e-02' + mean: '3.967e-04' + min: '-4.831e-02' + shape: + - 1024 + sum: '4.062e-01' +network.model.decoder.layers.1.self_attn.v_proj.weight: + device: cuda:0 + max: '1.092e-01' + mean: '1.417e-05' + min: '-1.07e-01' + shape: + - 1024 + - 1024 + sum: '1.486e+01' +network.model.decoder.layers.1.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.304e-01' + mean: '-2.029e-03' + min: '-1.248e-01' + shape: + - 1024 + sum: '-2.078e+00' +network.model.decoder.layers.1.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.10.fc1.bias: + device: cuda:0 + max: '5.505e-02' + mean: '-2.099e-02' + min: '-8.49e-02' + shape: + - 4096 + sum: '-8.599e+01' +network.model.decoder.layers.10.fc1.weight: + device: cuda:0 + max: '1.27e-01' + mean: '1.603e-05' + min: '-1.296e-01' + shape: + - 4096 + - 1024 + sum: '6.723e+01' +network.model.decoder.layers.10.fc2.bias: + device: cuda:0 + max: '6.293e-02' + mean: '-1.937e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.983e-01' +network.model.decoder.layers.10.fc2.weight: + device: cuda:0 + max: '1.281e-01' + mean: '-1.624e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-6.81e+00' +network.model.decoder.layers.10.final_layer_norm.bias: + device: cuda:0 + max: '8.020e-02' + mean: '-9.374e-03' + 
min: '-1.25e-01' + shape: + - 1024 + sum: '-9.599e+00' +network.model.decoder.layers.10.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.10.self_attn.k_proj.bias: + device: cuda:0 + max: '7.422e-02' + mean: '7.871e-03' + min: '-7.428e-02' + shape: + - 1024 + sum: '8.06e+00' +network.model.decoder.layers.10.self_attn.k_proj.weight: + device: cuda:0 + max: '1.318e-01' + mean: '-1.478e-05' + min: '-1.285e-01' + shape: + - 1024 + - 1024 + sum: '-1.55e+01' +network.model.decoder.layers.10.self_attn.out_proj.bias: + device: cuda:0 + max: '7.031e-02' + mean: '-2.308e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.363e-02' +network.model.decoder.layers.10.self_attn.out_proj.weight: + device: cuda:0 + max: '1.321e-01' + mean: '1.384e-06' + min: '-1.316e-01' + shape: + - 1024 + - 1024 + sum: '1.452e+00' +network.model.decoder.layers.10.self_attn.q_proj.bias: + device: cuda:0 + max: '1.089e-01' + mean: '-1.708e-03' + min: '-1.009e-01' + shape: + - 1024 + sum: '-1.749e+00' +network.model.decoder.layers.10.self_attn.q_proj.weight: + device: cuda:0 + max: '1.300e-01' + mean: '5.200e-06' + min: '-1.311e-01' + shape: + - 1024 + - 1024 + sum: '5.453e+00' +network.model.decoder.layers.10.self_attn.v_proj.bias: + device: cuda:0 + max: '5.096e-02' + mean: '3.204e-04' + min: '-5.444e-02' + shape: + - 1024 + sum: '3.281e-01' +network.model.decoder.layers.10.self_attn.v_proj.weight: + device: cuda:0 + max: '1.241e-01' + mean: '1.173e-05' + min: '-1.152e-01' + shape: + - 1024 + - 1024 + sum: '1.229e+01' +network.model.decoder.layers.10.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.594e-02' + mean: '1.188e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.217e+00' +network.model.decoder.layers.10.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.11.fc1.bias: + device: cuda:0 + max: '6.107e-02' + mean: '-2.344e-02' + min: '-8.850e-02' + shape: + - 4096 + sum: '-9.601e+01' +network.model.decoder.layers.11.fc1.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-1.888e-04' + min: '-1.263e-01' + shape: + - 4096 + - 1024 + sum: '-7.920e+02' +network.model.decoder.layers.11.fc2.bias: + device: cuda:0 + max: '6.47e-02' + mean: '1.148e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.176e-01' +network.model.decoder.layers.11.fc2.weight: + device: cuda:0 + max: '1.26e-01' + mean: '3.113e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '1.306e+00' +network.model.decoder.layers.11.final_layer_norm.bias: + device: cuda:0 + max: '7.886e-02' + mean: '-1.455e-02' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.489e+01' +network.model.decoder.layers.11.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.11.self_attn.k_proj.bias: + device: cuda:0 + max: '7.074e-02' + mean: '5.886e-03' + min: '-6.482e-02' + shape: + - 1024 + sum: '6.027e+00' +network.model.decoder.layers.11.self_attn.k_proj.weight: + device: cuda:0 + max: '1.331e-01' + mean: '1.017e-05' + min: '-1.31e-01' + shape: + - 1024 + - 1024 + sum: '1.066e+01' +network.model.decoder.layers.11.self_attn.out_proj.bias: + device: cuda:0 + max: '6.311e-02' + mean: '-3.316e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-3.396e-01' +network.model.decoder.layers.11.self_attn.out_proj.weight: + device: cuda:0 + max: '1.514e-01' + 
mean: '1.601e-05' + min: '-1.647e-01' + shape: + - 1024 + - 1024 + sum: '1.679e+01' +network.model.decoder.layers.11.self_attn.q_proj.bias: + device: cuda:0 + max: '1.105e-01' + mean: '-2.709e-03' + min: '-1.172e-01' + shape: + - 1024 + sum: '-2.774e+00' +network.model.decoder.layers.11.self_attn.q_proj.weight: + device: cuda:0 + max: '1.287e-01' + mean: '5.092e-06' + min: '-1.26e-01' + shape: + - 1024 + - 1024 + sum: '5.339e+00' +network.model.decoder.layers.11.self_attn.v_proj.bias: + device: cuda:0 + max: '3.922e-02' + mean: '4.083e-04' + min: '-4.712e-02' + shape: + - 1024 + sum: '4.180e-01' +network.model.decoder.layers.11.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '-8.525e-05' + min: '-1.197e-01' + shape: + - 1024 + - 1024 + sum: '-8.939e+01' +network.model.decoder.layers.11.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.046e-01' + mean: '4.110e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.209e+00' +network.model.decoder.layers.11.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.12.fc1.bias: + device: cuda:0 + max: '7.367e-02' + mean: '-2.188e-02' + min: '-7.434e-02' + shape: + - 4096 + sum: '-8.961e+01' +network.model.decoder.layers.12.fc1.weight: + device: cuda:0 + max: '1.274e-01' + mean: '-2.221e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-9.314e+02' +network.model.decoder.layers.12.fc2.bias: + device: cuda:0 + max: '7.233e-02' + mean: '-3.044e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-3.118e-01' +network.model.decoder.layers.12.fc2.weight: + device: cuda:0 + max: '1.265e-01' + mean: '1.128e-07' + min: '-1.393e-01' + shape: + - 1024 + - 4096 + sum: '4.732e-01' +network.model.decoder.layers.12.final_layer_norm.bias: + device: cuda:0 + max: '1.241e-01' + mean: '-1.53e-02' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.566e+01' +network.model.decoder.layers.12.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.12.self_attn.k_proj.bias: + device: cuda:0 + max: '1.177e-01' + mean: '6.118e-03' + min: '-8.82e-02' + shape: + - 1024 + sum: '6.265e+00' +network.model.decoder.layers.12.self_attn.k_proj.weight: + device: cuda:0 + max: '1.274e-01' + mean: '2.051e-05' + min: '-1.263e-01' + shape: + - 1024 + - 1024 + sum: '2.151e+01' +network.model.decoder.layers.12.self_attn.out_proj.bias: + device: cuda:0 + max: '6.604e-02' + mean: '-4.053e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-4.151e-01' +network.model.decoder.layers.12.self_attn.out_proj.weight: + device: cuda:0 + max: '1.273e-01' + mean: '6.458e-06' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '6.772e+00' +network.model.decoder.layers.12.self_attn.q_proj.bias: + device: cuda:0 + max: '1.249e-01' + mean: '3.377e-04' + min: '-1.248e-01' + shape: + - 1024 + sum: '3.458e-01' +network.model.decoder.layers.12.self_attn.q_proj.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-4.44e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-4.655e+01' +network.model.decoder.layers.12.self_attn.v_proj.bias: + device: cuda:0 + max: '5.71e-02' + mean: '1.127e-04' + min: '-4.361e-02' + shape: + - 1024 + sum: '1.155e-01' +network.model.decoder.layers.12.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '5.265e-05' + min: '-1.251e-01' + shape: + - 1024 + - 1024 + sum: '5.521e+01' 
+network.model.decoder.layers.12.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.025e-01' + mean: '4.391e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.497e+00' +network.model.decoder.layers.12.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.13.fc1.bias: + device: cuda:0 + max: '9.039e-02' + mean: '-2.392e-02' + min: '-7.361e-02' + shape: + - 4096 + sum: '-9.798e+01' +network.model.decoder.layers.13.fc1.weight: + device: cuda:0 + max: '1.263e-01' + mean: '-2.766e-04' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '-1.160e+03' +network.model.decoder.layers.13.fc2.bias: + device: cuda:0 + max: '7.214e-02' + mean: '2.524e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.584e-01' +network.model.decoder.layers.13.fc2.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-2.636e-06' + min: '-1.754e-01' + shape: + - 1024 + - 4096 + sum: '-1.106e+01' +network.model.decoder.layers.13.final_layer_norm.bias: + device: cuda:0 + max: '1.246e-01' + mean: '-2.340e-02' + min: '-1.254e-01' + shape: + - 1024 + sum: '-2.396e+01' +network.model.decoder.layers.13.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.13.self_attn.k_proj.bias: + device: cuda:0 + max: '7.465e-02' + mean: '5.789e-03' + min: '-7.758e-02' + shape: + - 1024 + sum: '5.928e+00' +network.model.decoder.layers.13.self_attn.k_proj.weight: + device: cuda:0 + max: '1.281e-01' + mean: '3.542e-05' + min: '-1.283e-01' + shape: + - 1024 + - 1024 + sum: '3.714e+01' +network.model.decoder.layers.13.self_attn.out_proj.bias: + device: cuda:0 + max: '6.506e-02' + mean: '-2.055e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.104e-01' +network.model.decoder.layers.13.self_attn.out_proj.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.117e-05' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '-1.171e+01' +network.model.decoder.layers.13.self_attn.q_proj.bias: + device: cuda:0 + max: '1.247e-01' + mean: '-2.867e-03' + min: '-1.138e-01' + shape: + - 1024 + sum: '-2.936e+00' +network.model.decoder.layers.13.self_attn.q_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '3.923e-05' + min: '-1.273e-01' + shape: + - 1024 + - 1024 + sum: '4.114e+01' +network.model.decoder.layers.13.self_attn.v_proj.bias: + device: cuda:0 + max: '4.150e-02' + mean: '-2.426e-04' + min: '-4.178e-02' + shape: + - 1024 + sum: '-2.485e-01' +network.model.decoder.layers.13.self_attn.v_proj.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-6.461e-05' + min: '-1.251e-01' + shape: + - 1024 + - 1024 + sum: '-6.775e+01' +network.model.decoder.layers.13.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.247e-01' + mean: '3.063e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.137e+00' +network.model.decoder.layers.13.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.14.fc1.bias: + device: cuda:0 + max: '6.329e-02' + mean: '-2.279e-02' + min: '-6.866e-02' + shape: + - 4096 + sum: '-9.333e+01' +network.model.decoder.layers.14.fc1.weight: + device: cuda:0 + max: '1.261e-01' + mean: '-1.687e-04' + min: '-1.256e-01' + shape: + - 4096 + - 1024 + sum: '-7.075e+02' +network.model.decoder.layers.14.fc2.bias: + device: cuda:0 + max: '8.209e-02' + mean: '2.395e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.453e-01' 
+network.model.decoder.layers.14.fc2.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-1.073e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-4.501e+00' +network.model.decoder.layers.14.final_layer_norm.bias: + device: cuda:0 + max: '1.249e-01' + mean: '-2.171e-02' + min: '-1.277e-01' + shape: + - 1024 + sum: '-2.223e+01' +network.model.decoder.layers.14.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.14.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '4.583e-03' + min: '-1.03e-01' + shape: + - 1024 + sum: '4.693e+00' +network.model.decoder.layers.14.self_attn.k_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '3.023e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '3.170e+01' +network.model.decoder.layers.14.self_attn.out_proj.bias: + device: cuda:0 + max: '6.335e-02' + mean: '-2.293e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.348e-01' +network.model.decoder.layers.14.self_attn.out_proj.weight: + device: cuda:0 + max: '1.292e-01' + mean: '-1.601e-05' + min: '-1.316e-01' + shape: + - 1024 + - 1024 + sum: '-1.679e+01' +network.model.decoder.layers.14.self_attn.q_proj.bias: + device: cuda:0 + max: '1.237e-01' + mean: '-1.509e-03' + min: '-1.181e-01' + shape: + - 1024 + sum: '-1.546e+00' +network.model.decoder.layers.14.self_attn.q_proj.weight: + device: cuda:0 + max: '1.263e-01' + mean: '3.587e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '3.761e+01' +network.model.decoder.layers.14.self_attn.v_proj.bias: + device: cuda:0 + max: '4.108e-02' + mean: '4.279e-04' + min: '-3.915e-02' + shape: + - 1024 + sum: '4.381e-01' +network.model.decoder.layers.14.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '6.315e-06' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '6.622e+00' +network.model.decoder.layers.14.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '9.48e-04' + min: '-1.285e-01' + shape: + - 1024 + sum: '9.707e-01' +network.model.decoder.layers.14.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.15.fc1.bias: + device: cuda:0 + max: '6.256e-02' + mean: '-2.178e-02' + min: '-7.373e-02' + shape: + - 4096 + sum: '-8.921e+01' +network.model.decoder.layers.15.fc1.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-2.048e-04' + min: '-1.274e-01' + shape: + - 4096 + - 1024 + sum: '-8.590e+02' +network.model.decoder.layers.15.fc2.bias: + device: cuda:0 + max: '7.629e-02' + mean: '-2.647e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.711e-01' +network.model.decoder.layers.15.fc2.weight: + device: cuda:0 + max: '1.273e-01' + mean: '-1.300e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-5.454e+00' +network.model.decoder.layers.15.final_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '-2.09e-02' + min: '-1.271e-01' + shape: + - 1024 + sum: '-2.14e+01' +network.model.decoder.layers.15.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.15.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '5.291e-03' + min: '-8.069e-02' + shape: + - 1024 + sum: '5.418e+00' +network.model.decoder.layers.15.self_attn.k_proj.weight: + device: cuda:0 + max: '1.259e-01' + mean: '3.431e-05' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: 
'3.598e+01' +network.model.decoder.layers.15.self_attn.out_proj.bias: + device: cuda:0 + max: '6.873e-02' + mean: '2.003e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.051e-02' +network.model.decoder.layers.15.self_attn.out_proj.weight: + device: cuda:0 + max: '1.798e-01' + mean: '1.003e-06' + min: '-1.726e-01' + shape: + - 1024 + - 1024 + sum: '1.052e+00' +network.model.decoder.layers.15.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.456e-03' + min: '-1.242e-01' + shape: + - 1024 + sum: '1.491e+00' +network.model.decoder.layers.15.self_attn.q_proj.weight: + device: cuda:0 + max: '1.271e-01' + mean: '-2.108e-05' + min: '-1.259e-01' + shape: + - 1024 + - 1024 + sum: '-2.21e+01' +network.model.decoder.layers.15.self_attn.v_proj.bias: + device: cuda:0 + max: '4.312e-02' + mean: '-6.573e-04' + min: '-4.214e-02' + shape: + - 1024 + sum: '-6.731e-01' +network.model.decoder.layers.15.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '-1.231e-04' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-1.291e+02' +network.model.decoder.layers.15.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.033e-03' + min: '-1.627e-01' + shape: + - 1024 + sum: '1.058e+00' +network.model.decoder.layers.15.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.16.fc1.bias: + device: cuda:0 + max: '1.138e-01' + mean: '-2.057e-02' + min: '-8.105e-02' + shape: + - 4096 + sum: '-8.427e+01' +network.model.decoder.layers.16.fc1.weight: + device: cuda:0 + max: '1.261e-01' + mean: '-1.731e-04' + min: '-1.263e-01' + shape: + - 4096 + - 1024 + sum: '-7.259e+02' +network.model.decoder.layers.16.fc2.bias: + device: cuda:0 + max: '7.257e-02' + mean: '-1.059e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.085e-01' +network.model.decoder.layers.16.fc2.weight: + device: cuda:0 + max: '1.387e-01' + mean: '-4.515e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.894e+01' +network.model.decoder.layers.16.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.704e-02' + min: '-1.285e-01' + shape: + - 1024 + sum: '-1.745e+01' +network.model.decoder.layers.16.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.16.self_attn.k_proj.bias: + device: cuda:0 + max: '1.117e-01' + mean: '6.356e-03' + min: '-9.009e-02' + shape: + - 1024 + sum: '6.508e+00' +network.model.decoder.layers.16.self_attn.k_proj.weight: + device: cuda:0 + max: '1.27e-01' + mean: '-1.634e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '-1.713e+01' +network.model.decoder.layers.16.self_attn.out_proj.bias: + device: cuda:0 + max: '8.398e-02' + mean: '4.806e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.921e-02' +network.model.decoder.layers.16.self_attn.out_proj.weight: + device: cuda:0 + max: '1.553e-01' + mean: '-3.501e-06' + min: '-1.626e-01' + shape: + - 1024 + - 1024 + sum: '-3.671e+00' +network.model.decoder.layers.16.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.884e-04' + min: '-1.246e-01' + shape: + - 1024 + sum: '-1.929e-01' +network.model.decoder.layers.16.self_attn.q_proj.weight: + device: cuda:0 + max: '1.261e-01' + mean: '2.789e-06' + min: '-1.278e-01' + shape: + - 1024 + - 1024 + sum: '2.924e+00' +network.model.decoder.layers.16.self_attn.v_proj.bias: + device: cuda:0 + max: '4.462e-02' + mean: '-7.8e-04' + 
min: '-4.309e-02' + shape: + - 1024 + sum: '-7.987e-01' +network.model.decoder.layers.16.self_attn.v_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-9.28e-05' + min: '-1.259e-01' + shape: + - 1024 + - 1024 + sum: '-9.731e+01' +network.model.decoder.layers.16.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.252e-01' + mean: '1.154e-03' + min: '-2.112e-01' + shape: + - 1024 + sum: '1.182e+00' +network.model.decoder.layers.16.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.17.fc1.bias: + device: cuda:0 + max: '1.113e-01' + mean: '-2.007e-02' + min: '-7.483e-02' + shape: + - 4096 + sum: '-8.219e+01' +network.model.decoder.layers.17.fc1.weight: + device: cuda:0 + max: '1.27e-01' + mean: '-1.176e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-4.934e+02' +network.model.decoder.layers.17.fc2.bias: + device: cuda:0 + max: '6.415e-02' + mean: '2.448e-06' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.507e-03' +network.model.decoder.layers.17.fc2.weight: + device: cuda:0 + max: '1.431e-01' + mean: '-1.922e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-8.062e+00' +network.model.decoder.layers.17.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.363e-02' + min: '-1.307e-01' + shape: + - 1024 + sum: '-1.396e+01' +network.model.decoder.layers.17.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.17.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.524e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.609e+00' +network.model.decoder.layers.17.self_attn.k_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-6.266e-06' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '-6.571e+00' +network.model.decoder.layers.17.self_attn.out_proj.bias: + device: cuda:0 + max: '8.557e-02' + mean: '7.932e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.123e-02' +network.model.decoder.layers.17.self_attn.out_proj.weight: + device: cuda:0 + max: '1.682e-01' + mean: '1.080e-05' + min: '-1.591e-01' + shape: + - 1024 + - 1024 + sum: '1.133e+01' +network.model.decoder.layers.17.self_attn.q_proj.bias: + device: cuda:0 + max: '1.081e-01' + mean: '8.627e-04' + min: '-1.006e-01' + shape: + - 1024 + sum: '8.834e-01' +network.model.decoder.layers.17.self_attn.q_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-1.448e-05' + min: '-1.262e-01' + shape: + - 1024 + - 1024 + sum: '-1.518e+01' +network.model.decoder.layers.17.self_attn.v_proj.bias: + device: cuda:0 + max: '4.285e-02' + mean: '4.112e-04' + min: '-4.175e-02' + shape: + - 1024 + sum: '4.211e-01' +network.model.decoder.layers.17.self_attn.v_proj.weight: + device: cuda:0 + max: '1.254e-01' + mean: '-1.06e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-1.111e+01' +network.model.decoder.layers.17.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.74e-04' + min: '-1.978e-01' + shape: + - 1024 + sum: '1.781e-01' +network.model.decoder.layers.17.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.18.fc1.bias: + device: cuda:0 + max: '6.793e-02' + mean: '-1.838e-02' + min: '-8.258e-02' + shape: + - 4096 + sum: '-7.527e+01' +network.model.decoder.layers.18.fc1.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.719e-04' + 
min: '-1.256e-01' + shape: + - 4096 + - 1024 + sum: '-7.209e+02' +network.model.decoder.layers.18.fc2.bias: + device: cuda:0 + max: '6.201e-02' + mean: '-3.286e-06' + min: '-1.06e-01' + shape: + - 1024 + sum: '-3.364e-03' +network.model.decoder.layers.18.fc2.weight: + device: cuda:0 + max: '1.271e-01' + mean: '2.113e-06' + min: '-1.885e-01' + shape: + - 1024 + - 4096 + sum: '8.863e+00' +network.model.decoder.layers.18.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.239e-02' + min: '-1.262e-01' + shape: + - 1024 + sum: '-1.268e+01' +network.model.decoder.layers.18.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.18.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '5.307e-03' + min: '-1.218e-01' + shape: + - 1024 + sum: '5.434e+00' +network.model.decoder.layers.18.self_attn.k_proj.weight: + device: cuda:0 + max: '1.26e-01' + mean: '1.154e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '1.210e+01' +network.model.decoder.layers.18.self_attn.out_proj.bias: + device: cuda:0 + max: '7.617e-02' + mean: '-8.257e-06' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.455e-03' +network.model.decoder.layers.18.self_attn.out_proj.weight: + device: cuda:0 + max: '1.453e-01' + mean: '-6.184e-06' + min: '-1.554e-01' + shape: + - 1024 + - 1024 + sum: '-6.484e+00' +network.model.decoder.layers.18.self_attn.q_proj.bias: + device: cuda:0 + max: '1.002e-01' + mean: '-2.302e-03' + min: '-1.179e-01' + shape: + - 1024 + sum: '-2.357e+00' +network.model.decoder.layers.18.self_attn.q_proj.weight: + device: cuda:0 + max: '1.274e-01' + mean: '-2.129e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '-2.233e+01' +network.model.decoder.layers.18.self_attn.v_proj.bias: + device: cuda:0 + max: '4.874e-02' + mean: '-1.296e-04' + min: '-4.315e-02' + shape: + - 1024 + sum: '-1.327e-01' +network.model.decoder.layers.18.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '-5.472e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-5.738e+01' +network.model.decoder.layers.18.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.729e-03' + min: '-1.528e-01' + shape: + - 1024 + sum: '1.771e+00' +network.model.decoder.layers.18.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.19.fc1.bias: + device: cuda:0 + max: '9.674e-02' + mean: '-1.617e-02' + min: '-7.123e-02' + shape: + - 4096 + sum: '-6.623e+01' +network.model.decoder.layers.19.fc1.weight: + device: cuda:0 + max: '1.276e-01' + mean: '-1.816e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-7.616e+02' +network.model.decoder.layers.19.fc2.bias: + device: cuda:0 + max: '6.439e-02' + mean: '-2.292e-04' + min: '-7.587e-02' + shape: + - 1024 + sum: '-2.347e-01' +network.model.decoder.layers.19.fc2.weight: + device: cuda:0 + max: '1.273e-01' + mean: '6.639e-06' + min: '-1.782e-01' + shape: + - 1024 + - 4096 + sum: '2.785e+01' +network.model.decoder.layers.19.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-9.252e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-9.474e+00' +network.model.decoder.layers.19.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.19.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: 
'7.829e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.017e+00' +network.model.decoder.layers.19.self_attn.k_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-2.187e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '-2.294e+01' +network.model.decoder.layers.19.self_attn.out_proj.bias: + device: cuda:0 + max: '6.445e-02' + mean: '2.324e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.380e-01' +network.model.decoder.layers.19.self_attn.out_proj.weight: + device: cuda:0 + max: '1.454e-01' + mean: '-5.801e-08' + min: '-1.431e-01' + shape: + - 1024 + - 1024 + sum: '-6.083e-02' +network.model.decoder.layers.19.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '-2.284e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.338e+00' +network.model.decoder.layers.19.self_attn.q_proj.weight: + device: cuda:0 + max: '1.276e-01' + mean: '8.971e-05' + min: '-1.281e-01' + shape: + - 1024 + - 1024 + sum: '9.406e+01' +network.model.decoder.layers.19.self_attn.v_proj.bias: + device: cuda:0 + max: '4.413e-02' + mean: '-1.693e-04' + min: '-4.315e-02' + shape: + - 1024 + sum: '-1.733e-01' +network.model.decoder.layers.19.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '-6.37e-05' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-6.679e+01' +network.model.decoder.layers.19.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.325e-03' + min: '-1.936e-01' + shape: + - 1024 + sum: '3.405e+00' +network.model.decoder.layers.19.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.2.fc1.bias: + device: cuda:0 + max: '7.135e-02' + mean: '-2.341e-02' + min: '-6.665e-02' + shape: + - 4096 + sum: '-9.591e+01' +network.model.decoder.layers.2.fc1.weight: + device: cuda:0 + max: '1.25e-01' + mean: '2.334e-04' + min: '-1.255e-01' + shape: + - 4096 + - 1024 + sum: '9.791e+02' +network.model.decoder.layers.2.fc2.bias: + device: cuda:0 + max: '7.172e-02' + mean: '3.129e-04' + min: '-7.66e-02' + shape: + - 1024 + sum: '3.204e-01' +network.model.decoder.layers.2.fc2.weight: + device: cuda:0 + max: '1.294e-01' + mean: '-1.695e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-7.109e+00' +network.model.decoder.layers.2.final_layer_norm.bias: + device: cuda:0 + max: '1.257e-01' + mean: '9.144e-03' + min: '-1.251e-01' + shape: + - 1024 + sum: '9.364e+00' +network.model.decoder.layers.2.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.2.self_attn.k_proj.bias: + device: cuda:0 + max: '6.384e-02' + mean: '8.869e-03' + min: '-6.445e-02' + shape: + - 1024 + sum: '9.082e+00' +network.model.decoder.layers.2.self_attn.k_proj.weight: + device: cuda:0 + max: '1.292e-01' + mean: '2.489e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '2.61e+01' +network.model.decoder.layers.2.self_attn.out_proj.bias: + device: cuda:0 + max: '1.234e-01' + mean: '3.411e-04' + min: '-8.948e-02' + shape: + - 1024 + sum: '3.493e-01' +network.model.decoder.layers.2.self_attn.out_proj.weight: + device: cuda:0 + max: '1.317e-01' + mean: '-6.495e-06' + min: '-1.283e-01' + shape: + - 1024 + - 1024 + sum: '-6.811e+00' +network.model.decoder.layers.2.self_attn.q_proj.bias: + device: cuda:0 + max: '1.249e-01' + mean: '9.792e-04' + min: '-1.255e-01' + shape: + - 1024 + sum: '1.003e+00' +network.model.decoder.layers.2.self_attn.q_proj.weight: + device: cuda:0 
+ max: '1.257e-01' + mean: '1.202e-05' + min: '-1.271e-01' + shape: + - 1024 + - 1024 + sum: '1.260e+01' +network.model.decoder.layers.2.self_attn.v_proj.bias: + device: cuda:0 + max: '4.211e-02' + mean: '-9.478e-05' + min: '-3.799e-02' + shape: + - 1024 + sum: '-9.706e-02' +network.model.decoder.layers.2.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '3.971e-05' + min: '-1.171e-01' + shape: + - 1024 + - 1024 + sum: '4.164e+01' +network.model.decoder.layers.2.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.309e-01' + mean: '-1.911e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.957e+00' +network.model.decoder.layers.2.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.20.fc1.bias: + device: cuda:0 + max: '7.928e-02' + mean: '-1.524e-02' + min: '-7.220e-02' + shape: + - 4096 + sum: '-6.244e+01' +network.model.decoder.layers.20.fc1.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.853e-04' + min: '-1.271e-01' + shape: + - 4096 + - 1024 + sum: '-7.770e+02' +network.model.decoder.layers.20.fc2.bias: + device: cuda:0 + max: '6.787e-02' + mean: '-1.132e-04' + min: '-7.617e-02' + shape: + - 1024 + sum: '-1.159e-01' +network.model.decoder.layers.20.fc2.weight: + device: cuda:0 + max: '1.27e-01' + mean: '6.366e-06' + min: '-2.393e-01' + shape: + - 1024 + - 4096 + sum: '2.670e+01' +network.model.decoder.layers.20.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-9.149e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-9.369e+00' +network.model.decoder.layers.20.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.20.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.126e-02' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.153e+01' +network.model.decoder.layers.20.self_attn.k_proj.weight: + device: cuda:0 + max: '1.356e-01' + mean: '4.825e-05' + min: '-1.333e-01' + shape: + - 1024 + - 1024 + sum: '5.059e+01' +network.model.decoder.layers.20.self_attn.out_proj.bias: + device: cuda:0 + max: '6.512e-02' + mean: '-8.754e-05' + min: '-1.215e-01' + shape: + - 1024 + sum: '-8.964e-02' +network.model.decoder.layers.20.self_attn.out_proj.weight: + device: cuda:0 + max: '1.334e-01' + mean: '8.321e-06' + min: '-1.311e-01' + shape: + - 1024 + - 1024 + sum: '8.725e+00' +network.model.decoder.layers.20.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '-2.386e-03' + min: '-1.256e-01' + shape: + - 1024 + sum: '-2.444e+00' +network.model.decoder.layers.20.self_attn.q_proj.weight: + device: cuda:0 + max: '1.278e-01' + mean: '1.178e-07' + min: '-1.279e-01' + shape: + - 1024 + - 1024 + sum: '1.235e-01' +network.model.decoder.layers.20.self_attn.v_proj.bias: + device: cuda:0 + max: '4.395e-02' + mean: '-3.544e-04' + min: '-4.248e-02' + shape: + - 1024 + sum: '-3.629e-01' +network.model.decoder.layers.20.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '1.676e-06' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '1.757e+00' +network.model.decoder.layers.20.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.003e-03' + min: '-1.256e-01' + shape: + - 1024 + sum: '3.075e+00' +network.model.decoder.layers.20.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' 
+network.model.decoder.layers.21.fc1.bias: + device: cuda:0 + max: '8.362e-02' + mean: '-1.634e-02' + min: '-9.613e-02' + shape: + - 4096 + sum: '-6.693e+01' +network.model.decoder.layers.21.fc1.weight: + device: cuda:0 + max: '1.289e-01' + mean: '-1.814e-04' + min: '-1.299e-01' + shape: + - 4096 + - 1024 + sum: '-7.611e+02' +network.model.decoder.layers.21.fc2.bias: + device: cuda:0 + max: '9.045e-02' + mean: '5.474e-05' + min: '-7.306e-02' + shape: + - 1024 + sum: '5.605e-02' +network.model.decoder.layers.21.fc2.weight: + device: cuda:0 + max: '1.322e-01' + mean: '3.575e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '1.499e+00' +network.model.decoder.layers.21.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-5.773e-03' + min: '-1.249e-01' + shape: + - 1024 + sum: '-5.912e+00' +network.model.decoder.layers.21.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.21.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '9.81e-03' + min: '-1.318e-01' + shape: + - 1024 + sum: '1.005e+01' +network.model.decoder.layers.21.self_attn.k_proj.weight: + device: cuda:0 + max: '1.425e-01' + mean: '-2.337e-05' + min: '-1.454e-01' + shape: + - 1024 + - 1024 + sum: '-2.450e+01' +network.model.decoder.layers.21.self_attn.out_proj.bias: + device: cuda:0 + max: '7.263e-02' + mean: '-6.624e-05' + min: '-9.937e-02' + shape: + - 1024 + sum: '-6.783e-02' +network.model.decoder.layers.21.self_attn.out_proj.weight: + device: cuda:0 + max: '1.294e-01' + mean: '1.762e-06' + min: '-1.285e-01' + shape: + - 1024 + - 1024 + sum: '1.847e+00' +network.model.decoder.layers.21.self_attn.q_proj.bias: + device: cuda:0 + max: '1.257e-01' + mean: '-1.89e-03' + min: '-1.257e-01' + shape: + - 1024 + sum: '-1.935e+00' +network.model.decoder.layers.21.self_attn.q_proj.weight: + device: cuda:0 + max: '1.327e-01' + mean: '-1.882e-05' + min: '-1.31e-01' + shape: + - 1024 + - 1024 + sum: '-1.974e+01' +network.model.decoder.layers.21.self_attn.v_proj.bias: + device: cuda:0 + max: '4.669e-02' + mean: '-2.74e-04' + min: '-4.211e-02' + shape: + - 1024 + sum: '-2.806e-01' +network.model.decoder.layers.21.self_attn.v_proj.weight: + device: cuda:0 + max: '1.25e-01' + mean: '-7.892e-05' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-8.276e+01' +network.model.decoder.layers.21.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.155e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.231e+00' +network.model.decoder.layers.21.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.22.fc1.bias: + device: cuda:0 + max: '1.251e-01' + mean: '-1.548e-02' + min: '-1.254e-01' + shape: + - 4096 + sum: '-6.341e+01' +network.model.decoder.layers.22.fc1.weight: + device: cuda:0 + max: '1.278e-01' + mean: '-1.567e-04' + min: '-1.277e-01' + shape: + - 4096 + - 1024 + sum: '-6.574e+02' +network.model.decoder.layers.22.fc2.bias: + device: cuda:0 + max: '7.642e-02' + mean: '1.103e-04' + min: '-7.037e-02' + shape: + - 1024 + sum: '1.13e-01' +network.model.decoder.layers.22.fc2.weight: + device: cuda:0 + max: '1.279e-01' + mean: '1.737e-06' + min: '-1.288e-01' + shape: + - 1024 + - 4096 + sum: '7.287e+00' +network.model.decoder.layers.22.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-4.785e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-4.9e+00' 
+network.model.decoder.layers.22.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.22.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '6.801e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '6.964e+00' +network.model.decoder.layers.22.self_attn.k_proj.weight: + device: cuda:0 + max: '1.401e-01' + mean: '-8.573e-06' + min: '-1.409e-01' + shape: + - 1024 + - 1024 + sum: '-8.99e+00' +network.model.decoder.layers.22.self_attn.out_proj.bias: + device: cuda:0 + max: '7.709e-02' + mean: '-1.158e-05' + min: '-8.099e-02' + shape: + - 1024 + sum: '-1.186e-02' +network.model.decoder.layers.22.self_attn.out_proj.weight: + device: cuda:0 + max: '1.302e-01' + mean: '-1.088e-06' + min: '-1.293e-01' + shape: + - 1024 + - 1024 + sum: '-1.141e+00' +network.model.decoder.layers.22.self_attn.q_proj.bias: + device: cuda:0 + max: '1.013e-01' + mean: '-1.666e-03' + min: '-1.021e-01' + shape: + - 1024 + sum: '-1.706e+00' +network.model.decoder.layers.22.self_attn.q_proj.weight: + device: cuda:0 + max: '1.331e-01' + mean: '-2.958e-05' + min: '-1.338e-01' + shape: + - 1024 + - 1024 + sum: '-3.102e+01' +network.model.decoder.layers.22.self_attn.v_proj.bias: + device: cuda:0 + max: '4.211e-02' + mean: '5.506e-04' + min: '-4.501e-02' + shape: + - 1024 + sum: '5.638e-01' +network.model.decoder.layers.22.self_attn.v_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-2.981e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-3.125e+01' +network.model.decoder.layers.22.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '7.961e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.152e-01' +network.model.decoder.layers.22.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.23.fc1.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.694e-03' + min: '-1.278e-01' + shape: + - 4096 + sum: '1.103e+01' +network.model.decoder.layers.23.fc1.weight: + device: cuda:0 + max: '2.107e-01' + mean: '8.400e-05' + min: '-2.146e-01' + shape: + - 4096 + - 1024 + sum: '3.523e+02' +network.model.decoder.layers.23.fc2.bias: + device: cuda:0 + max: '6.299e-02' + mean: '1.316e-03' + min: '-6.311e-02' + shape: + - 1024 + sum: '1.348e+00' +network.model.decoder.layers.23.fc2.weight: + device: cuda:0 + max: '2.5e-01' + mean: '1.024e-05' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '4.294e+01' +network.model.decoder.layers.23.final_layer_norm.bias: + device: cuda:0 + max: '7.251e-02' + mean: '9.345e-03' + min: '-7.196e-02' + shape: + - 1024 + sum: '9.57e+00' +network.model.decoder.layers.23.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.23.self_attn.k_proj.bias: + device: cuda:0 + max: '2.219e-01' + mean: '3.647e-03' + min: '-1.824e-01' + shape: + - 1024 + sum: '3.734e+00' +network.model.decoder.layers.23.self_attn.k_proj.weight: + device: cuda:0 + max: '1.294e-01' + mean: '-1.63e-05' + min: '-1.304e-01' + shape: + - 1024 + - 1024 + sum: '-1.709e+01' +network.model.decoder.layers.23.self_attn.out_proj.bias: + device: cuda:0 + max: '7.605e-02' + mean: '-1.183e-04' + min: '-6.47e-02' + shape: + - 1024 + sum: '-1.212e-01' +network.model.decoder.layers.23.self_attn.out_proj.weight: + device: cuda:0 + max: '2.5e-01' + mean: '-1.078e-05' + min: '-2.5e-01' + shape: + - 1024 + - 
1024 + sum: '-1.130e+01' +network.model.decoder.layers.23.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-2.744e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.809e-01' +network.model.decoder.layers.23.self_attn.q_proj.weight: + device: cuda:0 + max: '1.338e-01' + mean: '2.096e-05' + min: '-1.337e-01' + shape: + - 1024 + - 1024 + sum: '2.197e+01' +network.model.decoder.layers.23.self_attn.v_proj.bias: + device: cuda:0 + max: '4.068e-02' + mean: '2.158e-05' + min: '-4.48e-02' + shape: + - 1024 + sum: '2.210e-02' +network.model.decoder.layers.23.self_attn.v_proj.weight: + device: cuda:0 + max: '1.267e-01' + mean: '6.273e-05' + min: '-1.256e-01' + shape: + - 1024 + - 1024 + sum: '6.577e+01' +network.model.decoder.layers.23.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.700e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.741e+00' +network.model.decoder.layers.23.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.3.fc1.bias: + device: cuda:0 + max: '8.453e-02' + mean: '-2.474e-02' + min: '-1.194e-01' + shape: + - 4096 + sum: '-1.013e+02' +network.model.decoder.layers.3.fc1.weight: + device: cuda:0 + max: '1.251e-01' + mean: '1.348e-04' + min: '-1.252e-01' + shape: + - 4096 + - 1024 + sum: '5.654e+02' +network.model.decoder.layers.3.fc2.bias: + device: cuda:0 + max: '7.086e-02' + mean: '1.769e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.811e-01' +network.model.decoder.layers.3.fc2.weight: + device: cuda:0 + max: '1.276e-01' + mean: '1.857e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '7.790e+00' +network.model.decoder.layers.3.final_layer_norm.bias: + device: cuda:0 + max: '1.254e-01' + mean: '6.555e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '6.712e+00' +network.model.decoder.layers.3.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.3.self_attn.k_proj.bias: + device: cuda:0 + max: '6.372e-02' + mean: '8.278e-03' + min: '-3.555e-02' + shape: + - 1024 + sum: '8.477e+00' +network.model.decoder.layers.3.self_attn.k_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.901e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-1.993e+01' +network.model.decoder.layers.3.self_attn.out_proj.bias: + device: cuda:0 + max: '1.240e-01' + mean: '1.084e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.11e-01' +network.model.decoder.layers.3.self_attn.out_proj.weight: + device: cuda:0 + max: '1.764e-01' + mean: '-1.601e-06' + min: '-1.614e-01' + shape: + - 1024 + - 1024 + sum: '-1.679e+00' +network.model.decoder.layers.3.self_attn.q_proj.bias: + device: cuda:0 + max: '1.248e-01' + mean: '-2.804e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.871e-01' +network.model.decoder.layers.3.self_attn.q_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.642e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-1.721e+01' +network.model.decoder.layers.3.self_attn.v_proj.bias: + device: cuda:0 + max: '3.882e-02' + mean: '-9.93e-04' + min: '-4.312e-02' + shape: + - 1024 + sum: '-1.017e+00' +network.model.decoder.layers.3.self_attn.v_proj.weight: + device: cuda:0 + max: '1.216e-01' + mean: '-9.011e-05' + min: '-1.204e-01' + shape: + - 1024 + - 1024 + sum: '-9.449e+01' +network.model.decoder.layers.3.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.290e-01' + mean: '-4.648e-04' + min: 
'-1.259e-01' + shape: + - 1024 + sum: '-4.76e-01' +network.model.decoder.layers.3.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.4.fc1.bias: + device: cuda:0 + max: '7.648e-02' + mean: '-2.333e-02' + min: '-1.11e-01' + shape: + - 4096 + sum: '-9.556e+01' +network.model.decoder.layers.4.fc1.weight: + device: cuda:0 + max: '1.252e-01' + mean: '7.858e-05' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '3.296e+02' +network.model.decoder.layers.4.fc2.bias: + device: cuda:0 + max: '6.671e-02' + mean: '6.644e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '6.803e-01' +network.model.decoder.layers.4.fc2.weight: + device: cuda:0 + max: '1.281e-01' + mean: '2.081e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '8.729e+00' +network.model.decoder.layers.4.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.551e-03' + min: '-1.259e-01' + shape: + - 1024 + sum: '2.613e+00' +network.model.decoder.layers.4.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.4.self_attn.k_proj.bias: + device: cuda:0 + max: '6.433e-02' + mean: '9.123e-03' + min: '-6.219e-02' + shape: + - 1024 + sum: '9.342e+00' +network.model.decoder.layers.4.self_attn.k_proj.weight: + device: cuda:0 + max: '1.298e-01' + mean: '3.159e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '3.312e+01' +network.model.decoder.layers.4.self_attn.out_proj.bias: + device: cuda:0 + max: '1.113e-01' + mean: '3.284e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.363e-01' +network.model.decoder.layers.4.self_attn.out_proj.weight: + device: cuda:0 + max: '1.307e-01' + mean: '5.154e-06' + min: '-1.296e-01' + shape: + - 1024 + - 1024 + sum: '5.404e+00' +network.model.decoder.layers.4.self_attn.q_proj.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.442e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.477e+00' +network.model.decoder.layers.4.self_attn.q_proj.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.649e-06' + min: '-1.267e-01' + shape: + - 1024 + - 1024 + sum: '-1.729e+00' +network.model.decoder.layers.4.self_attn.v_proj.bias: + device: cuda:0 + max: '3.711e-02' + mean: '1.497e-04' + min: '-3.909e-02' + shape: + - 1024 + sum: '1.533e-01' +network.model.decoder.layers.4.self_attn.v_proj.weight: + device: cuda:0 + max: '1.139e-01' + mean: '6.411e-05' + min: '-1.227e-01' + shape: + - 1024 + - 1024 + sum: '6.722e+01' +network.model.decoder.layers.4.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.271e-01' + mean: '1.923e-04' + min: '-1.272e-01' + shape: + - 1024 + sum: '1.969e-01' +network.model.decoder.layers.4.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.5.fc1.bias: + device: cuda:0 + max: '9.772e-02' + mean: '-2.182e-02' + min: '-1.219e-01' + shape: + - 4096 + sum: '-8.94e+01' +network.model.decoder.layers.5.fc1.weight: + device: cuda:0 + max: '1.257e-01' + mean: '1.105e-04' + min: '-1.254e-01' + shape: + - 4096 + - 1024 + sum: '4.637e+02' +network.model.decoder.layers.5.fc2.bias: + device: cuda:0 + max: '6.384e-02' + mean: '9.162e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '9.382e-02' +network.model.decoder.layers.5.fc2.weight: + device: cuda:0 + max: '1.262e-01' + mean: '4.982e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '2.089e+00' 
+network.model.decoder.layers.5.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '4.158e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.258e-01' +network.model.decoder.layers.5.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.5.self_attn.k_proj.bias: + device: cuda:0 + max: '7.245e-02' + mean: '1.13e-02' + min: '-5.319e-02' + shape: + - 1024 + sum: '1.157e+01' +network.model.decoder.layers.5.self_attn.k_proj.weight: + device: cuda:0 + max: '1.263e-01' + mean: '-5.184e-05' + min: '-1.263e-01' + shape: + - 1024 + - 1024 + sum: '-5.436e+01' +network.model.decoder.layers.5.self_attn.out_proj.bias: + device: cuda:0 + max: '1.068e-01' + mean: '2.054e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.103e-01' +network.model.decoder.layers.5.self_attn.out_proj.weight: + device: cuda:0 + max: '1.582e-01' + mean: '2.069e-05' + min: '-1.821e-01' + shape: + - 1024 + - 1024 + sum: '2.169e+01' +network.model.decoder.layers.5.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-6.643e-04' + min: '-1.254e-01' + shape: + - 1024 + sum: '-6.802e-01' +network.model.decoder.layers.5.self_attn.q_proj.weight: + device: cuda:0 + max: '1.261e-01' + mean: '1.035e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '1.086e+01' +network.model.decoder.layers.5.self_attn.v_proj.bias: + device: cuda:0 + max: '4.800e-02' + mean: '5.821e-04' + min: '-4.202e-02' + shape: + - 1024 + sum: '5.960e-01' +network.model.decoder.layers.5.self_attn.v_proj.weight: + device: cuda:0 + max: '1.182e-01' + mean: '1.019e-05' + min: '-1.202e-01' + shape: + - 1024 + - 1024 + sum: '1.068e+01' +network.model.decoder.layers.5.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.263e-01' + mean: '-4.794e-04' + min: '-1.257e-01' + shape: + - 1024 + sum: '-4.909e-01' +network.model.decoder.layers.5.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.6.fc1.bias: + device: cuda:0 + max: '1.191e-01' + mean: '-2.029e-02' + min: '-9.454e-02' + shape: + - 4096 + sum: '-8.312e+01' +network.model.decoder.layers.6.fc1.weight: + device: cuda:0 + max: '1.282e-01' + mean: '1.416e-04' + min: '-1.27e-01' + shape: + - 4096 + - 1024 + sum: '5.939e+02' +network.model.decoder.layers.6.fc2.bias: + device: cuda:0 + max: '6.439e-02' + mean: '-1.532e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.569e-01' +network.model.decoder.layers.6.fc2.weight: + device: cuda:0 + max: '1.343e-01' + mean: '-3.220e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.351e+00' +network.model.decoder.layers.6.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.357e-04' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.389e-01' +network.model.decoder.layers.6.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.6.self_attn.k_proj.bias: + device: cuda:0 + max: '8.856e-02' + mean: '1.296e-02' + min: '-6.641e-02' + shape: + - 1024 + sum: '1.327e+01' +network.model.decoder.layers.6.self_attn.k_proj.weight: + device: cuda:0 + max: '1.300e-01' + mean: '1.62e-05' + min: '-1.300e-01' + shape: + - 1024 + - 1024 + sum: '1.698e+01' +network.model.decoder.layers.6.self_attn.out_proj.bias: + device: cuda:0 + max: '6.47e-02' + mean: '-1.618e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.657e-01' 
+network.model.decoder.layers.6.self_attn.out_proj.weight: + device: cuda:0 + max: '1.340e-01' + mean: '9.419e-06' + min: '-1.305e-01' + shape: + - 1024 + - 1024 + sum: '9.877e+00' +network.model.decoder.layers.6.self_attn.q_proj.bias: + device: cuda:0 + max: '1.256e-01' + mean: '2.037e-03' + min: '-1.257e-01' + shape: + - 1024 + sum: '2.086e+00' +network.model.decoder.layers.6.self_attn.q_proj.weight: + device: cuda:0 + max: '1.272e-01' + mean: '4.741e-06' + min: '-1.276e-01' + shape: + - 1024 + - 1024 + sum: '4.972e+00' +network.model.decoder.layers.6.self_attn.v_proj.bias: + device: cuda:0 + max: '4.633e-02' + mean: '3.225e-05' + min: '-4.407e-02' + shape: + - 1024 + sum: '3.303e-02' +network.model.decoder.layers.6.self_attn.v_proj.weight: + device: cuda:0 + max: '1.147e-01' + mean: '4.657e-05' + min: '-1.19e-01' + shape: + - 1024 + - 1024 + sum: '4.883e+01' +network.model.decoder.layers.6.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.389e-06' + min: '-1.257e-01' + shape: + - 1024 + sum: '-1.423e-03' +network.model.decoder.layers.6.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.7.fc1.bias: + device: cuda:0 + max: '1.077e-01' + mean: '-2.155e-02' + min: '-1.226e-01' + shape: + - 4096 + sum: '-8.828e+01' +network.model.decoder.layers.7.fc1.weight: + device: cuda:0 + max: '1.284e-01' + mean: '1.858e-04' + min: '-1.311e-01' + shape: + - 4096 + - 1024 + sum: '7.793e+02' +network.model.decoder.layers.7.fc2.bias: + device: cuda:0 + max: '6.897e-02' + mean: '4.677e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.789e-02' +network.model.decoder.layers.7.fc2.weight: + device: cuda:0 + max: '1.459e-01' + mean: '-4.578e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.92e+00' +network.model.decoder.layers.7.final_layer_norm.bias: + device: cuda:0 + max: '1.093e-01' + mean: '-1.554e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.591e+00' +network.model.decoder.layers.7.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.7.self_attn.k_proj.bias: + device: cuda:0 + max: '1.021e-01' + mean: '1.303e-02' + min: '-6.25e-02' + shape: + - 1024 + sum: '1.334e+01' +network.model.decoder.layers.7.self_attn.k_proj.weight: + device: cuda:0 + max: '1.323e-01' + mean: '1.285e-05' + min: '-1.333e-01' + shape: + - 1024 + - 1024 + sum: '1.348e+01' +network.model.decoder.layers.7.self_attn.out_proj.bias: + device: cuda:0 + max: '5.948e-02' + mean: '2.333e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.389e-01' +network.model.decoder.layers.7.self_attn.out_proj.weight: + device: cuda:0 + max: '1.316e-01' + mean: '-1.173e-06' + min: '-1.301e-01' + shape: + - 1024 + - 1024 + sum: '-1.230e+00' +network.model.decoder.layers.7.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '3.876e-03' + min: '-1.261e-01' + shape: + - 1024 + sum: '3.969e+00' +network.model.decoder.layers.7.self_attn.q_proj.weight: + device: cuda:0 + max: '1.272e-01' + mean: '-3.278e-06' + min: '-1.292e-01' + shape: + - 1024 + - 1024 + sum: '-3.437e+00' +network.model.decoder.layers.7.self_attn.v_proj.bias: + device: cuda:0 + max: '4.297e-02' + mean: '4.138e-04' + min: '-4.077e-02' + shape: + - 1024 + sum: '4.237e-01' +network.model.decoder.layers.7.self_attn.v_proj.weight: + device: cuda:0 + max: '1.183e-01' + mean: '-3.309e-05' + min: '-1.174e-01' + shape: + - 1024 + - 
1024 + sum: '-3.47e+01' +network.model.decoder.layers.7.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.830e-04' + min: '-1.267e-01' + shape: + - 1024 + sum: '1.874e-01' +network.model.decoder.layers.7.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.8.fc1.bias: + device: cuda:0 + max: '6.335e-02' + mean: '-2.258e-02' + min: '-1.26e-01' + shape: + - 4096 + sum: '-9.249e+01' +network.model.decoder.layers.8.fc1.weight: + device: cuda:0 + max: '1.278e-01' + mean: '5.06e-05' + min: '-1.271e-01' + shape: + - 4096 + - 1024 + sum: '2.122e+02' +network.model.decoder.layers.8.fc2.bias: + device: cuda:0 + max: '6.818e-02' + mean: '-1.369e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.402e-01' +network.model.decoder.layers.8.fc2.weight: + device: cuda:0 + max: '1.392e-01' + mean: '-4.149e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.740e+01' +network.model.decoder.layers.8.final_layer_norm.bias: + device: cuda:0 + max: '6.47e-02' + mean: '-3.244e-03' + min: '-1.252e-01' + shape: + - 1024 + sum: '-3.322e+00' +network.model.decoder.layers.8.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.8.self_attn.k_proj.bias: + device: cuda:0 + max: '9.65e-02' + mean: '1.109e-02' + min: '-6.247e-02' + shape: + - 1024 + sum: '1.136e+01' +network.model.decoder.layers.8.self_attn.k_proj.weight: + device: cuda:0 + max: '1.318e-01' + mean: '8.991e-06' + min: '-1.32e-01' + shape: + - 1024 + - 1024 + sum: '9.428e+00' +network.model.decoder.layers.8.self_attn.out_proj.bias: + device: cuda:0 + max: '6.317e-02' + mean: '-7.463e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-7.643e-02' +network.model.decoder.layers.8.self_attn.out_proj.weight: + device: cuda:0 + max: '1.306e-01' + mean: '6.679e-06' + min: '-1.327e-01' + shape: + - 1024 + - 1024 + sum: '7.003e+00' +network.model.decoder.layers.8.self_attn.q_proj.bias: + device: cuda:0 + max: '1.256e-01' + mean: '1.131e-05' + min: '-1.257e-01' + shape: + - 1024 + sum: '1.159e-02' +network.model.decoder.layers.8.self_attn.q_proj.weight: + device: cuda:0 + max: '1.311e-01' + mean: '-4.181e-07' + min: '-1.293e-01' + shape: + - 1024 + - 1024 + sum: '-4.384e-01' +network.model.decoder.layers.8.self_attn.v_proj.bias: + device: cuda:0 + max: '4.486e-02' + mean: '5.294e-04' + min: '-4.657e-02' + shape: + - 1024 + sum: '5.421e-01' +network.model.decoder.layers.8.self_attn.v_proj.weight: + device: cuda:0 + max: '1.242e-01' + mean: '1.489e-05' + min: '-1.243e-01' + shape: + - 1024 + - 1024 + sum: '1.561e+01' +network.model.decoder.layers.8.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.027e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '1.052e+00' +network.model.decoder.layers.8.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.9.fc1.bias: + device: cuda:0 + max: '7.355e-02' + mean: '-2.086e-02' + min: '-8.301e-02' + shape: + - 4096 + sum: '-8.545e+01' +network.model.decoder.layers.9.fc1.weight: + device: cuda:0 + max: '1.256e-01' + mean: '2.51e-05' + min: '-1.265e-01' + shape: + - 4096 + - 1024 + sum: '1.053e+02' +network.model.decoder.layers.9.fc2.bias: + device: cuda:0 + max: '6.647e-02' + mean: '2.622e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.685e-01' 
+network.model.decoder.layers.9.fc2.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-3.312e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.389e+01' +network.model.decoder.layers.9.final_layer_norm.bias: + device: cuda:0 + max: '7.349e-02' + mean: '-8.035e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.227e+00' +network.model.decoder.layers.9.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.9.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '8.960e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '9.175e+00' +network.model.decoder.layers.9.self_attn.k_proj.weight: + device: cuda:0 + max: '1.346e-01' + mean: '4.302e-05' + min: '-1.346e-01' + shape: + - 1024 + - 1024 + sum: '4.511e+01' +network.model.decoder.layers.9.self_attn.out_proj.bias: + device: cuda:0 + max: '6.616e-02' + mean: '-8.681e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.89e-02' +network.model.decoder.layers.9.self_attn.out_proj.weight: + device: cuda:0 + max: '1.497e-01' + mean: '-7.002e-06' + min: '-1.382e-01' + shape: + - 1024 + - 1024 + sum: '-7.342e+00' +network.model.decoder.layers.9.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.336e-03' + min: '-1.208e-01' + shape: + - 1024 + sum: '2.392e+00' +network.model.decoder.layers.9.self_attn.q_proj.weight: + device: cuda:0 + max: '1.344e-01' + mean: '-1.583e-05' + min: '-1.379e-01' + shape: + - 1024 + - 1024 + sum: '-1.66e+01' +network.model.decoder.layers.9.self_attn.v_proj.bias: + device: cuda:0 + max: '6.241e-02' + mean: '2.777e-04' + min: '-6.464e-02' + shape: + - 1024 + sum: '2.844e-01' +network.model.decoder.layers.9.self_attn.v_proj.weight: + device: cuda:0 + max: '1.131e-01' + mean: '-2.935e-05' + min: '-1.183e-01' + shape: + - 1024 + - 1024 + sum: '-3.077e+01' +network.model.decoder.layers.9.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.812e-02' + mean: '9.632e-04' + min: '-1.255e-01' + shape: + - 1024 + sum: '9.864e-01' +network.model.decoder.layers.9.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.project_in.weight: + device: cuda:0 + max: '1.305e-01' + mean: '3.482e-05' + min: '-1.318e-01' + shape: + - 1024 + - 512 + sum: '1.826e+01' +network.model.decoder.project_out.weight: + device: cuda:0 + max: '1.373e-01' + mean: '8.706e-05' + min: '-1.376e-01' + shape: + - 512 + - 1024 + sum: '4.564e+01' diff --git a/.regression_files/project/algorithms/llm_finetuning_test/test_training_batch_doesnt_change/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_training_batch_doesnt_change/llm_finetuning.yaml new file mode 100644 index 00000000..84eb1516 --- /dev/null +++ b/.regression_files/project/algorithms/llm_finetuning_test/test_training_batch_doesnt_change/llm_finetuning.yaml @@ -0,0 +1,27 @@ +attention_mask: + device: cuda:0 + max: 1 + mean: '1.e+00' + min: 1 + shape: + - 8 + - 256 + sum: 2048 +input_ids: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +labels: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_test.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_test.yaml deleted file mode 
100644 index 37d8958b..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_test.yaml +++ /dev/null @@ -1,35 +0,0 @@ -attention_mask: - device: cpu - max: 1 - mean: '1.021e-01' - min: 0 - shape: - - 32 - - 128 - sum: 418 -input_ids: - device: cpu - max: 29043 - mean: '1.648e+02' - min: 0 - shape: - - 32 - - 128 - sum: 675172 -labels: - device: cpu - max: -1 - mean: '-1.e+00' - min: -1 - shape: - - 32 - sum: -32 -token_type_ids: - device: cpu - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_train.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_train.yaml deleted file mode 100644 index 89d6925e..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_train.yaml +++ /dev/null @@ -1,35 +0,0 @@ -attention_mask: - device: cpu - max: 1 - mean: '8.374e-02' - min: 0 - shape: - - 32 - - 128 - sum: 343 -input_ids: - device: cpu - max: 26101 - mean: '1.597e+02' - min: 0 - shape: - - 32 - - 128 - sum: 654306 -labels: - device: cpu - max: 1 - mean: '7.188e-01' - min: 0 - shape: - - 32 - sum: 23 -token_type_ids: - device: cpu - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_validate.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_validate.yaml deleted file mode 100644 index ef5d1104..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/hf_text_algorithm_no_op_validate.yaml +++ /dev/null @@ -1,35 +0,0 @@ -attention_mask: - device: cpu - max: 1 - mean: '9.277e-02' - min: 0 - shape: - - 32 - - 128 - sum: 380 -input_ids: - device: cpu - max: 29043 - mean: '1.362e+02' - min: 0 - shape: - - 32 - - 128 - sum: 557879 -labels: - device: cpu - max: 1 - mean: '7.5e-01' - min: 0 - shape: - - 32 - sum: 24 -token_type_ids: - device: cpu - max: 0 - mean: '0.e+00' - min: 0 - shape: - - 32 - - 128 - sum: 0 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_test.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_test.yaml deleted file mode 100644 index 8e49803a..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_test.yaml +++ /dev/null @@ -1,19 +0,0 @@ -'0': - device: cpu - max: '1.e+00' - mean: '4.611e-01' - min: '0.e+00' - shape: - - 64 - - 3 - - 32 - - 32 - sum: '9.065e+04' -'1': - device: cpu - max: 987 - mean: '5.432e+02' - min: 49 - shape: - - 64 - sum: 34767 diff --git a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_train.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_train.yaml deleted file mode 100644 index 214d5795..00000000 --- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_train.yaml +++ /dev/null @@ -1,19 +0,0 @@ -'0': - device: cpu - max: '2.640e+00' - mean: '3.701e-03' - min: '-2.118e+00' - shape: - - 64 - - 3 - - 32 - - 32 - sum: '7.277e+02' -'1': - device: cpu - max: 993 - mean: '4.871e+02' - min: 1 - shape: - - 64 - sum: 31176 diff --git 
a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_validate.yaml b/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_validate.yaml
deleted file mode 100644
index 2cf23250..00000000
--- a/.regression_files/project/datamodules/datamodules_test/test_first_batch/imagenet32_algorithm_no_op_validate.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-'0':
-  device: cpu
-  max: '1.e+00'
-  mean: '4.266e-01'
-  min: '0.e+00'
-  shape:
-  - 64
-  - 3
-  - 32
-  - 32
-  sum: '8.388e+04'
-'1':
-  device: cpu
-  max: 973
-  mean: '4.845e+02'
-  min: 21
-  shape:
-  - 64
-  sum: 31006
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index 5dba41f0..a65eb75e 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -7,11 +7,11 @@
 * [Thorough automated testing on SLURM clusters](features/testing.md)
 * features/*.md
 * [Examples 🧪](examples/index.md)
-    * [Image Classification (⚡)](examples/torch_sl_example.md)
-    * [Image Classification (jax+⚡)](examples/jax_sl_example.md)
+    * [Image Classification (⚡)](examples/image_classification.md)
+    * [Image Classification (jax+⚡)](examples/jax_image_classification.md)
     * [Text Classification (🤗+⚡)](examples/text_classification.md)
     * [Fine-tuning an LLM (🤗+⚡)](examples/llm_finetuning.md)
-    * [RL (jax)](examples/jax_rl_example.md)
+    * [Reinforcement Learning (jax)](examples/jax_rl.md)
     * [Running sweeps](examples/sweeps.md)
     * [Profiling your code📎](examples/profiling.md)
 * examples/*.md
diff --git a/docs/examples/image_classification.md b/docs/examples/image_classification.md
new file mode 100644
index 00000000..b8f83160
--- /dev/null
+++ b/docs/examples/image_classification.md
@@ -0,0 +1,29 @@
+---
+additional_python_references:
+  - project.algorithms.image_classifier
+  - lightning.pytorch.core.module
+---
+
+# Supervised Learning (PyTorch)
+
+
+## ImageClassifier
+
+The `ImageClassifier` is a simple `LightningModule` for image classification.
+It accepts a vision datamodule as input.
+
+??? note "Click to show the code of the ImageClassifier class."
+    {{ inline('project.algorithms.image_classifier.ImageClassifier', 4) }}
+
+## Running the example
+
+Here is a configuration file that you can use to launch a simple experiment:
+
+??? note "Click to show the yaml config file"
+    {{ inline('project/configs/experiment/example.yaml', 4) }}
+
+You can use it like so:
+
+```console
+python project/main.py experiment=example
+```
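As a rough illustration of the kind of `LightningModule` the new page describes: the sketch below is **not** the template's actual `ImageClassifier` (that code is pulled in by the `{{ inline(...) }}` macro above); the class name, toy network and hyperparameters are invented for the example.

```python
import torch
from torch import nn
from lightning.pytorch import LightningModule


class TinyImageClassifier(LightningModule):
    """Minimal sketch of an image-classification LightningModule (illustrative only)."""

    def __init__(self, num_classes: int = 10, lr: float = 1e-3):
        super().__init__()
        self.lr = lr
        # Toy network; the real example takes a configurable network/datamodule as input.
        self.network = nn.Sequential(
            nn.Flatten(), nn.LazyLinear(128), nn.ReLU(), nn.LazyLinear(num_classes)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.network(x)

    def training_step(self, batch, batch_idx: int):
        x, y = batch  # a vision datamodule yields (image, label) batches
        loss = nn.functional.cross_entropy(self(x), y)
        self.log("train/loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)
```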
diff --git a/docs/examples/index.md b/docs/examples/index.md
index 4278e2a4..91600c14 100644
--- a/docs/examples/index.md
+++ b/docs/examples/index.md
@@ -1,9 +1,9 @@
 ---
 additional_python_references:
-  - project.algorithms.jax_rl_example
-  - project.algorithms.example
-  - project.algorithms.jax_example
-  - project.algorithms.text_classification_example
+  - project.algorithms.jax_ppo
+  - project.algorithms.image_classifier
+  - project.algorithms.jax_image_classifier
+  - project.algorithms.text_classifier
   - project.algorithms.llm_finetuning
   - project.trainers.jax_trainer
 ---
@@ -12,10 +12,10 @@ additional_python_references:
 This template includes examples that use either Jax, PyTorch, or both!
 
-| Example link | Research Area | Reference link | Frameworks |
-| --------------------------------------------------- | ------------------------------------------ | --------------------------- | --------------- |
-| [ExampleAlgorithm](torch_sl_example.md) | Supervised Learning (image classification) | `ExampleAlgorithm` | Torch + ⚡ |
-| [JaxExample](jax_sl_example.md) | Supervised Learning (image classification) | `JaxExample` | Torch + Jax + ⚡ |
-| [TextClassificationExample](text_classification.md) | NLP (text classification) | `TextClassificationExample` | Torch + 🤗 + ⚡ |
-| [JaxRLExample](jax_rl_example.md) | RL | `JaxRLExample` | Jax |
-| [LLMFinetuningExample](llm_finetuning.md) | NLP (Causal language modeling) | `LLMFineTuningExample` | Torch + 🤗 + ⚡ |
+| Example link | Research Area | Reference link | Frameworks |
+| --------------------------------------------------------- | ------------------------------------------ | ---------------------- | --------------- |
+| [Image Classification](image_classification.md) | Supervised Learning (image classification) | `ImageClassifier` | Torch + ⚡ |
+| [Image Classification (Jax)](jax_image_classification.md) | Supervised Learning (image classification) | `JaxImageClassifier` | Torch + Jax + ⚡ |
+| [Text Classification](text_classification.md) | NLP (text classification) | `TextClassifier` | Torch + 🤗 + ⚡ |
+| [Reinforcement Learning (Jax)](jax_rl.md) | RL | `JaxRLExample` | Jax |
+| [LLM Fine-tuning](llm_finetuning.md) | NLP (Causal language modeling) | `LLMFineTuningExample` | Torch + 🤗 + ⚡ |
 
diff --git a/docs/examples/jax_sl_example.md b/docs/examples/jax_image_classification.md
similarity index 64%
rename from docs/examples/jax_sl_example.md
rename to docs/examples/jax_image_classification.md
index 1491f7b3..ee1ddc99 100644
--- a/docs/examples/jax_sl_example.md
+++ b/docs/examples/jax_image_classification.md
@@ -1,8 +1,14 @@
+---
+additional_python_references:
+  - project.algorithms.jax_image_classifier
+  - project.trainers.jax_trainer
+---
+
 # Jax + PyTorch-Lightning ⚡
 
-## `JaxExample`: a LightningModule that trains a Jax network
+## A LightningModule that trains a Jax network
 
-The [JaxExample][project.algorithms.jax_example.JaxExample] algorithm uses a network which is a [flax.linen.Module](https://flax.readthedocs.io/en/latest/).
+The `JaxImageClassifier` algorithm uses a network which is a [flax.linen.Module](https://flax.readthedocs.io/en/latest/).
 The network is wrapped with `torch_jax_interop.JaxFunction`, so that it can accept torch tensors as inputs, produce torch tensors as outputs, and the parameters are saved as `torch.nn.Parameter`s (which use the same underlying memory as the jax arrays).
 
 In this example, the loss function and optimizers are in PyTorch, while the network forward and backward passes are written in Jax.
@@ -16,24 +22,24 @@ pass uses Jax to calculate the gradients, and the weights are updated by a PyTor
 
 !!! question "What about end-to-end training in Jax?"
 
-    See the [Jax RL Example](../examples/jax_rl_example.md)! :smile:
+    See the [Jax RL Example](../examples/jax_rl.md)! :smile:
 
 ### Jax Network
 
-{{ inline('project.algorithms.jax_example.CNN') }}
+{{ inline('project.algorithms.jax_image_classifier.JaxCNN') }}
 
 ### Jax Algorithm
 
-{{ inline('project.algorithms.jax_example.JaxExample') }}
+{{ inline('project.algorithms.jax_image_classifier.JaxImageClassifier') }}
 
 ### Configs
 
-#### JaxExample algorithm config
+#### LightningModule config
 
-{{ inline('project/configs/algorithm/jax_example.yaml') }}
+{{ inline('project/configs/algorithm/jax_image_classifier.yaml') }}
 
 ## Running the example
 
 ```console
-$ python project/main.py algorithm=jax_example network=jax_cnn datamodule=cifar10
+$ python project/main.py algorithm=jax_image_classifier network=jax_cnn datamodule=cifar10
 ```
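The "same underlying memory" point above is the key trick: torch and jax can hand buffers back and forth without copying. Here is a minimal sketch of that idea using plain DLPack, i.e. standard torch/jax APIs rather than the template's `torch_jax_interop.JaxFunction` wrapper (which additionally takes care of parameters and autograd):

```python
import jax.dlpack
import jax.numpy as jnp
import torch
import torch.utils.dlpack

# A torch tensor (the same idea applies on GPU, where avoiding copies matters most).
x_torch = torch.arange(4, dtype=torch.float32)

# View the torch tensor as a jax array via DLPack, without copying the buffer.
x_jax = jax.dlpack.from_dlpack(torch.utils.dlpack.to_dlpack(x_torch))

# Compute in jax...
y_jax = jnp.square(x_jax)

# ...and view the result back as a torch tensor.
y_torch = torch.utils.dlpack.from_dlpack(jax.dlpack.to_dlpack(y_jax))
print(y_torch)  # tensor([0., 1., 4., 9.])
```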
diff --git a/docs/examples/jax_rl_example.md b/docs/examples/jax_rl.md
similarity index 92%
rename from docs/examples/jax_rl_example.md
rename to docs/examples/jax_rl.md
index e41e6269..ac20b0d5 100644
--- a/docs/examples/jax_rl_example.md
+++ b/docs/examples/jax_rl.md
@@ -1,6 +1,6 @@
 ---
 additional_python_references:
-  - project.algorithms.jax_rl_example
+  - project.algorithms.jax_ppo
   - project.trainers.jax_trainer
 ---
 
@@ -31,7 +31,7 @@ It follows the structure of a `JaxModule`, and is trained with a `JaxTrainer`.
 
 ??? note "Click to show the code for JaxRLExample"
 
-    {{ inline('project.algorithms.jax_rl_example.JaxRLExample', 4) }}
+    {{ inline('project.algorithms.jax_ppo.JaxRLExample', 4) }}
 
 ## JaxModule
 
diff --git a/docs/examples/llm_finetuning.md b/docs/examples/llm_finetuning.md
index 0a3d07de..908a7eb7 100644
--- a/docs/examples/llm_finetuning.md
+++ b/docs/examples/llm_finetuning.md
@@ -7,6 +7,7 @@ additional_python_references:
 
 This example is based on [this language modeling example from the HuggingFace transformers documentation](https://huggingface.co/docs/transformers/en/tasks/language_modeling).
 To better understand what's going on in this example, it is a good idea to read through these tutorials first:
+
 * [Causal language modeling simple example - HuggingFace docs](https://huggingface.co/docs/transformers/en/tasks/language_modeling)
 * [Fine-tune a language model - Colab Notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb#scrollTo=X6HrpprwIrIz)
 
diff --git a/docs/examples/text_classification.md b/docs/examples/text_classification.md
index 68122bc5..1ebc1c00 100644
--- a/docs/examples/text_classification.md
+++ b/docs/examples/text_classification.md
@@ -1,22 +1,28 @@
-# Text Classification ( + 🤗)
+---
+additional_python_references:
+  - project.algorithms.text_classifier
+  - project.datamodules.text.text_classification
+---
+
+# Text Classification (⚡ + 🤗)
 
 ## Overview
 
-The [TextClassificationExample][project.algorithms.text_classification_example.TextClassificationExample] is a [LightningModule][lightning.pytorch.core.module.LightningModule] for a simple text classification task.
+The `TextClassifier` is a [LightningModule][lightning.pytorch.core.module.LightningModule] for a simple text classification task.
 
-It accepts a [TextClassificationDataModule][project.datamodules.text.TextClassificationDataModule] as input, along with a network.
+It accepts a `TextClassificationDataModule` as input, along with a network.
 
-??? note "Click to show the code for HFExample"
-    {{ inline('project.algorithms.text_classification_example.TextClassificationExample', 4) }}
+??? note "Click to show the code of the LightningModule"
+    {{ inline('project.algorithms.text_classifier.TextClassifier', 4) }}
 
 ## Config files
 
 ### Algorithm config
 
note "Click to show the Algorithm config" - Source: project/configs/algorithm/text_classification_example.yaml + Source: project/configs/algorithm/text_classifier.yaml - {{ inline('project/configs/algorithm/text_classification_example.yaml', 4) }} + {{ inline('project/configs/algorithm/text_classifier.yaml', 4) }} ### Datamodule config diff --git a/docs/examples/torch_sl_example.md b/docs/examples/torch_sl_example.md deleted file mode 100644 index 842b8cc9..00000000 --- a/docs/examples/torch_sl_example.md +++ /dev/null @@ -1,17 +0,0 @@ -# Supervised Learning (PyTorch) - -The [ExampleAlgorithm][project.algorithms.ExampleAlgorithm] is a simple [LightningModule][lightning.pytorch.core.module.LightningModule] for image classification. - -??? note "Click to show the code for ExampleAlgorithm" - {{ inline('project.algorithms.example.ExampleAlgorithm', 4) }} - -Here is a configuration file that you can use to launch a simple experiment: - -??? note "Click to show the yaml config file" - {{ inline('project/configs/experiment/example.yaml', 4) }} - -You can use it like so: - -```console -python project/main.py experiment=example -``` diff --git a/docs/features/jax.md b/docs/features/jax.md index e54d4b19..41c67fd3 100644 --- a/docs/features/jax.md +++ b/docs/features/jax.md @@ -1,9 +1,9 @@ --- additional_python_references: - - project.algorithms.jax_rl_example - - project.algorithms.example - - project.algorithms.jax_example - - project.algorithms.text_classification_example + - project.algorithms.jax_ppo + - project.algorithms.image_classifier + - project.algorithms.jax_image_classifier + - project.algorithms.text_classifier - project.trainers.jax_trainer --- @@ -12,18 +12,10 @@ additional_python_references: > 🔥 NOTE: This is a feature that is entirely unique to this template! 🔥 This template includes examples that use either Jax, PyTorch, or both! +There's a table describing each example [here](../examples/index.md#examples). - -| Example link | Reference | Framework | Lightning? | -| --------------------------------------------------------------- | --------------------------- | ----------- | ------------ | -| [ExampleAlgorithm](../examples/jax_sl_example.md) | `ExampleAlgorithm` | Torch | yes | -| [JaxExample](../examples/jax_sl_example.md) | `JaxExample` | Torch + Jax | yes | -| [TextClassificationExample](../examples/text_classification.md) | `TextClassificationExample` | Torch + 🤗 | yes | -| [JaxRLExample](../examples/jax_rl_example.md) | `JaxRLExample` | Jax | no (almost!) | - - -In fact, here you can mix and match both Jax and Torch code. For example, you can use Jax for your dataloading, your network, or the learning algorithm, all while still benefiting from the nice stuff that comes from using PyTorch-Lightning. +You can mix and match both Jax and Torch code. For example, you can use Jax for your dataloading, your network, or the learning algorithm, all while still benefiting from the nice stuff that comes from using PyTorch-Lightning. ??? note "**How does this work?**" Well, we use [torch-jax-interop](https://www.github.com/lebrice/torch_jax_interop), another package developed here at Mila 😎, that allows easy interop between torch and jax code. Feel free to take a look at it if you'd like to use it as part of your own project. 😁 @@ -40,12 +32,12 @@ training loop as usual, you can! The [lightning.Trainer][lightning.pytorch.trainer.trainer.Trainer] will not be able to tell that you're using Jax! 
-**Take a look at [this image classification example that uses a Jax network](../examples/jax_sl_example.md).**
+**Take a look at [this image classification example that uses a Jax network](../examples/jax_image_classification.md).**
 
 ## End-to-end training in Jax: the `JaxTrainer`
 
-The `JaxTrainer`, used in the [Jax RL Example](../examples/jax_rl_example.md), follows a similar structure as the lightning Trainer. However, instead of training LightningModules, it trains `JaxModule`s, which are a simplified, jax-based look-alike of `lightning.LightningModule`s.
+The `JaxTrainer`, used in the [Jax RL Example](../examples/jax_rl.md), follows a similar structure as the lightning Trainer. However, instead of training LightningModules, it trains `JaxModule`s, which are a simplified, jax-based look-alike of `lightning.LightningModule`s.
 
 The "algorithm" needs to match the `JaxModule` protocol:
diff --git a/docs/features/testing.md b/docs/features/testing.md
index 8e621fd1..e9ea31f2 100644
--- a/docs/features/testing.md
+++ b/docs/features/testing.md
@@ -55,7 +55,7 @@ The built-in tests cover the following:
 - forward pass is deterministic & reproducible;
 - backward pass is deterministic & reproducible;
 
-Take a look at [project.algorithms.testsuites.algorithm_tests][] to see the included base tests for algorithms.
+Take a look at [project.algorithms.testsuites.lightning_module_tests][] to see the included base tests for algorithms.
 
 If you use [Visual Studio Code](https://code.visualstudio.com/), you may want to look into adding the "test explorer" tab to your editor. Then, you'll be able to see and debug the tests using the GUI.
@@ -93,7 +93,7 @@ pytest -x -v --slow
 
 ## Continuous Integration
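The regression files added and removed throughout this diff all record the same kind of per-tensor summary (device, max, mean, min, shape, sum). A rough sketch of how such a summary can be computed is shown below; the function name and rounding are illustrative and do not reflect the template's actual test fixtures.

```python
import torch


def tensor_summary(t: torch.Tensor) -> dict:
    """Per-tensor statistics in the style of the .regression_files YAML entries above."""
    return {
        "device": str(t.device),
        "shape": list(t.shape),
        "max": round(t.max().item(), 4),
        "min": round(t.min().item(), 4),
        "mean": round(t.float().mean().item(), 4),
        "sum": round(t.float().sum().item(), 4),
    }


# A reproducibility test can then compare this dict against a previously saved
# YAML file instead of storing the full tensor.
summary = tensor_summary(torch.randn(128, 10))
```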