diff --git a/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml new file mode 100644 index 00000000..e1932620 --- /dev/null +++ b/.regression_files/project/algorithms/llm_finetuning_test/test_backward_pass_is_reproducible/llm_finetuning.yaml @@ -0,0 +1,3286 @@ +batch.attention_mask: + device: cuda:0 + max: 1 + mean: '1.e+00' + min: 1 + shape: + - 8 + - 256 + sum: 2048 +batch.input_ids: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +batch.labels: + device: cuda:0 + max: 50118 + mean: '5.447e+03' + min: 2 + shape: + - 8 + - 256 + sum: 11154886 +grads.network.model.decoder.embed_positions.weight: + device: cuda:0 + max: '2.549e-02' + mean: '2.795e-07' + min: '-2.530e-02' + shape: + - 2050 + - 1024 + sum: '5.867e-01' +grads.network.model.decoder.embed_tokens.weight: + device: cuda:0 + max: '7.65e-01' + mean: '-2.928e-07' + min: '-9.832e-01' + shape: + - 50272 + - 512 + sum: '-7.537e+00' +grads.network.model.decoder.layers.0.fc1.bias: + device: cuda:0 + max: '2.624e-03' + mean: '-2.445e-06' + min: '-8.882e-03' + shape: + - 4096 + sum: '-1.001e-02' +grads.network.model.decoder.layers.0.fc1.weight: + device: cuda:0 + max: '8.724e-02' + mean: '4.963e-09' + min: '-1.222e-01' + shape: + - 4096 + - 1024 + sum: '2.082e-02' +grads.network.model.decoder.layers.0.fc2.bias: + device: cuda:0 + max: '1.031e-02' + mean: '7.276e-12' + min: '-1.265e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.0.fc2.weight: + device: cuda:0 + max: '1.836e-02' + mean: '0.e+00' + min: '-1.480e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.0.final_layer_norm.bias: + device: cuda:0 + max: '1.124e-02' + mean: '2.244e-06' + min: '-1.343e-02' + shape: + - 1024 + sum: '2.298e-03' +grads.network.model.decoder.layers.0.final_layer_norm.weight: + device: cuda:0 + max: '9.238e-03' + mean: '-1.765e-05' + min: '-5.406e-02' + shape: + - 1024 + sum: '-1.807e-02' +grads.network.model.decoder.layers.0.self_attn.k_proj.bias: + device: cuda:0 + max: '1.455e-10' + mean: '1.036e-12' + min: '-1.673e-10' + shape: + - 1024 + sum: '1.061e-09' +grads.network.model.decoder.layers.0.self_attn.k_proj.weight: + device: cuda:0 + max: '1.895e-04' + mean: '6.07e-11' + min: '-1.679e-04' + shape: + - 1024 + - 1024 + sum: '6.365e-05' +grads.network.model.decoder.layers.0.self_attn.out_proj.bias: + device: cuda:0 + max: '2.459e-01' + mean: '-8.149e-10' + min: '-2.594e-01' + shape: + - 1024 + sum: '-8.345e-07' +grads.network.model.decoder.layers.0.self_attn.out_proj.weight: + device: cuda:0 + max: '7.433e-03' + mean: '1.705e-13' + min: '-7.011e-03' + shape: + - 1024 + - 1024 + sum: '1.788e-07' +grads.network.model.decoder.layers.0.self_attn.q_proj.bias: + device: cuda:0 + max: '4.872e-04' + mean: '3.458e-07' + min: '-5.13e-04' + shape: + - 1024 + sum: '3.541e-04' +grads.network.model.decoder.layers.0.self_attn.q_proj.weight: + device: cuda:0 + max: '3.873e-04' + mean: '3.472e-09' + min: '-4.093e-04' + shape: + - 1024 + - 1024 + sum: '3.641e-03' +grads.network.model.decoder.layers.0.self_attn.v_proj.bias: + device: cuda:0 + max: '1.222e-01' + mean: '5.112e-04' + min: '-1.374e-01' + shape: + - 1024 + sum: '5.235e-01' +grads.network.model.decoder.layers.0.self_attn.v_proj.weight: + device: cuda:0 + max: '7.942e-02' + mean: '3.069e-07' + min: '-7.008e-02' + shape: + - 1024 + - 1024 + sum: '3.218e-01' +grads.network.model.decoder.layers.0.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.182e-02' + mean: '-1.809e-05' + min: '-1.26e-02' + shape: + - 1024 + sum: '-1.852e-02' +grads.network.model.decoder.layers.0.self_attn_layer_norm.weight: + device: cuda:0 + max: '9.642e-03' + mean: '-9.916e-07' + min: '-4.965e-02' + shape: + - 1024 + sum: '-1.015e-03' +grads.network.model.decoder.layers.1.fc1.bias: + device: cuda:0 + max: '5.562e-03' + mean: '-1.470e-06' + min: '-7.369e-03' + shape: + - 4096 + sum: '-6.023e-03' +grads.network.model.decoder.layers.1.fc1.weight: + device: cuda:0 + max: '6.877e-02' + mean: '2.984e-09' + min: '-9.409e-02' + shape: + - 4096 + - 1024 + sum: '1.251e-02' +grads.network.model.decoder.layers.1.fc2.bias: + device: cuda:0 + max: '1.038e-02' + mean: '1.819e-11' + min: '-1.155e-02' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.1.fc2.weight: + device: cuda:0 + max: '1.431e-02' + mean: '2.558e-13' + min: '-1.138e-02' + shape: + - 1024 + - 4096 + sum: '1.073e-06' +grads.network.model.decoder.layers.1.final_layer_norm.bias: + device: cuda:0 + max: '1.17e-02' + mean: '-9.708e-05' + min: '-1.293e-02' + shape: + - 1024 + sum: '-9.941e-02' +grads.network.model.decoder.layers.1.final_layer_norm.weight: + device: cuda:0 + max: '1.304e-02' + mean: '1.814e-05' + min: '-3.518e-02' + shape: + - 1024 + sum: '1.858e-02' +grads.network.model.decoder.layers.1.self_attn.k_proj.bias: + device: cuda:0 + max: '6.403e-10' + mean: '6.279e-13' + min: '-1.397e-09' + shape: + - 1024 + sum: '6.430e-10' +grads.network.model.decoder.layers.1.self_attn.k_proj.weight: + device: cuda:0 + max: '3.312e-02' + mean: '3.22e-15' + min: '-3.174e-02' + shape: + - 1024 + - 1024 + sum: '3.376e-09' +grads.network.model.decoder.layers.1.self_attn.out_proj.bias: + device: cuda:0 + max: '9.799e-03' + mean: '2.183e-11' + min: '-1.048e-02' + shape: + - 1024 + sum: '2.235e-08' +grads.network.model.decoder.layers.1.self_attn.out_proj.weight: + device: cuda:0 + max: '1.020e-02' + mean: '-1.705e-13' + min: '-1.033e-02' + shape: + - 1024 + - 1024 + sum: '-1.788e-07' +grads.network.model.decoder.layers.1.self_attn.q_proj.bias: + device: cuda:0 + max: '1.236e-03' + mean: '-3.821e-06' + min: '-2.06e-03' + shape: + - 1024 + sum: '-3.913e-03' +grads.network.model.decoder.layers.1.self_attn.q_proj.weight: + device: cuda:0 + max: '1.833e-02' + mean: '-2.680e-08' + min: '-1.194e-02' + shape: + - 1024 + - 1024 + sum: '-2.811e-02' +grads.network.model.decoder.layers.1.self_attn.v_proj.bias: + device: cuda:0 + max: '1.296e-02' + mean: '1.047e-04' + min: '-9.251e-03' + shape: + - 1024 + sum: '1.072e-01' +grads.network.model.decoder.layers.1.self_attn.v_proj.weight: + device: cuda:0 + max: '2.234e-01' + mean: '7.347e-07' + min: '-1.650e-01' + shape: + - 1024 + - 1024 + sum: '7.704e-01' +grads.network.model.decoder.layers.1.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.000e-02' + mean: '-4.235e-05' + min: '-1.078e-02' + shape: + - 1024 + sum: '-4.337e-02' +grads.network.model.decoder.layers.1.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.163e-02' + mean: '5.549e-06' + min: '-3.955e-02' + shape: + - 1024 + sum: '5.682e-03' +grads.network.model.decoder.layers.10.fc1.bias: + device: cuda:0 + max: '1.167e-02' + mean: '-1.093e-05' + min: '-4.407e-03' + shape: + - 4096 + sum: '-4.475e-02' +grads.network.model.decoder.layers.10.fc1.weight: + device: cuda:0 + max: '1.255e-01' + mean: '-1.298e-08' + min: '-2.335e-01' + shape: + - 4096 + - 1024 + sum: '-5.445e-02' +grads.network.model.decoder.layers.10.fc2.bias: + device: cuda:0 + max: '9.324e-03' + mean: '3.638e-12' + min: '-9.376e-03' + shape: + - 1024 + sum: '3.725e-09' +grads.network.model.decoder.layers.10.fc2.weight: + device: cuda:0 + max: '1.888e-02' + mean: '1.137e-13' + min: '-1.95e-02' + shape: + - 1024 + - 4096 + sum: '4.768e-07' +grads.network.model.decoder.layers.10.final_layer_norm.bias: + device: cuda:0 + max: '1.063e-02' + mean: '1.763e-04' + min: '-1.049e-02' + shape: + - 1024 + sum: '1.805e-01' +grads.network.model.decoder.layers.10.final_layer_norm.weight: + device: cuda:0 + max: '1.245e-02' + mean: '1.566e-05' + min: '-1.95e-02' + shape: + - 1024 + sum: '1.604e-02' +grads.network.model.decoder.layers.10.self_attn.k_proj.bias: + device: cuda:0 + max: '1.863e-09' + mean: '-8.787e-12' + min: '-1.164e-09' + shape: + - 1024 + sum: '-8.998e-09' +grads.network.model.decoder.layers.10.self_attn.k_proj.weight: + device: cuda:0 + max: '1.065e-01' + mean: '1.164e-13' + min: '-1.330e-01' + shape: + - 1024 + - 1024 + sum: '1.220e-07' +grads.network.model.decoder.layers.10.self_attn.out_proj.bias: + device: cuda:0 + max: '8.365e-03' + mean: '1.819e-11' + min: '-8.918e-03' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.10.self_attn.out_proj.weight: + device: cuda:0 + max: '7.876e-03' + mean: '3.126e-13' + min: '-7.644e-03' + shape: + - 1024 + - 1024 + sum: '3.278e-07' +grads.network.model.decoder.layers.10.self_attn.q_proj.bias: + device: cuda:0 + max: '3.907e-03' + mean: '-1.607e-05' + min: '-4.692e-03' + shape: + - 1024 + sum: '-1.645e-02' +grads.network.model.decoder.layers.10.self_attn.q_proj.weight: + device: cuda:0 + max: '3.358e-02' + mean: '1.291e-07' + min: '-4.45e-02' + shape: + - 1024 + - 1024 + sum: '1.354e-01' +grads.network.model.decoder.layers.10.self_attn.v_proj.bias: + device: cuda:0 + max: '9.312e-03' + mean: '-8.616e-05' + min: '-9.148e-03' + shape: + - 1024 + sum: '-8.822e-02' +grads.network.model.decoder.layers.10.self_attn.v_proj.weight: + device: cuda:0 + max: '2.466e-01' + mean: '6.922e-07' + min: '-2.438e-01' + shape: + - 1024 + - 1024 + sum: '7.259e-01' +grads.network.model.decoder.layers.10.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.563e-03' + mean: '-2.205e-05' + min: '-9.231e-03' + shape: + - 1024 + sum: '-2.258e-02' +grads.network.model.decoder.layers.10.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.004e-02' + mean: '8.82e-06' + min: '-2.064e-02' + shape: + - 1024 + sum: '9.032e-03' +grads.network.model.decoder.layers.11.fc1.bias: + device: cuda:0 + max: '4.537e-03' + mean: '-1.97e-05' + min: '-1.077e-02' + shape: + - 4096 + sum: '-8.069e-02' +grads.network.model.decoder.layers.11.fc1.weight: + device: cuda:0 + max: '1.921e-01' + mean: '-8.097e-08' + min: '-1.258e-01' + shape: + - 4096 + - 1024 + sum: '-3.396e-01' +grads.network.model.decoder.layers.11.fc2.bias: + device: cuda:0 + max: '9.747e-03' + mean: '0.e+00' + min: '-1.146e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.11.fc2.weight: + device: cuda:0 + max: '2.297e-02' + mean: '-2.274e-13' + min: '-2.611e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.11.final_layer_norm.bias: + device: cuda:0 + max: '1.074e-02' + mean: '-1.697e-04' + min: '-1.309e-02' + shape: + - 1024 + sum: '-1.738e-01' +grads.network.model.decoder.layers.11.final_layer_norm.weight: + device: cuda:0 + max: '4.611e-02' + mean: '-1.405e-05' + min: '-1.679e-02' + shape: + - 1024 + sum: '-1.439e-02' +grads.network.model.decoder.layers.11.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '3.897e-12' + min: '-5.239e-10' + shape: + - 1024 + sum: '3.990e-09' +grads.network.model.decoder.layers.11.self_attn.k_proj.weight: + device: cuda:0 + max: '3.695e-02' + mean: '-2.855e-13' + min: '-3.176e-02' + shape: + - 1024 + - 1024 + sum: '-2.994e-07' +grads.network.model.decoder.layers.11.self_attn.out_proj.bias: + device: cuda:0 + max: '1.050e-02' + mean: '1.819e-12' + min: '-1.04e-02' + shape: + - 1024 + sum: '1.863e-09' +grads.network.model.decoder.layers.11.self_attn.out_proj.weight: + device: cuda:0 + max: '4.005e-03' + mean: '-4.619e-14' + min: '-3.44e-03' + shape: + - 1024 + - 1024 + sum: '-4.843e-08' +grads.network.model.decoder.layers.11.self_attn.q_proj.bias: + device: cuda:0 + max: '1.21e-03' + mean: '-1.349e-05' + min: '-2.133e-03' + shape: + - 1024 + sum: '-1.382e-02' +grads.network.model.decoder.layers.11.self_attn.q_proj.weight: + device: cuda:0 + max: '2.495e-02' + mean: '1.265e-07' + min: '-2.483e-02' + shape: + - 1024 + - 1024 + sum: '1.326e-01' +grads.network.model.decoder.layers.11.self_attn.v_proj.bias: + device: cuda:0 + max: '9.094e-03' + mean: '-1.657e-05' + min: '-1.120e-02' + shape: + - 1024 + sum: '-1.697e-02' +grads.network.model.decoder.layers.11.self_attn.v_proj.weight: + device: cuda:0 + max: '2.806e-01' + mean: '1.554e-07' + min: '-2.307e-01' + shape: + - 1024 + - 1024 + sum: '1.629e-01' +grads.network.model.decoder.layers.11.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.090e-02' + mean: '4.103e-05' + min: '-1.074e-02' + shape: + - 1024 + sum: '4.202e-02' +grads.network.model.decoder.layers.11.self_attn_layer_norm.weight: + device: cuda:0 + max: '9.913e-03' + mean: '8.734e-06' + min: '-2.563e-02' + shape: + - 1024 + sum: '8.943e-03' +grads.network.model.decoder.layers.12.fc1.bias: + device: cuda:0 + max: '4.174e-03' + mean: '-9.494e-06' + min: '-5.266e-03' + shape: + - 4096 + sum: '-3.889e-02' +grads.network.model.decoder.layers.12.fc1.weight: + device: cuda:0 + max: '1.308e-01' + mean: '-4.169e-08' + min: '-1.225e-01' + shape: + - 4096 + - 1024 + sum: '-1.749e-01' +grads.network.model.decoder.layers.12.fc2.bias: + device: cuda:0 + max: '9.381e-03' + mean: '0.e+00' + min: '-9.925e-03' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.12.fc2.weight: + device: cuda:0 + max: '1.477e-02' + mean: '-1.137e-13' + min: '-1.799e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.12.final_layer_norm.bias: + device: cuda:0 + max: '1.085e-02' + mean: '-6.289e-05' + min: '-1.164e-02' + shape: + - 1024 + sum: '-6.440e-02' +grads.network.model.decoder.layers.12.final_layer_norm.weight: + device: cuda:0 + max: '2.347e-02' + mean: '1.717e-05' + min: '-3.135e-02' + shape: + - 1024 + sum: '1.758e-02' +grads.network.model.decoder.layers.12.self_attn.k_proj.bias: + device: cuda:0 + max: '6.694e-10' + mean: '8.309e-13' + min: '-4.948e-10' + shape: + - 1024 + sum: '8.508e-10' +grads.network.model.decoder.layers.12.self_attn.k_proj.weight: + device: cuda:0 + max: '7.397e-02' + mean: '-2.175e-13' + min: '-9.768e-02' + shape: + - 1024 + - 1024 + sum: '-2.281e-07' +grads.network.model.decoder.layers.12.self_attn.out_proj.bias: + device: cuda:0 + max: '9.249e-03' + mean: '-7.276e-12' + min: '-9.731e-03' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.12.self_attn.out_proj.weight: + device: cuda:0 + max: '4.412e-03' + mean: '1.421e-13' + min: '-4.588e-03' + shape: + - 1024 + - 1024 + sum: '1.490e-07' +grads.network.model.decoder.layers.12.self_attn.q_proj.bias: + device: cuda:0 + max: '3.407e-03' + mean: '2.445e-05' + min: '-1.779e-03' + shape: + - 1024 + sum: '2.504e-02' +grads.network.model.decoder.layers.12.self_attn.q_proj.weight: + device: cuda:0 + max: '4.225e-02' + mean: '-3.557e-07' + min: '-4.189e-02' + shape: + - 1024 + - 1024 + sum: '-3.729e-01' +grads.network.model.decoder.layers.12.self_attn.v_proj.bias: + device: cuda:0 + max: '8.426e-03' + mean: '2.616e-05' + min: '-1.041e-02' + shape: + - 1024 + sum: '2.679e-02' +grads.network.model.decoder.layers.12.self_attn.v_proj.weight: + device: cuda:0 + max: '2.573e-01' + mean: '-3.806e-07' + min: '-2.223e-01' + shape: + - 1024 + - 1024 + sum: '-3.990e-01' +grads.network.model.decoder.layers.12.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.540e-03' + mean: '1.539e-05' + min: '-1.009e-02' + shape: + - 1024 + sum: '1.576e-02' +grads.network.model.decoder.layers.12.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.112e-02' + mean: '6.956e-06' + min: '-3.292e-02' + shape: + - 1024 + sum: '7.123e-03' +grads.network.model.decoder.layers.13.fc1.bias: + device: cuda:0 + max: '4.255e-03' + mean: '-6.284e-06' + min: '-3.659e-03' + shape: + - 4096 + sum: '-2.574e-02' +grads.network.model.decoder.layers.13.fc1.weight: + device: cuda:0 + max: '9.864e-02' + mean: '-1.925e-08' + min: '-8.668e-02' + shape: + - 4096 + - 1024 + sum: '-8.074e-02' +grads.network.model.decoder.layers.13.fc2.bias: + device: cuda:0 + max: '8.901e-03' + mean: '-9.095e-12' + min: '-9.272e-03' + shape: + - 1024 + sum: '-9.313e-09' +grads.network.model.decoder.layers.13.fc2.weight: + device: cuda:0 + max: '9.958e-03' + mean: '-1.137e-13' + min: '-1.159e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.13.final_layer_norm.bias: + device: cuda:0 + max: '1.098e-02' + mean: '1.136e-04' + min: '-1.088e-02' + shape: + - 1024 + sum: '1.163e-01' +grads.network.model.decoder.layers.13.final_layer_norm.weight: + device: cuda:0 + max: '3.056e-02' + mean: '2.505e-06' + min: '-2.49e-02' + shape: + - 1024 + sum: '2.565e-03' +grads.network.model.decoder.layers.13.self_attn.k_proj.bias: + device: cuda:0 + max: '3.056e-10' + mean: '-3.326e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '-3.406e-09' +grads.network.model.decoder.layers.13.self_attn.k_proj.weight: + device: cuda:0 + max: '3.654e-02' + mean: '2.432e-13' + min: '-4.357e-02' + shape: + - 1024 + - 1024 + sum: '2.551e-07' +grads.network.model.decoder.layers.13.self_attn.out_proj.bias: + device: cuda:0 + max: '7.424e-03' + mean: '-3.638e-12' + min: '-9.317e-03' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.13.self_attn.out_proj.weight: + device: cuda:0 + max: '3.228e-03' + mean: '7.105e-14' + min: '-2.774e-03' + shape: + - 1024 + - 1024 + sum: '7.451e-08' +grads.network.model.decoder.layers.13.self_attn.q_proj.bias: + device: cuda:0 + max: '2.412e-03' + mean: '1.546e-05' + min: '-1.678e-03' + shape: + - 1024 + sum: '1.583e-02' +grads.network.model.decoder.layers.13.self_attn.q_proj.weight: + device: cuda:0 + max: '1.646e-02' + mean: '-2.364e-07' + min: '-1.986e-02' + shape: + - 1024 + - 1024 + sum: '-2.479e-01' +grads.network.model.decoder.layers.13.self_attn.v_proj.bias: + device: cuda:0 + max: '9.358e-03' + mean: '-2.785e-05' + min: '-8.192e-03' + shape: + - 1024 + sum: '-2.851e-02' +grads.network.model.decoder.layers.13.self_attn.v_proj.weight: + device: cuda:0 + max: '2.093e-01' + mean: '4.26e-07' + min: '-2.454e-01' + shape: + - 1024 + - 1024 + sum: '4.467e-01' +grads.network.model.decoder.layers.13.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.755e-03' + mean: '4.027e-05' + min: '-9.616e-03' + shape: + - 1024 + sum: '4.124e-02' +grads.network.model.decoder.layers.13.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.237e-02' + mean: '2.634e-06' + min: '-3.056e-02' + shape: + - 1024 + sum: '2.697e-03' +grads.network.model.decoder.layers.14.fc1.bias: + device: cuda:0 + max: '3.368e-03' + mean: '-4.94e-06' + min: '-4.024e-03' + shape: + - 4096 + sum: '-2.023e-02' +grads.network.model.decoder.layers.14.fc1.weight: + device: cuda:0 + max: '1.023e-01' + mean: '-4.683e-09' + min: '-8.753e-02' + shape: + - 4096 + - 1024 + sum: '-1.964e-02' +grads.network.model.decoder.layers.14.fc2.bias: + device: cuda:0 + max: '9.881e-03' + mean: '-2.183e-11' + min: '-9.016e-03' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.14.fc2.weight: + device: cuda:0 + max: '1.668e-02' + mean: '-1.592e-12' + min: '-1.498e-02' + shape: + - 1024 + - 4096 + sum: '-6.676e-06' +grads.network.model.decoder.layers.14.final_layer_norm.bias: + device: cuda:0 + max: '1.219e-02' + mean: '2.743e-05' + min: '-1.083e-02' + shape: + - 1024 + sum: '2.809e-02' +grads.network.model.decoder.layers.14.final_layer_norm.weight: + device: cuda:0 + max: '1.590e-02' + mean: '-4.36e-06' + min: '-3.127e-02' + shape: + - 1024 + sum: '-4.464e-03' +grads.network.model.decoder.layers.14.self_attn.k_proj.bias: + device: cuda:0 + max: '3.929e-10' + mean: '-2.173e-12' + min: '-3.056e-10' + shape: + - 1024 + sum: '-2.226e-09' +grads.network.model.decoder.layers.14.self_attn.k_proj.weight: + device: cuda:0 + max: '5.135e-02' + mean: '-5.795e-14' + min: '-4.326e-02' + shape: + - 1024 + - 1024 + sum: '-6.077e-08' +grads.network.model.decoder.layers.14.self_attn.out_proj.bias: + device: cuda:0 + max: '9.779e-03' + mean: '9.095e-12' + min: '-8.985e-03' + shape: + - 1024 + sum: '9.313e-09' +grads.network.model.decoder.layers.14.self_attn.out_proj.weight: + device: cuda:0 + max: '2.521e-03' + mean: '-2.842e-14' + min: '-2.492e-03' + shape: + - 1024 + - 1024 + sum: '-2.980e-08' +grads.network.model.decoder.layers.14.self_attn.q_proj.bias: + device: cuda:0 + max: '2.483e-03' + mean: '-2.104e-05' + min: '-4.766e-03' + shape: + - 1024 + sum: '-2.155e-02' +grads.network.model.decoder.layers.14.self_attn.q_proj.weight: + device: cuda:0 + max: '3.591e-02' + mean: '4.924e-07' + min: '-2.957e-02' + shape: + - 1024 + - 1024 + sum: '5.163e-01' +grads.network.model.decoder.layers.14.self_attn.v_proj.bias: + device: cuda:0 + max: '8.477e-03' + mean: '1.055e-04' + min: '-8.184e-03' + shape: + - 1024 + sum: '1.081e-01' +grads.network.model.decoder.layers.14.self_attn.v_proj.weight: + device: cuda:0 + max: '2.027e-01' + mean: '-2.47e-06' + min: '-2.218e-01' + shape: + - 1024 + - 1024 + sum: '-2.59e+00' +grads.network.model.decoder.layers.14.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.029e-02' + mean: '4.850e-05' + min: '-9.323e-03' + shape: + - 1024 + sum: '4.967e-02' +grads.network.model.decoder.layers.14.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.910e-02' + mean: '5.651e-06' + min: '-3.208e-02' + shape: + - 1024 + sum: '5.786e-03' +grads.network.model.decoder.layers.15.fc1.bias: + device: cuda:0 + max: '5.394e-03' + mean: '-1.012e-05' + min: '-6.176e-03' + shape: + - 4096 + sum: '-4.146e-02' +grads.network.model.decoder.layers.15.fc1.weight: + device: cuda:0 + max: '8.324e-02' + mean: '-1.046e-08' + min: '-1.047e-01' + shape: + - 4096 + - 1024 + sum: '-4.386e-02' +grads.network.model.decoder.layers.15.fc2.bias: + device: cuda:0 + max: '9.866e-03' + mean: '-7.276e-12' + min: '-1.172e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.15.fc2.weight: + device: cuda:0 + max: '1.37e-02' + mean: '-5.684e-13' + min: '-1.439e-02' + shape: + - 1024 + - 4096 + sum: '-2.384e-06' +grads.network.model.decoder.layers.15.final_layer_norm.bias: + device: cuda:0 + max: '1.231e-02' + mean: '-1.332e-04' + min: '-1.468e-02' + shape: + - 1024 + sum: '-1.364e-01' +grads.network.model.decoder.layers.15.final_layer_norm.weight: + device: cuda:0 + max: '3.634e-02' + mean: '1.128e-05' + min: '-3.444e-02' + shape: + - 1024 + sum: '1.155e-02' +grads.network.model.decoder.layers.15.self_attn.k_proj.bias: + device: cuda:0 + max: '1.164e-09' + mean: '3.457e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '3.54e-09' +grads.network.model.decoder.layers.15.self_attn.k_proj.weight: + device: cuda:0 + max: '3.154e-02' + mean: '4.652e-14' + min: '-2.124e-02' + shape: + - 1024 + - 1024 + sum: '4.878e-08' +grads.network.model.decoder.layers.15.self_attn.out_proj.bias: + device: cuda:0 + max: '9.871e-03' + mean: '-1.455e-11' + min: '-9.811e-03' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.15.self_attn.out_proj.weight: + device: cuda:0 + max: '4.353e-03' + mean: '1.421e-14' + min: '-4.717e-03' + shape: + - 1024 + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.15.self_attn.q_proj.bias: + device: cuda:0 + max: '1.886e-03' + mean: '2.190e-05' + min: '-2.335e-03' + shape: + - 1024 + sum: '2.243e-02' +grads.network.model.decoder.layers.15.self_attn.q_proj.weight: + device: cuda:0 + max: '2.037e-02' + mean: '-4.754e-07' + min: '-2.289e-02' + shape: + - 1024 + - 1024 + sum: '-4.985e-01' +grads.network.model.decoder.layers.15.self_attn.v_proj.bias: + device: cuda:0 + max: '7.805e-03' + mean: '-4.434e-05' + min: '-9.824e-03' + shape: + - 1024 + sum: '-4.541e-02' +grads.network.model.decoder.layers.15.self_attn.v_proj.weight: + device: cuda:0 + max: '1.984e-01' + mean: '9.627e-07' + min: '-1.703e-01' + shape: + - 1024 + - 1024 + sum: '1.009e+00' +grads.network.model.decoder.layers.15.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.079e-02' + mean: '1.138e-04' + min: '-1.047e-02' + shape: + - 1024 + sum: '1.165e-01' +grads.network.model.decoder.layers.15.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.985e-02' + mean: '-3.775e-06' + min: '-3.666e-02' + shape: + - 1024 + sum: '-3.866e-03' +grads.network.model.decoder.layers.16.fc1.bias: + device: cuda:0 + max: '4.077e-03' + mean: '2.515e-06' + min: '-4.591e-03' + shape: + - 4096 + sum: '1.030e-02' +grads.network.model.decoder.layers.16.fc1.weight: + device: cuda:0 + max: '1.095e-01' + mean: '2.903e-09' + min: '-1.061e-01' + shape: + - 4096 + - 1024 + sum: '1.218e-02' +grads.network.model.decoder.layers.16.fc2.bias: + device: cuda:0 + max: '1.072e-02' + mean: '0.e+00' + min: '-1.028e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.16.fc2.weight: + device: cuda:0 + max: '2.759e-02' + mean: '0.e+00' + min: '-2.188e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.16.final_layer_norm.bias: + device: cuda:0 + max: '1.385e-02' + mean: '3.693e-04' + min: '-1.169e-02' + shape: + - 1024 + sum: '3.781e-01' +grads.network.model.decoder.layers.16.final_layer_norm.weight: + device: cuda:0 + max: '2.044e-02' + mean: '-2.249e-06' + min: '-2.405e-02' + shape: + - 1024 + sum: '-2.303e-03' +grads.network.model.decoder.layers.16.self_attn.k_proj.bias: + device: cuda:0 + max: '4.657e-10' + mean: '-1.148e-12' + min: '-4.657e-10' + shape: + - 1024 + sum: '-1.176e-09' +grads.network.model.decoder.layers.16.self_attn.k_proj.weight: + device: cuda:0 + max: '2.442e-02' + mean: '7.527e-14' + min: '-2.925e-02' + shape: + - 1024 + - 1024 + sum: '7.893e-08' +grads.network.model.decoder.layers.16.self_attn.out_proj.bias: + device: cuda:0 + max: '8.875e-03' + mean: '0.e+00' + min: '-9.845e-03' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.16.self_attn.out_proj.weight: + device: cuda:0 + max: '2.749e-03' + mean: '-1.563e-13' + min: '-2.783e-03' + shape: + - 1024 + - 1024 + sum: '-1.639e-07' +grads.network.model.decoder.layers.16.self_attn.q_proj.bias: + device: cuda:0 + max: '1.541e-03' + mean: '-7.89e-06' + min: '-2.125e-03' + shape: + - 1024 + sum: '-8.079e-03' +grads.network.model.decoder.layers.16.self_attn.q_proj.weight: + device: cuda:0 + max: '2.979e-02' + mean: '1.649e-07' + min: '-3.029e-02' + shape: + - 1024 + - 1024 + sum: '1.729e-01' +grads.network.model.decoder.layers.16.self_attn.v_proj.bias: + device: cuda:0 + max: '9.657e-03' + mean: '-1.308e-04' + min: '-9.640e-03' + shape: + - 1024 + sum: '-1.339e-01' +grads.network.model.decoder.layers.16.self_attn.v_proj.weight: + device: cuda:0 + max: '2.179e-01' + mean: '2.732e-06' + min: '-2.213e-01' + shape: + - 1024 + - 1024 + sum: '2.865e+00' +grads.network.model.decoder.layers.16.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.162e-03' + mean: '-9.535e-05' + min: '-1.059e-02' + shape: + - 1024 + sum: '-9.764e-02' +grads.network.model.decoder.layers.16.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.578e-02' + mean: '9.235e-06' + min: '-2.987e-02' + shape: + - 1024 + sum: '9.457e-03' +grads.network.model.decoder.layers.17.fc1.bias: + device: cuda:0 + max: '6.044e-03' + mean: '2.890e-06' + min: '-6.564e-03' + shape: + - 4096 + sum: '1.184e-02' +grads.network.model.decoder.layers.17.fc1.weight: + device: cuda:0 + max: '1.345e-01' + mean: '5.029e-10' + min: '-1.541e-01' + shape: + - 4096 + - 1024 + sum: '2.109e-03' +grads.network.model.decoder.layers.17.fc2.bias: + device: cuda:0 + max: '1.305e-02' + mean: '0.e+00' + min: '-1.607e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.17.fc2.weight: + device: cuda:0 + max: '2.616e-02' + mean: '0.e+00' + min: '-3.049e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.17.final_layer_norm.bias: + device: cuda:0 + max: '1.535e-02' + mean: '-2.257e-04' + min: '-1.923e-02' + shape: + - 1024 + sum: '-2.311e-01' +grads.network.model.decoder.layers.17.final_layer_norm.weight: + device: cuda:0 + max: '3.850e-02' + mean: '2.985e-05' + min: '-2.193e-02' + shape: + - 1024 + sum: '3.056e-02' +grads.network.model.decoder.layers.17.self_attn.k_proj.bias: + device: cuda:0 + max: '3.201e-10' + mean: '1.170e-12' + min: '-2.183e-10' + shape: + - 1024 + sum: '1.198e-09' +grads.network.model.decoder.layers.17.self_attn.k_proj.weight: + device: cuda:0 + max: '1.88e-02' + mean: '1.493e-13' + min: '-1.416e-02' + shape: + - 1024 + - 1024 + sum: '1.566e-07' +grads.network.model.decoder.layers.17.self_attn.out_proj.bias: + device: cuda:0 + max: '1.277e-02' + mean: '-1.455e-11' + min: '-1.398e-02' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.17.self_attn.out_proj.weight: + device: cuda:0 + max: '3.332e-03' + mean: '9.592e-14' + min: '-4.020e-03' + shape: + - 1024 + - 1024 + sum: '1.006e-07' +grads.network.model.decoder.layers.17.self_attn.q_proj.bias: + device: cuda:0 + max: '8.169e-04' + mean: '1.575e-07' + min: '-1.763e-03' + shape: + - 1024 + sum: '1.613e-04' +grads.network.model.decoder.layers.17.self_attn.q_proj.weight: + device: cuda:0 + max: '2.347e-02' + mean: '-2.684e-09' + min: '-1.066e-02' + shape: + - 1024 + - 1024 + sum: '-2.815e-03' +grads.network.model.decoder.layers.17.self_attn.v_proj.bias: + device: cuda:0 + max: '1.098e-02' + mean: '-1.444e-05' + min: '-1.304e-02' + shape: + - 1024 + sum: '-1.479e-02' +grads.network.model.decoder.layers.17.self_attn.v_proj.weight: + device: cuda:0 + max: '3.683e-01' + mean: '2.462e-07' + min: '-3.150e-01' + shape: + - 1024 + - 1024 + sum: '2.581e-01' +grads.network.model.decoder.layers.17.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.358e-02' + mean: '-5.711e-06' + min: '-1.483e-02' + shape: + - 1024 + sum: '-5.848e-03' +grads.network.model.decoder.layers.17.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.098e-02' + mean: '3.371e-06' + min: '-1.99e-02' + shape: + - 1024 + sum: '3.452e-03' +grads.network.model.decoder.layers.18.fc1.bias: + device: cuda:0 + max: '1.147e-02' + mean: '-5.311e-06' + min: '-7.232e-03' + shape: + - 4096 + sum: '-2.175e-02' +grads.network.model.decoder.layers.18.fc1.weight: + device: cuda:0 + max: '1.619e-01' + mean: '-9.185e-09' + min: '-3.223e-01' + shape: + - 4096 + - 1024 + sum: '-3.853e-02' +grads.network.model.decoder.layers.18.fc2.bias: + device: cuda:0 + max: '1.429e-02' + mean: '0.e+00' + min: '-1.499e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.18.fc2.weight: + device: cuda:0 + max: '2.821e-02' + mean: '-2.274e-13' + min: '-2.067e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.18.final_layer_norm.bias: + device: cuda:0 + max: '1.670e-02' + mean: '2.067e-04' + min: '-1.701e-02' + shape: + - 1024 + sum: '2.117e-01' +grads.network.model.decoder.layers.18.final_layer_norm.weight: + device: cuda:0 + max: '1.673e-02' + mean: '-3.888e-05' + min: '-1.522e-02' + shape: + - 1024 + sum: '-3.981e-02' +grads.network.model.decoder.layers.18.self_attn.k_proj.bias: + device: cuda:0 + max: '8.731e-10' + mean: '2.129e-12' + min: '-4.075e-10' + shape: + - 1024 + sum: '2.18e-09' +grads.network.model.decoder.layers.18.self_attn.k_proj.weight: + device: cuda:0 + max: '4.180e-02' + mean: '1.821e-14' + min: '-5.685e-02' + shape: + - 1024 + - 1024 + sum: '1.909e-08' +grads.network.model.decoder.layers.18.self_attn.out_proj.bias: + device: cuda:0 + max: '1.283e-02' + mean: '7.276e-12' + min: '-1.266e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.18.self_attn.out_proj.weight: + device: cuda:0 + max: '2.322e-03' + mean: '2.842e-14' + min: '-2.526e-03' + shape: + - 1024 + - 1024 + sum: '2.980e-08' +grads.network.model.decoder.layers.18.self_attn.q_proj.bias: + device: cuda:0 + max: '5.705e-03' + mean: '-1.891e-05' + min: '-5.284e-03' + shape: + - 1024 + sum: '-1.937e-02' +grads.network.model.decoder.layers.18.self_attn.q_proj.weight: + device: cuda:0 + max: '7.843e-02' + mean: '2.579e-07' + min: '-8.680e-02' + shape: + - 1024 + - 1024 + sum: '2.704e-01' +grads.network.model.decoder.layers.18.self_attn.v_proj.bias: + device: cuda:0 + max: '1.423e-02' + mean: '1.193e-04' + min: '-1.538e-02' + shape: + - 1024 + sum: '1.222e-01' +grads.network.model.decoder.layers.18.self_attn.v_proj.weight: + device: cuda:0 + max: '4.271e-01' + mean: '-1.627e-06' + min: '-3.934e-01' + shape: + - 1024 + - 1024 + sum: '-1.706e+00' +grads.network.model.decoder.layers.18.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.349e-02' + mean: '1.753e-06' + min: '-1.332e-02' + shape: + - 1024 + sum: '1.795e-03' +grads.network.model.decoder.layers.18.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.638e-02' + mean: '1.578e-06' + min: '-1.96e-02' + shape: + - 1024 + sum: '1.616e-03' +grads.network.model.decoder.layers.19.fc1.bias: + device: cuda:0 + max: '1.043e-02' + mean: '3.285e-06' + min: '-8.926e-03' + shape: + - 4096 + sum: '1.346e-02' +grads.network.model.decoder.layers.19.fc1.weight: + device: cuda:0 + max: '2.514e-01' + mean: '1.092e-08' + min: '-2.619e-01' + shape: + - 4096 + - 1024 + sum: '4.581e-02' +grads.network.model.decoder.layers.19.fc2.bias: + device: cuda:0 + max: '1.579e-02' + mean: '7.276e-12' + min: '-1.67e-02' + shape: + - 1024 + sum: '7.451e-09' +grads.network.model.decoder.layers.19.fc2.weight: + device: cuda:0 + max: '2.852e-02' + mean: '0.e+00' + min: '-2.674e-02' + shape: + - 1024 + - 4096 + sum: '0.e+00' +grads.network.model.decoder.layers.19.final_layer_norm.bias: + device: cuda:0 + max: '1.804e-02' + mean: '8.083e-05' + min: '-1.924e-02' + shape: + - 1024 + sum: '8.276e-02' +grads.network.model.decoder.layers.19.final_layer_norm.weight: + device: cuda:0 + max: '2.331e-02' + mean: '-1.504e-05' + min: '-1.230e-02' + shape: + - 1024 + sum: '-1.54e-02' +grads.network.model.decoder.layers.19.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '-1.247e-12' + min: '-4.948e-10' + shape: + - 1024 + sum: '-1.277e-09' +grads.network.model.decoder.layers.19.self_attn.k_proj.weight: + device: cuda:0 + max: '4.950e-02' + mean: '1.668e-13' + min: '-3.336e-02' + shape: + - 1024 + - 1024 + sum: '1.749e-07' +grads.network.model.decoder.layers.19.self_attn.out_proj.bias: + device: cuda:0 + max: '1.443e-02' + mean: '4.366e-11' + min: '-1.464e-02' + shape: + - 1024 + sum: '4.470e-08' +grads.network.model.decoder.layers.19.self_attn.out_proj.weight: + device: cuda:0 + max: '5.047e-03' + mean: '1.137e-13' + min: '-4.323e-03' + shape: + - 1024 + - 1024 + sum: '1.192e-07' +grads.network.model.decoder.layers.19.self_attn.q_proj.bias: + device: cuda:0 + max: '2.846e-03' + mean: '-5.669e-06' + min: '-2.716e-03' + shape: + - 1024 + sum: '-5.805e-03' +grads.network.model.decoder.layers.19.self_attn.q_proj.weight: + device: cuda:0 + max: '5.232e-02' + mean: '7.022e-08' + min: '-5.666e-02' + shape: + - 1024 + - 1024 + sum: '7.363e-02' +grads.network.model.decoder.layers.19.self_attn.v_proj.bias: + device: cuda:0 + max: '1.353e-02' + mean: '-1.046e-04' + min: '-1.307e-02' + shape: + - 1024 + sum: '-1.071e-01' +grads.network.model.decoder.layers.19.self_attn.v_proj.weight: + device: cuda:0 + max: '3.506e-01' + mean: '1.296e-06' + min: '-3.869e-01' + shape: + - 1024 + - 1024 + sum: '1.359e+00' +grads.network.model.decoder.layers.19.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.543e-02' + mean: '1.895e-05' + min: '-1.569e-02' + shape: + - 1024 + sum: '1.941e-02' +grads.network.model.decoder.layers.19.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.44e-02' + mean: '5.186e-07' + min: '-1.104e-02' + shape: + - 1024 + sum: '5.310e-04' +grads.network.model.decoder.layers.2.fc1.bias: + device: cuda:0 + max: '5.921e-03' + mean: '8.856e-06' + min: '-9.619e-03' + shape: + - 4096 + sum: '3.627e-02' +grads.network.model.decoder.layers.2.fc1.weight: + device: cuda:0 + max: '1.109e-01' + mean: '-1.692e-08' + min: '-1.033e-01' + shape: + - 4096 + - 1024 + sum: '-7.098e-02' +grads.network.model.decoder.layers.2.fc2.bias: + device: cuda:0 + max: '8.814e-03' + mean: '1.455e-11' + min: '-9.890e-03' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.2.fc2.weight: + device: cuda:0 + max: '8.03e-03' + mean: '1.705e-13' + min: '-7.305e-03' + shape: + - 1024 + - 4096 + sum: '7.153e-07' +grads.network.model.decoder.layers.2.final_layer_norm.bias: + device: cuda:0 + max: '1.062e-02' + mean: '2.142e-05' + min: '-9.885e-03' + shape: + - 1024 + sum: '2.193e-02' +grads.network.model.decoder.layers.2.final_layer_norm.weight: + device: cuda:0 + max: '1.06e-02' + mean: '1.349e-05' + min: '-3.724e-02' + shape: + - 1024 + sum: '1.382e-02' +grads.network.model.decoder.layers.2.self_attn.k_proj.bias: + device: cuda:0 + max: '6.985e-10' + mean: '3.819e-13' + min: '-3.492e-10' + shape: + - 1024 + sum: '3.911e-10' +grads.network.model.decoder.layers.2.self_attn.k_proj.weight: + device: cuda:0 + max: '1.658e-02' + mean: '-6.373e-14' + min: '-1.493e-02' + shape: + - 1024 + - 1024 + sum: '-6.682e-08' +grads.network.model.decoder.layers.2.self_attn.out_proj.bias: + device: cuda:0 + max: '9.061e-03' + mean: '1.455e-11' + min: '-9.315e-03' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.2.self_attn.out_proj.weight: + device: cuda:0 + max: '9.092e-03' + mean: '-1.421e-14' + min: '-8.389e-03' + shape: + - 1024 + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.2.self_attn.q_proj.bias: + device: cuda:0 + max: '1.064e-03' + mean: '4.480e-06' + min: '-1.057e-03' + shape: + - 1024 + sum: '4.588e-03' +grads.network.model.decoder.layers.2.self_attn.q_proj.weight: + device: cuda:0 + max: '9.205e-03' + mean: '3.874e-08' + min: '-1.268e-02' + shape: + - 1024 + - 1024 + sum: '4.063e-02' +grads.network.model.decoder.layers.2.self_attn.v_proj.bias: + device: cuda:0 + max: '8.063e-03' + mean: '3.71e-05' + min: '-6.821e-03' + shape: + - 1024 + sum: '3.799e-02' +grads.network.model.decoder.layers.2.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '3.208e-07' + min: '-1.047e-01' + shape: + - 1024 + - 1024 + sum: '3.364e-01' +grads.network.model.decoder.layers.2.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.170e-03' + mean: '-3.405e-05' + min: '-9.528e-03' + shape: + - 1024 + sum: '-3.486e-02' +grads.network.model.decoder.layers.2.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.376e-02' + mean: '3.953e-06' + min: '-3.395e-02' + shape: + - 1024 + sum: '4.048e-03' +grads.network.model.decoder.layers.20.fc1.bias: + device: cuda:0 + max: '7.671e-03' + mean: '-3.533e-07' + min: '-1.159e-02' + shape: + - 4096 + sum: '-1.447e-03' +grads.network.model.decoder.layers.20.fc1.weight: + device: cuda:0 + max: '3.498e-01' + mean: '-1.061e-09' + min: '-2.271e-01' + shape: + - 4096 + - 1024 + sum: '-4.449e-03' +grads.network.model.decoder.layers.20.fc2.bias: + device: cuda:0 + max: '1.901e-02' + mean: '-1.455e-11' + min: '-1.83e-02' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.20.fc2.weight: + device: cuda:0 + max: '8.356e-02' + mean: '5.684e-14' + min: '-8.36e-02' + shape: + - 1024 + - 4096 + sum: '2.384e-07' +grads.network.model.decoder.layers.20.final_layer_norm.bias: + device: cuda:0 + max: '2.215e-02' + mean: '2.282e-04' + min: '-2.103e-02' + shape: + - 1024 + sum: '2.337e-01' +grads.network.model.decoder.layers.20.final_layer_norm.weight: + device: cuda:0 + max: '2.260e-02' + mean: '-2.262e-05' + min: '-1.660e-02' + shape: + - 1024 + sum: '-2.316e-02' +grads.network.model.decoder.layers.20.self_attn.k_proj.bias: + device: cuda:0 + max: '3.492e-10' + mean: '1.942e-12' + min: '-3.347e-10' + shape: + - 1024 + sum: '1.989e-09' +grads.network.model.decoder.layers.20.self_attn.k_proj.weight: + device: cuda:0 + max: '3.529e-02' + mean: '-4.73e-14' + min: '-3.390e-02' + shape: + - 1024 + - 1024 + sum: '-4.959e-08' +grads.network.model.decoder.layers.20.self_attn.out_proj.bias: + device: cuda:0 + max: '1.786e-02' + mean: '1.455e-11' + min: '-1.611e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.20.self_attn.out_proj.weight: + device: cuda:0 + max: '8.450e-03' + mean: '-1.243e-14' + min: '-9.957e-03' + shape: + - 1024 + - 1024 + sum: '-1.304e-08' +grads.network.model.decoder.layers.20.self_attn.q_proj.bias: + device: cuda:0 + max: '1.168e-03' + mean: '1.373e-05' + min: '-1.461e-03' + shape: + - 1024 + sum: '1.406e-02' +grads.network.model.decoder.layers.20.self_attn.q_proj.weight: + device: cuda:0 + max: '3.718e-02' + mean: '-1.270e-07' + min: '-3.829e-02' + shape: + - 1024 + - 1024 + sum: '-1.332e-01' +grads.network.model.decoder.layers.20.self_attn.v_proj.bias: + device: cuda:0 + max: '1.316e-02' + mean: '1.595e-04' + min: '-1.22e-02' + shape: + - 1024 + sum: '1.634e-01' +grads.network.model.decoder.layers.20.self_attn.v_proj.weight: + device: cuda:0 + max: '3.578e-01' + mean: '-1.476e-06' + min: '-3.892e-01' + shape: + - 1024 + - 1024 + sum: '-1.548e+00' +grads.network.model.decoder.layers.20.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.886e-02' + mean: '-2.963e-04' + min: '-1.759e-02' + shape: + - 1024 + sum: '-3.034e-01' +grads.network.model.decoder.layers.20.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.024e-02' + mean: '9.812e-07' + min: '-1.449e-02' + shape: + - 1024 + sum: '1.005e-03' +grads.network.model.decoder.layers.21.fc1.bias: + device: cuda:0 + max: '1.159e-02' + mean: '-7.116e-06' + min: '-1.195e-02' + shape: + - 4096 + sum: '-2.915e-02' +grads.network.model.decoder.layers.21.fc1.weight: + device: cuda:0 + max: '3.364e-01' + mean: '-2.245e-08' + min: '-3.275e-01' + shape: + - 4096 + - 1024 + sum: '-9.418e-02' +grads.network.model.decoder.layers.21.fc2.bias: + device: cuda:0 + max: '2.210e-02' + mean: '1.455e-11' + min: '-2.116e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.21.fc2.weight: + device: cuda:0 + max: '1.082e-01' + mean: '-5.684e-14' + min: '-9.473e-02' + shape: + - 1024 + - 4096 + sum: '-2.384e-07' +grads.network.model.decoder.layers.21.final_layer_norm.bias: + device: cuda:0 + max: '2.494e-02' + mean: '2.162e-05' + min: '-2.386e-02' + shape: + - 1024 + sum: '2.214e-02' +grads.network.model.decoder.layers.21.final_layer_norm.weight: + device: cuda:0 + max: '2.376e-02' + mean: '7.015e-06' + min: '-1.133e-02' + shape: + - 1024 + sum: '7.184e-03' +grads.network.model.decoder.layers.21.self_attn.k_proj.bias: + device: cuda:0 + max: '4.002e-10' + mean: '-1.572e-12' + min: '-3.638e-10' + shape: + - 1024 + sum: '-1.61e-09' +grads.network.model.decoder.layers.21.self_attn.k_proj.weight: + device: cuda:0 + max: '2.533e-02' + mean: '2.293e-13' + min: '-3.203e-02' + shape: + - 1024 + - 1024 + sum: '2.405e-07' +grads.network.model.decoder.layers.21.self_attn.out_proj.bias: + device: cuda:0 + max: '1.854e-02' + mean: '0.e+00' + min: '-1.843e-02' + shape: + - 1024 + sum: '0.e+00' +grads.network.model.decoder.layers.21.self_attn.out_proj.weight: + device: cuda:0 + max: '1.236e-02' + mean: '1.137e-13' + min: '-1.02e-02' + shape: + - 1024 + - 1024 + sum: '1.192e-07' +grads.network.model.decoder.layers.21.self_attn.q_proj.bias: + device: cuda:0 + max: '1.768e-03' + mean: '1.468e-05' + min: '-1.166e-03' + shape: + - 1024 + sum: '1.503e-02' +grads.network.model.decoder.layers.21.self_attn.q_proj.weight: + device: cuda:0 + max: '1.766e-02' + mean: '-1.343e-07' + min: '-2.628e-02' + shape: + - 1024 + - 1024 + sum: '-1.408e-01' +grads.network.model.decoder.layers.21.self_attn.v_proj.bias: + device: cuda:0 + max: '1.447e-02' + mean: '1.302e-05' + min: '-1.778e-02' + shape: + - 1024 + sum: '1.333e-02' +grads.network.model.decoder.layers.21.self_attn.v_proj.weight: + device: cuda:0 + max: '4.942e-01' + mean: '-1.191e-07' + min: '-4.252e-01' + shape: + - 1024 + - 1024 + sum: '-1.249e-01' +grads.network.model.decoder.layers.21.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.995e-02' + mean: '1.246e-05' + min: '-1.996e-02' + shape: + - 1024 + sum: '1.276e-02' +grads.network.model.decoder.layers.21.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.301e-02' + mean: '1.724e-06' + min: '-1.395e-02' + shape: + - 1024 + sum: '1.766e-03' +grads.network.model.decoder.layers.22.fc1.bias: + device: cuda:0 + max: '1.418e-02' + mean: '1.925e-05' + min: '-3.796e-02' + shape: + - 4096 + sum: '7.886e-02' +grads.network.model.decoder.layers.22.fc1.weight: + device: cuda:0 + max: '4.455e-01' + mean: '1.533e-08' + min: '-3.281e-01' + shape: + - 4096 + - 1024 + sum: '6.429e-02' +grads.network.model.decoder.layers.22.fc2.bias: + device: cuda:0 + max: '2.107e-02' + mean: '-2.183e-11' + min: '-1.798e-02' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.22.fc2.weight: + device: cuda:0 + max: '3.631e-02' + mean: '-1.137e-13' + min: '-5.145e-02' + shape: + - 1024 + - 4096 + sum: '-4.768e-07' +grads.network.model.decoder.layers.22.final_layer_norm.bias: + device: cuda:0 + max: '2.261e-02' + mean: '-3.098e-04' + min: '-1.996e-02' + shape: + - 1024 + sum: '-3.173e-01' +grads.network.model.decoder.layers.22.final_layer_norm.weight: + device: cuda:0 + max: '1.112e-01' + mean: '1.792e-05' + min: '-7.273e-03' + shape: + - 1024 + sum: '1.835e-02' +grads.network.model.decoder.layers.22.self_attn.k_proj.bias: + device: cuda:0 + max: '2.838e-10' + mean: '1.338e-12' + min: '-2.328e-10' + shape: + - 1024 + sum: '1.37e-09' +grads.network.model.decoder.layers.22.self_attn.k_proj.weight: + device: cuda:0 + max: '1.521e-02' + mean: '-6.001e-14' + min: '-1.506e-02' + shape: + - 1024 + - 1024 + sum: '-6.292e-08' +grads.network.model.decoder.layers.22.self_attn.out_proj.bias: + device: cuda:0 + max: '1.797e-02' + mean: '2.910e-11' + min: '-1.645e-02' + shape: + - 1024 + sum: '2.980e-08' +grads.network.model.decoder.layers.22.self_attn.out_proj.weight: + device: cuda:0 + max: '1.489e-02' + mean: '-2.132e-13' + min: '-1.383e-02' + shape: + - 1024 + - 1024 + sum: '-2.235e-07' +grads.network.model.decoder.layers.22.self_attn.q_proj.bias: + device: cuda:0 + max: '1.432e-03' + mean: '-1.077e-05' + min: '-1.380e-03' + shape: + - 1024 + sum: '-1.103e-02' +grads.network.model.decoder.layers.22.self_attn.q_proj.weight: + device: cuda:0 + max: '1.757e-02' + mean: '6.216e-08' + min: '-1.876e-02' + shape: + - 1024 + - 1024 + sum: '6.518e-02' +grads.network.model.decoder.layers.22.self_attn.v_proj.bias: + device: cuda:0 + max: '1.04e-02' + mean: '9.040e-05' + min: '-1.207e-02' + shape: + - 1024 + sum: '9.257e-02' +grads.network.model.decoder.layers.22.self_attn.v_proj.weight: + device: cuda:0 + max: '3.492e-01' + mean: '-5.219e-07' + min: '-2.943e-01' + shape: + - 1024 + - 1024 + sum: '-5.472e-01' +grads.network.model.decoder.layers.22.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.879e-02' + mean: '-5.430e-05' + min: '-1.734e-02' + shape: + - 1024 + sum: '-5.561e-02' +grads.network.model.decoder.layers.22.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.860e-02' + mean: '-1.348e-05' + min: '-3.154e-02' + shape: + - 1024 + sum: '-1.380e-02' +grads.network.model.decoder.layers.23.fc1.bias: + device: cuda:0 + max: '1.947e-02' + mean: '2.517e-05' + min: '-1.008e-02' + shape: + - 4096 + sum: '1.031e-01' +grads.network.model.decoder.layers.23.fc1.weight: + device: cuda:0 + max: '1.458e-01' + mean: '4.279e-08' + min: '-2.653e-01' + shape: + - 4096 + - 1024 + sum: '1.795e-01' +grads.network.model.decoder.layers.23.fc2.bias: + device: cuda:0 + max: '9.512e-03' + mean: '1.819e-12' + min: '-9.348e-03' + shape: + - 1024 + sum: '1.863e-09' +grads.network.model.decoder.layers.23.fc2.weight: + device: cuda:0 + max: '2.092e-02' + mean: '-4.547e-13' + min: '-1.892e-02' + shape: + - 1024 + - 4096 + sum: '-1.907e-06' +grads.network.model.decoder.layers.23.final_layer_norm.bias: + device: cuda:0 + max: '1.005e-02' + mean: '-9.368e-05' + min: '-9.654e-03' + shape: + - 1024 + sum: '-9.593e-02' +grads.network.model.decoder.layers.23.final_layer_norm.weight: + device: cuda:0 + max: '9.125e-03' + mean: '2.809e-04' + min: '-8.498e-03' + shape: + - 1024 + sum: '2.876e-01' +grads.network.model.decoder.layers.23.self_attn.k_proj.bias: + device: cuda:0 + max: '1.048e-09' + mean: '-2.047e-13' + min: '-1.513e-09' + shape: + - 1024 + sum: '-2.096e-10' +grads.network.model.decoder.layers.23.self_attn.k_proj.weight: + device: cuda:0 + max: '7.757e-02' + mean: '-1.006e-13' + min: '-1.167e-01' + shape: + - 1024 + - 1024 + sum: '-1.055e-07' +grads.network.model.decoder.layers.23.self_attn.out_proj.bias: + device: cuda:0 + max: '9.025e-03' + mean: '-5.457e-12' + min: '-8.085e-03' + shape: + - 1024 + sum: '-5.588e-09' +grads.network.model.decoder.layers.23.self_attn.out_proj.weight: + device: cuda:0 + max: '4.444e-03' + mean: '-6.395e-14' + min: '-4.31e-03' + shape: + - 1024 + - 1024 + sum: '-6.706e-08' +grads.network.model.decoder.layers.23.self_attn.q_proj.bias: + device: cuda:0 + max: '6.065e-03' + mean: '3.442e-05' + min: '-5.142e-03' + shape: + - 1024 + sum: '3.525e-02' +grads.network.model.decoder.layers.23.self_attn.q_proj.weight: + device: cuda:0 + max: '7.615e-02' + mean: '-1.647e-07' + min: '-8.673e-02' + shape: + - 1024 + - 1024 + sum: '-1.727e-01' +grads.network.model.decoder.layers.23.self_attn.v_proj.bias: + device: cuda:0 + max: '1.326e-02' + mean: '-5.18e-05' + min: '-1.957e-02' + shape: + - 1024 + sum: '-5.304e-02' +grads.network.model.decoder.layers.23.self_attn.v_proj.weight: + device: cuda:0 + max: '5.156e-01' + mean: '2.478e-07' + min: '-3.333e-01' + shape: + - 1024 + - 1024 + sum: '2.599e-01' +grads.network.model.decoder.layers.23.self_attn_layer_norm.bias: + device: cuda:0 + max: '9.140e-03' + mean: '1.168e-04' + min: '-7.772e-03' + shape: + - 1024 + sum: '1.196e-01' +grads.network.model.decoder.layers.23.self_attn_layer_norm.weight: + device: cuda:0 + max: '5.779e-03' + mean: '4.173e-06' + min: '-1.385e-02' + shape: + - 1024 + sum: '4.273e-03' +grads.network.model.decoder.layers.3.fc1.bias: + device: cuda:0 + max: '5.954e-03' + mean: '1.316e-05' + min: '-8.344e-03' + shape: + - 4096 + sum: '5.389e-02' +grads.network.model.decoder.layers.3.fc1.weight: + device: cuda:0 + max: '1.064e-01' + mean: '-6.116e-09' + min: '-9.593e-02' + shape: + - 4096 + - 1024 + sum: '-2.565e-02' +grads.network.model.decoder.layers.3.fc2.bias: + device: cuda:0 + max: '8.140e-03' + mean: '-3.638e-12' + min: '-1.140e-02' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.3.fc2.weight: + device: cuda:0 + max: '1.384e-02' + mean: '4.547e-13' + min: '-1.706e-02' + shape: + - 1024 + - 4096 + sum: '1.907e-06' +grads.network.model.decoder.layers.3.final_layer_norm.bias: + device: cuda:0 + max: '9.449e-03' + mean: '2.546e-05' + min: '-1.205e-02' + shape: + - 1024 + sum: '2.607e-02' +grads.network.model.decoder.layers.3.final_layer_norm.weight: + device: cuda:0 + max: '2.066e-02' + mean: '-4.079e-05' + min: '-3.198e-02' + shape: + - 1024 + sum: '-4.177e-02' +grads.network.model.decoder.layers.3.self_attn.k_proj.bias: + device: cuda:0 + max: '3.056e-10' + mean: '-1.023e-12' + min: '-2.983e-10' + shape: + - 1024 + sum: '-1.047e-09' +grads.network.model.decoder.layers.3.self_attn.k_proj.weight: + device: cuda:0 + max: '1.167e-02' + mean: '-1.421e-14' + min: '-1.363e-02' + shape: + - 1024 + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.3.self_attn.out_proj.bias: + device: cuda:0 + max: '7.554e-03' + mean: '1.819e-11' + min: '-1.130e-02' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.3.self_attn.out_proj.weight: + device: cuda:0 + max: '1.395e-02' + mean: '7.105e-14' + min: '-9.944e-03' + shape: + - 1024 + - 1024 + sum: '7.451e-08' +grads.network.model.decoder.layers.3.self_attn.q_proj.bias: + device: cuda:0 + max: '1.262e-03' + mean: '1.523e-05' + min: '-1.661e-03' + shape: + - 1024 + sum: '1.560e-02' +grads.network.model.decoder.layers.3.self_attn.q_proj.weight: + device: cuda:0 + max: '1.264e-02' + mean: '1.393e-07' + min: '-1.569e-02' + shape: + - 1024 + - 1024 + sum: '1.461e-01' +grads.network.model.decoder.layers.3.self_attn.v_proj.bias: + device: cuda:0 + max: '6.315e-03' + mean: '3.350e-05' + min: '-1.044e-02' + shape: + - 1024 + sum: '3.431e-02' +grads.network.model.decoder.layers.3.self_attn.v_proj.weight: + device: cuda:0 + max: '1.511e-01' + mean: '3.064e-07' + min: '-1.489e-01' + shape: + - 1024 + - 1024 + sum: '3.212e-01' +grads.network.model.decoder.layers.3.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.629e-03' + mean: '2.019e-05' + min: '-1.149e-02' + shape: + - 1024 + sum: '2.068e-02' +grads.network.model.decoder.layers.3.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.384e-02' + mean: '1.535e-06' + min: '-3.271e-02' + shape: + - 1024 + sum: '1.572e-03' +grads.network.model.decoder.layers.4.fc1.bias: + device: cuda:0 + max: '8.716e-03' + mean: '-6.134e-06' + min: '-3.885e-03' + shape: + - 4096 + sum: '-2.513e-02' +grads.network.model.decoder.layers.4.fc1.weight: + device: cuda:0 + max: '9.354e-02' + mean: '-1.18e-09' + min: '-1.037e-01' + shape: + - 4096 + - 1024 + sum: '-4.948e-03' +grads.network.model.decoder.layers.4.fc2.bias: + device: cuda:0 + max: '7.127e-03' + mean: '-1.455e-11' + min: '-8.873e-03' + shape: + - 1024 + sum: '-1.490e-08' +grads.network.model.decoder.layers.4.fc2.weight: + device: cuda:0 + max: '1.011e-02' + mean: '-2.274e-13' + min: '-1.157e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.4.final_layer_norm.bias: + device: cuda:0 + max: '7.855e-03' + mean: '-2.88e-05' + min: '-9.680e-03' + shape: + - 1024 + sum: '-2.949e-02' +grads.network.model.decoder.layers.4.final_layer_norm.weight: + device: cuda:0 + max: '1.503e-02' + mean: '1.502e-06' + min: '-1.015e-02' + shape: + - 1024 + sum: '1.538e-03' +grads.network.model.decoder.layers.4.self_attn.k_proj.bias: + device: cuda:0 + max: '4.511e-10' + mean: '-4.124e-12' + min: '-2.838e-10' + shape: + - 1024 + sum: '-4.223e-09' +grads.network.model.decoder.layers.4.self_attn.k_proj.weight: + device: cuda:0 + max: '2.309e-02' + mean: '-2.882e-13' + min: '-2.746e-02' + shape: + - 1024 + - 1024 + sum: '-3.022e-07' +grads.network.model.decoder.layers.4.self_attn.out_proj.bias: + device: cuda:0 + max: '7.763e-03' + mean: '-7.276e-12' + min: '-1.027e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.4.self_attn.out_proj.weight: + device: cuda:0 + max: '1.258e-02' + mean: '-5.684e-14' + min: '-8.443e-03' + shape: + - 1024 + - 1024 + sum: '-5.960e-08' +grads.network.model.decoder.layers.4.self_attn.q_proj.bias: + device: cuda:0 + max: '1.406e-03' + mean: '8.718e-06' + min: '-1.263e-03' + shape: + - 1024 + sum: '8.927e-03' +grads.network.model.decoder.layers.4.self_attn.q_proj.weight: + device: cuda:0 + max: '1.614e-02' + mean: '5.714e-08' + min: '-1.253e-02' + shape: + - 1024 + - 1024 + sum: '5.992e-02' +grads.network.model.decoder.layers.4.self_attn.v_proj.bias: + device: cuda:0 + max: '7.103e-03' + mean: '4.113e-05' + min: '-7.943e-03' + shape: + - 1024 + sum: '4.212e-02' +grads.network.model.decoder.layers.4.self_attn.v_proj.weight: + device: cuda:0 + max: '1.551e-01' + mean: '2.696e-07' + min: '-1.392e-01' + shape: + - 1024 + - 1024 + sum: '2.827e-01' +grads.network.model.decoder.layers.4.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.028e-03' + mean: '7.166e-06' + min: '-1.046e-02' + shape: + - 1024 + sum: '7.338e-03' +grads.network.model.decoder.layers.4.self_attn_layer_norm.weight: + device: cuda:0 + max: '8.643e-03' + mean: '-1.091e-05' + min: '-2.483e-02' + shape: + - 1024 + sum: '-1.117e-02' +grads.network.model.decoder.layers.5.fc1.bias: + device: cuda:0 + max: '4.748e-03' + mean: '4.587e-06' + min: '-5.883e-03' + shape: + - 4096 + sum: '1.879e-02' +grads.network.model.decoder.layers.5.fc1.weight: + device: cuda:0 + max: '9.723e-02' + mean: '-2.199e-09' + min: '-1.125e-01' + shape: + - 4096 + - 1024 + sum: '-9.221e-03' +grads.network.model.decoder.layers.5.fc2.bias: + device: cuda:0 + max: '7.651e-03' + mean: '2.183e-11' + min: '-1.023e-02' + shape: + - 1024 + sum: '2.235e-08' +grads.network.model.decoder.layers.5.fc2.weight: + device: cuda:0 + max: '1.427e-02' + mean: '4.547e-13' + min: '-1.743e-02' + shape: + - 1024 + - 4096 + sum: '1.907e-06' +grads.network.model.decoder.layers.5.final_layer_norm.bias: + device: cuda:0 + max: '8.459e-03' + mean: '-6.824e-05' + min: '-1.104e-02' + shape: + - 1024 + sum: '-6.988e-02' +grads.network.model.decoder.layers.5.final_layer_norm.weight: + device: cuda:0 + max: '2.276e-02' + mean: '1.546e-05' + min: '-1.198e-02' + shape: + - 1024 + sum: '1.583e-02' +grads.network.model.decoder.layers.5.self_attn.k_proj.bias: + device: cuda:0 + max: '4.366e-10' + mean: '2.527e-12' + min: '-3.929e-10' + shape: + - 1024 + sum: '2.588e-09' +grads.network.model.decoder.layers.5.self_attn.k_proj.weight: + device: cuda:0 + max: '2.063e-02' + mean: '6.717e-14' + min: '-1.871e-02' + shape: + - 1024 + - 1024 + sum: '7.043e-08' +grads.network.model.decoder.layers.5.self_attn.out_proj.bias: + device: cuda:0 + max: '7.647e-03' + mean: '1.455e-11' + min: '-1.1e-02' + shape: + - 1024 + sum: '1.490e-08' +grads.network.model.decoder.layers.5.self_attn.out_proj.weight: + device: cuda:0 + max: '1.146e-02' + mean: '-1.137e-13' + min: '-7.558e-03' + shape: + - 1024 + - 1024 + sum: '-1.192e-07' +grads.network.model.decoder.layers.5.self_attn.q_proj.bias: + device: cuda:0 + max: '1.232e-03' + mean: '5.46e-06' + min: '-1.171e-03' + shape: + - 1024 + sum: '5.591e-03' +grads.network.model.decoder.layers.5.self_attn.q_proj.weight: + device: cuda:0 + max: '1.892e-02' + mean: '1.393e-08' + min: '-1.640e-02' + shape: + - 1024 + - 1024 + sum: '1.461e-02' +grads.network.model.decoder.layers.5.self_attn.v_proj.bias: + device: cuda:0 + max: '7.63e-03' + mean: '2.826e-05' + min: '-6.905e-03' + shape: + - 1024 + sum: '2.894e-02' +grads.network.model.decoder.layers.5.self_attn.v_proj.weight: + device: cuda:0 + max: '1.549e-01' + mean: '7.210e-08' + min: '-1.564e-01' + shape: + - 1024 + - 1024 + sum: '7.561e-02' +grads.network.model.decoder.layers.5.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.75e-03' + mean: '-6.064e-05' + min: '-1.140e-02' + shape: + - 1024 + sum: '-6.21e-02' +grads.network.model.decoder.layers.5.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.310e-02' + mean: '-7.533e-06' + min: '-1.207e-02' + shape: + - 1024 + sum: '-7.714e-03' +grads.network.model.decoder.layers.6.fc1.bias: + device: cuda:0 + max: '8.689e-03' + mean: '-1.853e-05' + min: '-5.812e-03' + shape: + - 4096 + sum: '-7.588e-02' +grads.network.model.decoder.layers.6.fc1.weight: + device: cuda:0 + max: '1.247e-01' + mean: '2.587e-11' + min: '-1.671e-01' + shape: + - 4096 + - 1024 + sum: '1.085e-04' +grads.network.model.decoder.layers.6.fc2.bias: + device: cuda:0 + max: '8.694e-03' + mean: '-3.638e-12' + min: '-8.964e-03' + shape: + - 1024 + sum: '-3.725e-09' +grads.network.model.decoder.layers.6.fc2.weight: + device: cuda:0 + max: '2.818e-02' + mean: '-1.99e-13' + min: '-2.423e-02' + shape: + - 1024 + - 4096 + sum: '-8.345e-07' +grads.network.model.decoder.layers.6.final_layer_norm.bias: + device: cuda:0 + max: '9.466e-03' + mean: '1.768e-05' + min: '-9.583e-03' + shape: + - 1024 + sum: '1.811e-02' +grads.network.model.decoder.layers.6.final_layer_norm.weight: + device: cuda:0 + max: '3.202e-02' + mean: '1.739e-05' + min: '-1.373e-02' + shape: + - 1024 + sum: '1.780e-02' +grads.network.model.decoder.layers.6.self_attn.k_proj.bias: + device: cuda:0 + max: '1.048e-09' + mean: '2.847e-12' + min: '-5.821e-10' + shape: + - 1024 + sum: '2.915e-09' +grads.network.model.decoder.layers.6.self_attn.k_proj.weight: + device: cuda:0 + max: '7.468e-02' + mean: '3.264e-14' + min: '-7.459e-02' + shape: + - 1024 + - 1024 + sum: '3.423e-08' +grads.network.model.decoder.layers.6.self_attn.out_proj.bias: + device: cuda:0 + max: '9.673e-03' + mean: '-7.276e-12' + min: '-9.632e-03' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.6.self_attn.out_proj.weight: + device: cuda:0 + max: '1.069e-02' + mean: '-2.558e-13' + min: '-1.237e-02' + shape: + - 1024 + - 1024 + sum: '-2.682e-07' +grads.network.model.decoder.layers.6.self_attn.q_proj.bias: + device: cuda:0 + max: '1.893e-03' + mean: '-1.271e-05' + min: '-3.243e-03' + shape: + - 1024 + sum: '-1.302e-02' +grads.network.model.decoder.layers.6.self_attn.q_proj.weight: + device: cuda:0 + max: '4.317e-02' + mean: '-5.287e-09' + min: '-5.174e-02' + shape: + - 1024 + - 1024 + sum: '-5.543e-03' +grads.network.model.decoder.layers.6.self_attn.v_proj.bias: + device: cuda:0 + max: '6.756e-03' + mean: '8.55e-05' + min: '-5.219e-03' + shape: + - 1024 + sum: '8.755e-02' +grads.network.model.decoder.layers.6.self_attn.v_proj.weight: + device: cuda:0 + max: '1.221e-01' + mean: '3.555e-08' + min: '-1.883e-01' + shape: + - 1024 + - 1024 + sum: '3.728e-02' +grads.network.model.decoder.layers.6.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.004e-02' + mean: '2.542e-06' + min: '-9.872e-03' + shape: + - 1024 + sum: '2.603e-03' +grads.network.model.decoder.layers.6.self_attn_layer_norm.weight: + device: cuda:0 + max: '2.376e-02' + mean: '-1.475e-05' + min: '-1.311e-02' + shape: + - 1024 + sum: '-1.511e-02' +grads.network.model.decoder.layers.7.fc1.bias: + device: cuda:0 + max: '1.040e-02' + mean: '-1.111e-05' + min: '-5.846e-03' + shape: + - 4096 + sum: '-4.551e-02' +grads.network.model.decoder.layers.7.fc1.weight: + device: cuda:0 + max: '1.282e-01' + mean: '-2.034e-09' + min: '-2.541e-01' + shape: + - 4096 + - 1024 + sum: '-8.530e-03' +grads.network.model.decoder.layers.7.fc2.bias: + device: cuda:0 + max: '8.647e-03' + mean: '-1.819e-12' + min: '-1.108e-02' + shape: + - 1024 + sum: '-1.863e-09' +grads.network.model.decoder.layers.7.fc2.weight: + device: cuda:0 + max: '2.036e-02' + mean: '-2.274e-13' + min: '-2.125e-02' + shape: + - 1024 + - 4096 + sum: '-9.537e-07' +grads.network.model.decoder.layers.7.final_layer_norm.bias: + device: cuda:0 + max: '9.436e-03' + mean: '1.051e-04' + min: '-1.201e-02' + shape: + - 1024 + sum: '1.076e-01' +grads.network.model.decoder.layers.7.final_layer_norm.weight: + device: cuda:0 + max: '2.502e-02' + mean: '-2.608e-06' + min: '-1.341e-02' + shape: + - 1024 + sum: '-2.670e-03' +grads.network.model.decoder.layers.7.self_attn.k_proj.bias: + device: cuda:0 + max: '4.075e-10' + mean: '1.863e-13' + min: '-3.492e-10' + shape: + - 1024 + sum: '1.908e-10' +grads.network.model.decoder.layers.7.self_attn.k_proj.weight: + device: cuda:0 + max: '3.309e-02' + mean: '6.817e-14' + min: '-4.19e-02' + shape: + - 1024 + - 1024 + sum: '7.148e-08' +grads.network.model.decoder.layers.7.self_attn.out_proj.bias: + device: cuda:0 + max: '7.477e-03' + mean: '-5.457e-12' + min: '-9.228e-03' + shape: + - 1024 + sum: '-5.588e-09' +grads.network.model.decoder.layers.7.self_attn.out_proj.weight: + device: cuda:0 + max: '1.003e-02' + mean: '-1.563e-13' + min: '-7.771e-03' + shape: + - 1024 + - 1024 + sum: '-1.639e-07' +grads.network.model.decoder.layers.7.self_attn.q_proj.bias: + device: cuda:0 + max: '2.209e-03' + mean: '-4.411e-06' + min: '-1.604e-03' + shape: + - 1024 + sum: '-4.517e-03' +grads.network.model.decoder.layers.7.self_attn.q_proj.weight: + device: cuda:0 + max: '3.379e-02' + mean: '5.986e-10' + min: '-2.946e-02' + shape: + - 1024 + - 1024 + sum: '6.277e-04' +grads.network.model.decoder.layers.7.self_attn.v_proj.bias: + device: cuda:0 + max: '6.926e-03' + mean: '5.966e-05' + min: '-6.282e-03' + shape: + - 1024 + sum: '6.109e-02' +grads.network.model.decoder.layers.7.self_attn.v_proj.weight: + device: cuda:0 + max: '1.424e-01' + mean: '-8.094e-09' + min: '-1.385e-01' + shape: + - 1024 + - 1024 + sum: '-8.487e-03' +grads.network.model.decoder.layers.7.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.795e-03' + mean: '8.083e-05' + min: '-9.428e-03' + shape: + - 1024 + sum: '8.277e-02' +grads.network.model.decoder.layers.7.self_attn_layer_norm.weight: + device: cuda:0 + max: '3.435e-02' + mean: '-2.633e-06' + min: '-1.194e-02' + shape: + - 1024 + sum: '-2.696e-03' +grads.network.model.decoder.layers.8.fc1.bias: + device: cuda:0 + max: '9.447e-03' + mean: '-1.000e-05' + min: '-1.029e-02' + shape: + - 4096 + sum: '-4.096e-02' +grads.network.model.decoder.layers.8.fc1.weight: + device: cuda:0 + max: '1.788e-01' + mean: '-1.028e-08' + min: '-1.565e-01' + shape: + - 4096 + - 1024 + sum: '-4.31e-02' +grads.network.model.decoder.layers.8.fc2.bias: + device: cuda:0 + max: '9.312e-03' + mean: '1.819e-11' + min: '-9.654e-03' + shape: + - 1024 + sum: '1.863e-08' +grads.network.model.decoder.layers.8.fc2.weight: + device: cuda:0 + max: '2.393e-02' + mean: '6.821e-13' + min: '-1.897e-02' + shape: + - 1024 + - 4096 + sum: '2.861e-06' +grads.network.model.decoder.layers.8.final_layer_norm.bias: + device: cuda:0 + max: '1.033e-02' + mean: '-9.404e-05' + min: '-1.074e-02' + shape: + - 1024 + sum: '-9.63e-02' +grads.network.model.decoder.layers.8.final_layer_norm.weight: + device: cuda:0 + max: '8.312e-03' + mean: '-3.398e-05' + min: '-2.52e-02' + shape: + - 1024 + sum: '-3.479e-02' +grads.network.model.decoder.layers.8.self_attn.k_proj.bias: + device: cuda:0 + max: '4.657e-10' + mean: '1.157e-12' + min: '-7.567e-10' + shape: + - 1024 + sum: '1.185e-09' +grads.network.model.decoder.layers.8.self_attn.k_proj.weight: + device: cuda:0 + max: '2.660e-02' + mean: '-1.255e-14' + min: '-2.215e-02' + shape: + - 1024 + - 1024 + sum: '-1.315e-08' +grads.network.model.decoder.layers.8.self_attn.out_proj.bias: + device: cuda:0 + max: '8.574e-03' + mean: '-1.091e-11' + min: '-1.133e-02' + shape: + - 1024 + sum: '-1.118e-08' +grads.network.model.decoder.layers.8.self_attn.out_proj.weight: + device: cuda:0 + max: '5.791e-03' + mean: '1.776e-13' + min: '-7.842e-03' + shape: + - 1024 + - 1024 + sum: '1.863e-07' +grads.network.model.decoder.layers.8.self_attn.q_proj.bias: + device: cuda:0 + max: '2.176e-03' + mean: '1.136e-05' + min: '-1.464e-03' + shape: + - 1024 + sum: '1.164e-02' +grads.network.model.decoder.layers.8.self_attn.q_proj.weight: + device: cuda:0 + max: '2.919e-02' + mean: '-1.766e-08' + min: '-3.662e-02' + shape: + - 1024 + - 1024 + sum: '-1.852e-02' +grads.network.model.decoder.layers.8.self_attn.v_proj.bias: + device: cuda:0 + max: '7.759e-03' + mean: '5.574e-05' + min: '-1.002e-02' + shape: + - 1024 + sum: '5.708e-02' +grads.network.model.decoder.layers.8.self_attn.v_proj.weight: + device: cuda:0 + max: '2.583e-01' + mean: '-8.663e-08' + min: '-1.763e-01' + shape: + - 1024 + - 1024 + sum: '-9.083e-02' +grads.network.model.decoder.layers.8.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.934e-03' + mean: '3.720e-05' + min: '-1.170e-02' + shape: + - 1024 + sum: '3.81e-02' +grads.network.model.decoder.layers.8.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.159e-02' + mean: '-3.363e-06' + min: '-1.334e-02' + shape: + - 1024 + sum: '-3.444e-03' +grads.network.model.decoder.layers.9.fc1.bias: + device: cuda:0 + max: '1.084e-02' + mean: '-1.724e-05' + min: '-8.211e-03' + shape: + - 4096 + sum: '-7.062e-02' +grads.network.model.decoder.layers.9.fc1.weight: + device: cuda:0 + max: '1.987e-01' + mean: '-1.661e-08' + min: '-2.721e-01' + shape: + - 4096 + - 1024 + sum: '-6.966e-02' +grads.network.model.decoder.layers.9.fc2.bias: + device: cuda:0 + max: '1.032e-02' + mean: '-7.276e-12' + min: '-1.013e-02' + shape: + - 1024 + sum: '-7.451e-09' +grads.network.model.decoder.layers.9.fc2.weight: + device: cuda:0 + max: '2.487e-02' + mean: '-5.684e-13' + min: '-2.754e-02' + shape: + - 1024 + - 4096 + sum: '-2.384e-06' +grads.network.model.decoder.layers.9.final_layer_norm.bias: + device: cuda:0 + max: '1.148e-02' + mean: '-7.486e-05' + min: '-1.105e-02' + shape: + - 1024 + sum: '-7.665e-02' +grads.network.model.decoder.layers.9.final_layer_norm.weight: + device: cuda:0 + max: '5.081e-02' + mean: '3.829e-06' + min: '-1.181e-02' + shape: + - 1024 + sum: '3.921e-03' +grads.network.model.decoder.layers.9.self_attn.k_proj.bias: + device: cuda:0 + max: '1.397e-09' + mean: '-3.783e-12' + min: '-2.095e-09' + shape: + - 1024 + sum: '-3.874e-09' +grads.network.model.decoder.layers.9.self_attn.k_proj.weight: + device: cuda:0 + max: '1.288e-01' + mean: '2.314e-13' + min: '-1.159e-01' + shape: + - 1024 + - 1024 + sum: '2.427e-07' +grads.network.model.decoder.layers.9.self_attn.out_proj.bias: + device: cuda:0 + max: '9.677e-03' + mean: '-2.183e-11' + min: '-9.679e-03' + shape: + - 1024 + sum: '-2.235e-08' +grads.network.model.decoder.layers.9.self_attn.out_proj.weight: + device: cuda:0 + max: '8.051e-03' + mean: '2.558e-13' + min: '-8.809e-03' + shape: + - 1024 + - 1024 + sum: '2.682e-07' +grads.network.model.decoder.layers.9.self_attn.q_proj.bias: + device: cuda:0 + max: '3.228e-03' + mean: '-6.335e-06' + min: '-4.683e-03' + shape: + - 1024 + sum: '-6.487e-03' +grads.network.model.decoder.layers.9.self_attn.q_proj.weight: + device: cuda:0 + max: '8.449e-02' + mean: '2.055e-08' + min: '-6.571e-02' + shape: + - 1024 + - 1024 + sum: '2.155e-02' +grads.network.model.decoder.layers.9.self_attn.v_proj.bias: + device: cuda:0 + max: '1.115e-02' + mean: '-3.493e-05' + min: '-9.448e-03' + shape: + - 1024 + sum: '-3.577e-02' +grads.network.model.decoder.layers.9.self_attn.v_proj.weight: + device: cuda:0 + max: '2.284e-01' + mean: '1.133e-07' + min: '-2.614e-01' + shape: + - 1024 + - 1024 + sum: '1.188e-01' +grads.network.model.decoder.layers.9.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.015e-02' + mean: '4.447e-05' + min: '-1.010e-02' + shape: + - 1024 + sum: '4.553e-02' +grads.network.model.decoder.layers.9.self_attn_layer_norm.weight: + device: cuda:0 + max: '9.655e-03' + mean: '2.292e-06' + min: '-2.027e-02' + shape: + - 1024 + sum: '2.347e-03' +grads.network.model.decoder.project_in.weight: + device: cuda:0 + max: '2.645e-02' + mean: '-3.396e-07' + min: '-2.839e-02' + shape: + - 1024 + - 512 + sum: '-1.780e-01' +grads.network.model.decoder.project_out.weight: + device: cuda:0 + max: '9.968e-02' + mean: '-3.139e-07' + min: '-1.016e-01' + shape: + - 512 + - 1024 + sum: '-1.646e-01' +outputs.loss: + device: cuda:0 + max: '4.05e+00' + mean: '4.05e+00' + min: '4.05e+00' + shape: [] + sum: '4.05e+00' diff --git a/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml b/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml new file mode 100644 index 00000000..9e7c6ffb --- /dev/null +++ b/.regression_files/project/algorithms/llm_finetuning_test/test_initialization_is_reproducible/cuda/llm_finetuning.yaml @@ -0,0 +1,3261 @@ +network.lm_head.weight: + device: cuda:0 + max: '2.372e-01' + mean: '-1.208e-03' + min: '-2.5e-01' + shape: + - 50272 + - 512 + sum: '-3.109e+04' +network.model.decoder.embed_positions.weight: + device: cuda:0 + max: '1.327e-01' + mean: '1.768e-05' + min: '-1.379e-01' + shape: + - 2050 + - 1024 + sum: '3.711e+01' +network.model.decoder.embed_tokens.weight: + device: cuda:0 + max: '2.372e-01' + mean: '-1.208e-03' + min: '-2.5e-01' + shape: + - 50272 + - 512 + sum: '-3.109e+04' +network.model.decoder.layers.0.fc1.bias: + device: cuda:0 + max: '1.249e-01' + mean: '-2.961e-02' + min: '-1.085e-01' + shape: + - 4096 + sum: '-1.213e+02' +network.model.decoder.layers.0.fc1.weight: + device: cuda:0 + max: '1.25e-01' + mean: '1.667e-04' + min: '-1.251e-01' + shape: + - 4096 + - 1024 + sum: '6.992e+02' +network.model.decoder.layers.0.fc2.bias: + device: cuda:0 + max: '7.88e-02' + mean: '-8.293e-05' + min: '-9.351e-02' + shape: + - 1024 + sum: '-8.492e-02' +network.model.decoder.layers.0.fc2.weight: + device: cuda:0 + max: '1.331e-01' + mean: '5.357e-06' + min: '-1.448e-01' + shape: + - 1024 + - 4096 + sum: '2.247e+01' +network.model.decoder.layers.0.final_layer_norm.bias: + device: cuda:0 + max: '1.256e-01' + mean: '7.015e-03' + min: '-1.204e-01' + shape: + - 1024 + sum: '7.183e+00' +network.model.decoder.layers.0.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.0.self_attn.k_proj.bias: + device: cuda:0 + max: '3.125e-02' + mean: '3.414e-04' + min: '-3.123e-02' + shape: + - 1024 + sum: '3.496e-01' +network.model.decoder.layers.0.self_attn.k_proj.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-4.626e-05' + min: '-1.256e-01' + shape: + - 1024 + - 1024 + sum: '-4.850e+01' +network.model.decoder.layers.0.self_attn.out_proj.bias: + device: cuda:0 + max: '1.579e-02' + mean: '-2.766e-05' + min: '-1.138e-02' + shape: + - 1024 + sum: '-2.833e-02' +network.model.decoder.layers.0.self_attn.out_proj.weight: + device: cuda:0 + max: '1.283e-01' + mean: '-6.181e-06' + min: '-1.295e-01' + shape: + - 1024 + - 1024 + sum: '-6.481e+00' +network.model.decoder.layers.0.self_attn.q_proj.bias: + device: cuda:0 + max: '1.282e-01' + mean: '1.180e-03' + min: '-1.271e-01' + shape: + - 1024 + sum: '1.208e+00' +network.model.decoder.layers.0.self_attn.q_proj.weight: + device: cuda:0 + max: '1.267e-01' + mean: '-5.663e-05' + min: '-1.267e-01' + shape: + - 1024 + - 1024 + sum: '-5.938e+01' +network.model.decoder.layers.0.self_attn.v_proj.bias: + device: cuda:0 + max: '2.769e-02' + mean: '-2.715e-05' + min: '-2.669e-02' + shape: + - 1024 + sum: '-2.780e-02' +network.model.decoder.layers.0.self_attn.v_proj.weight: + device: cuda:0 + max: '8.795e-02' + mean: '1.917e-06' + min: '-8.508e-02' + shape: + - 1024 + - 1024 + sum: '2.011e+00' +network.model.decoder.layers.0.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.271e-01' + mean: '-2.03e-03' + min: '-1.248e-01' + shape: + - 1024 + sum: '-2.079e+00' +network.model.decoder.layers.0.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.1.fc1.bias: + device: cuda:0 + max: '1.236e-01' + mean: '-2.428e-02' + min: '-8.075e-02' + shape: + - 4096 + sum: '-9.946e+01' +network.model.decoder.layers.1.fc1.weight: + device: cuda:0 + max: '1.254e-01' + mean: '1.85e-04' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '7.759e+02' +network.model.decoder.layers.1.fc2.bias: + device: cuda:0 + max: '8.911e-02' + mean: '2.946e-04' + min: '-8.362e-02' + shape: + - 1024 + sum: '3.017e-01' +network.model.decoder.layers.1.fc2.weight: + device: cuda:0 + max: '1.321e-01' + mean: '-2.468e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.035e+01' +network.model.decoder.layers.1.final_layer_norm.bias: + device: cuda:0 + max: '1.256e-01' + mean: '8.647e-03' + min: '-1.198e-01' + shape: + - 1024 + sum: '8.855e+00' +network.model.decoder.layers.1.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.1.self_attn.k_proj.bias: + device: cuda:0 + max: '7.153e-02' + mean: '7.902e-03' + min: '-7.874e-02' + shape: + - 1024 + sum: '8.092e+00' +network.model.decoder.layers.1.self_attn.k_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.284e-05' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: '-1.346e+01' +network.model.decoder.layers.1.self_attn.out_proj.bias: + device: cuda:0 + max: '8.606e-02' + mean: '-1.118e-04' + min: '-7.031e-02' + shape: + - 1024 + sum: '-1.144e-01' +network.model.decoder.layers.1.self_attn.out_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '1.676e-06' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: '1.758e+00' +network.model.decoder.layers.1.self_attn.q_proj.bias: + device: cuda:0 + max: '1.254e-01' + mean: '-1.557e-03' + min: '-1.252e-01' + shape: + - 1024 + sum: '-1.595e+00' +network.model.decoder.layers.1.self_attn.q_proj.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-3.561e-05' + min: '-1.26e-01' + shape: + - 1024 + - 1024 + sum: '-3.734e+01' +network.model.decoder.layers.1.self_attn.v_proj.bias: + device: cuda:0 + max: '5.002e-02' + mean: '3.967e-04' + min: '-4.831e-02' + shape: + - 1024 + sum: '4.062e-01' +network.model.decoder.layers.1.self_attn.v_proj.weight: + device: cuda:0 + max: '1.092e-01' + mean: '1.417e-05' + min: '-1.07e-01' + shape: + - 1024 + - 1024 + sum: '1.486e+01' +network.model.decoder.layers.1.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.304e-01' + mean: '-2.029e-03' + min: '-1.248e-01' + shape: + - 1024 + sum: '-2.078e+00' +network.model.decoder.layers.1.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.10.fc1.bias: + device: cuda:0 + max: '5.505e-02' + mean: '-2.099e-02' + min: '-8.49e-02' + shape: + - 4096 + sum: '-8.599e+01' +network.model.decoder.layers.10.fc1.weight: + device: cuda:0 + max: '1.27e-01' + mean: '1.603e-05' + min: '-1.296e-01' + shape: + - 4096 + - 1024 + sum: '6.723e+01' +network.model.decoder.layers.10.fc2.bias: + device: cuda:0 + max: '6.293e-02' + mean: '-1.937e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.983e-01' +network.model.decoder.layers.10.fc2.weight: + device: cuda:0 + max: '1.281e-01' + mean: '-1.624e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-6.81e+00' +network.model.decoder.layers.10.final_layer_norm.bias: + device: cuda:0 + max: '8.020e-02' + mean: '-9.374e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-9.599e+00' +network.model.decoder.layers.10.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.10.self_attn.k_proj.bias: + device: cuda:0 + max: '7.422e-02' + mean: '7.871e-03' + min: '-7.428e-02' + shape: + - 1024 + sum: '8.06e+00' +network.model.decoder.layers.10.self_attn.k_proj.weight: + device: cuda:0 + max: '1.318e-01' + mean: '-1.478e-05' + min: '-1.285e-01' + shape: + - 1024 + - 1024 + sum: '-1.55e+01' +network.model.decoder.layers.10.self_attn.out_proj.bias: + device: cuda:0 + max: '7.031e-02' + mean: '-2.308e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.363e-02' +network.model.decoder.layers.10.self_attn.out_proj.weight: + device: cuda:0 + max: '1.321e-01' + mean: '1.384e-06' + min: '-1.316e-01' + shape: + - 1024 + - 1024 + sum: '1.452e+00' +network.model.decoder.layers.10.self_attn.q_proj.bias: + device: cuda:0 + max: '1.089e-01' + mean: '-1.708e-03' + min: '-1.009e-01' + shape: + - 1024 + sum: '-1.749e+00' +network.model.decoder.layers.10.self_attn.q_proj.weight: + device: cuda:0 + max: '1.300e-01' + mean: '5.200e-06' + min: '-1.311e-01' + shape: + - 1024 + - 1024 + sum: '5.453e+00' +network.model.decoder.layers.10.self_attn.v_proj.bias: + device: cuda:0 + max: '5.096e-02' + mean: '3.204e-04' + min: '-5.444e-02' + shape: + - 1024 + sum: '3.281e-01' +network.model.decoder.layers.10.self_attn.v_proj.weight: + device: cuda:0 + max: '1.241e-01' + mean: '1.173e-05' + min: '-1.152e-01' + shape: + - 1024 + - 1024 + sum: '1.229e+01' +network.model.decoder.layers.10.self_attn_layer_norm.bias: + device: cuda:0 + max: '8.594e-02' + mean: '1.188e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.217e+00' +network.model.decoder.layers.10.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.11.fc1.bias: + device: cuda:0 + max: '6.107e-02' + mean: '-2.344e-02' + min: '-8.850e-02' + shape: + - 4096 + sum: '-9.601e+01' +network.model.decoder.layers.11.fc1.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-1.888e-04' + min: '-1.263e-01' + shape: + - 4096 + - 1024 + sum: '-7.920e+02' +network.model.decoder.layers.11.fc2.bias: + device: cuda:0 + max: '6.47e-02' + mean: '1.148e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.176e-01' +network.model.decoder.layers.11.fc2.weight: + device: cuda:0 + max: '1.26e-01' + mean: '3.113e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '1.306e+00' +network.model.decoder.layers.11.final_layer_norm.bias: + device: cuda:0 + max: '7.886e-02' + mean: '-1.455e-02' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.489e+01' +network.model.decoder.layers.11.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.11.self_attn.k_proj.bias: + device: cuda:0 + max: '7.074e-02' + mean: '5.886e-03' + min: '-6.482e-02' + shape: + - 1024 + sum: '6.027e+00' +network.model.decoder.layers.11.self_attn.k_proj.weight: + device: cuda:0 + max: '1.331e-01' + mean: '1.017e-05' + min: '-1.31e-01' + shape: + - 1024 + - 1024 + sum: '1.066e+01' +network.model.decoder.layers.11.self_attn.out_proj.bias: + device: cuda:0 + max: '6.311e-02' + mean: '-3.316e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-3.396e-01' +network.model.decoder.layers.11.self_attn.out_proj.weight: + device: cuda:0 + max: '1.514e-01' + mean: '1.601e-05' + min: '-1.647e-01' + shape: + - 1024 + - 1024 + sum: '1.679e+01' +network.model.decoder.layers.11.self_attn.q_proj.bias: + device: cuda:0 + max: '1.105e-01' + mean: '-2.709e-03' + min: '-1.172e-01' + shape: + - 1024 + sum: '-2.774e+00' +network.model.decoder.layers.11.self_attn.q_proj.weight: + device: cuda:0 + max: '1.287e-01' + mean: '5.092e-06' + min: '-1.26e-01' + shape: + - 1024 + - 1024 + sum: '5.339e+00' +network.model.decoder.layers.11.self_attn.v_proj.bias: + device: cuda:0 + max: '3.922e-02' + mean: '4.083e-04' + min: '-4.712e-02' + shape: + - 1024 + sum: '4.180e-01' +network.model.decoder.layers.11.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '-8.525e-05' + min: '-1.197e-01' + shape: + - 1024 + - 1024 + sum: '-8.939e+01' +network.model.decoder.layers.11.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.046e-01' + mean: '4.110e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.209e+00' +network.model.decoder.layers.11.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.12.fc1.bias: + device: cuda:0 + max: '7.367e-02' + mean: '-2.188e-02' + min: '-7.434e-02' + shape: + - 4096 + sum: '-8.961e+01' +network.model.decoder.layers.12.fc1.weight: + device: cuda:0 + max: '1.274e-01' + mean: '-2.221e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-9.314e+02' +network.model.decoder.layers.12.fc2.bias: + device: cuda:0 + max: '7.233e-02' + mean: '-3.044e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-3.118e-01' +network.model.decoder.layers.12.fc2.weight: + device: cuda:0 + max: '1.265e-01' + mean: '1.128e-07' + min: '-1.393e-01' + shape: + - 1024 + - 4096 + sum: '4.732e-01' +network.model.decoder.layers.12.final_layer_norm.bias: + device: cuda:0 + max: '1.241e-01' + mean: '-1.53e-02' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.566e+01' +network.model.decoder.layers.12.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.12.self_attn.k_proj.bias: + device: cuda:0 + max: '1.177e-01' + mean: '6.118e-03' + min: '-8.82e-02' + shape: + - 1024 + sum: '6.265e+00' +network.model.decoder.layers.12.self_attn.k_proj.weight: + device: cuda:0 + max: '1.274e-01' + mean: '2.051e-05' + min: '-1.263e-01' + shape: + - 1024 + - 1024 + sum: '2.151e+01' +network.model.decoder.layers.12.self_attn.out_proj.bias: + device: cuda:0 + max: '6.604e-02' + mean: '-4.053e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-4.151e-01' +network.model.decoder.layers.12.self_attn.out_proj.weight: + device: cuda:0 + max: '1.273e-01' + mean: '6.458e-06' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '6.772e+00' +network.model.decoder.layers.12.self_attn.q_proj.bias: + device: cuda:0 + max: '1.249e-01' + mean: '3.377e-04' + min: '-1.248e-01' + shape: + - 1024 + sum: '3.458e-01' +network.model.decoder.layers.12.self_attn.q_proj.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-4.44e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-4.655e+01' +network.model.decoder.layers.12.self_attn.v_proj.bias: + device: cuda:0 + max: '5.71e-02' + mean: '1.127e-04' + min: '-4.361e-02' + shape: + - 1024 + sum: '1.155e-01' +network.model.decoder.layers.12.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '5.265e-05' + min: '-1.251e-01' + shape: + - 1024 + - 1024 + sum: '5.521e+01' +network.model.decoder.layers.12.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.025e-01' + mean: '4.391e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.497e+00' +network.model.decoder.layers.12.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.13.fc1.bias: + device: cuda:0 + max: '9.039e-02' + mean: '-2.392e-02' + min: '-7.361e-02' + shape: + - 4096 + sum: '-9.798e+01' +network.model.decoder.layers.13.fc1.weight: + device: cuda:0 + max: '1.263e-01' + mean: '-2.766e-04' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '-1.160e+03' +network.model.decoder.layers.13.fc2.bias: + device: cuda:0 + max: '7.214e-02' + mean: '2.524e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.584e-01' +network.model.decoder.layers.13.fc2.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-2.636e-06' + min: '-1.754e-01' + shape: + - 1024 + - 4096 + sum: '-1.106e+01' +network.model.decoder.layers.13.final_layer_norm.bias: + device: cuda:0 + max: '1.246e-01' + mean: '-2.340e-02' + min: '-1.254e-01' + shape: + - 1024 + sum: '-2.396e+01' +network.model.decoder.layers.13.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.13.self_attn.k_proj.bias: + device: cuda:0 + max: '7.465e-02' + mean: '5.789e-03' + min: '-7.758e-02' + shape: + - 1024 + sum: '5.928e+00' +network.model.decoder.layers.13.self_attn.k_proj.weight: + device: cuda:0 + max: '1.281e-01' + mean: '3.542e-05' + min: '-1.283e-01' + shape: + - 1024 + - 1024 + sum: '3.714e+01' +network.model.decoder.layers.13.self_attn.out_proj.bias: + device: cuda:0 + max: '6.506e-02' + mean: '-2.055e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.104e-01' +network.model.decoder.layers.13.self_attn.out_proj.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.117e-05' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '-1.171e+01' +network.model.decoder.layers.13.self_attn.q_proj.bias: + device: cuda:0 + max: '1.247e-01' + mean: '-2.867e-03' + min: '-1.138e-01' + shape: + - 1024 + sum: '-2.936e+00' +network.model.decoder.layers.13.self_attn.q_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '3.923e-05' + min: '-1.273e-01' + shape: + - 1024 + - 1024 + sum: '4.114e+01' +network.model.decoder.layers.13.self_attn.v_proj.bias: + device: cuda:0 + max: '4.150e-02' + mean: '-2.426e-04' + min: '-4.178e-02' + shape: + - 1024 + sum: '-2.485e-01' +network.model.decoder.layers.13.self_attn.v_proj.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-6.461e-05' + min: '-1.251e-01' + shape: + - 1024 + - 1024 + sum: '-6.775e+01' +network.model.decoder.layers.13.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.247e-01' + mean: '3.063e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.137e+00' +network.model.decoder.layers.13.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.14.fc1.bias: + device: cuda:0 + max: '6.329e-02' + mean: '-2.279e-02' + min: '-6.866e-02' + shape: + - 4096 + sum: '-9.333e+01' +network.model.decoder.layers.14.fc1.weight: + device: cuda:0 + max: '1.261e-01' + mean: '-1.687e-04' + min: '-1.256e-01' + shape: + - 4096 + - 1024 + sum: '-7.075e+02' +network.model.decoder.layers.14.fc2.bias: + device: cuda:0 + max: '8.209e-02' + mean: '2.395e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.453e-01' +network.model.decoder.layers.14.fc2.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-1.073e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-4.501e+00' +network.model.decoder.layers.14.final_layer_norm.bias: + device: cuda:0 + max: '1.249e-01' + mean: '-2.171e-02' + min: '-1.277e-01' + shape: + - 1024 + sum: '-2.223e+01' +network.model.decoder.layers.14.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.14.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '4.583e-03' + min: '-1.03e-01' + shape: + - 1024 + sum: '4.693e+00' +network.model.decoder.layers.14.self_attn.k_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '3.023e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '3.170e+01' +network.model.decoder.layers.14.self_attn.out_proj.bias: + device: cuda:0 + max: '6.335e-02' + mean: '-2.293e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.348e-01' +network.model.decoder.layers.14.self_attn.out_proj.weight: + device: cuda:0 + max: '1.292e-01' + mean: '-1.601e-05' + min: '-1.316e-01' + shape: + - 1024 + - 1024 + sum: '-1.679e+01' +network.model.decoder.layers.14.self_attn.q_proj.bias: + device: cuda:0 + max: '1.237e-01' + mean: '-1.509e-03' + min: '-1.181e-01' + shape: + - 1024 + sum: '-1.546e+00' +network.model.decoder.layers.14.self_attn.q_proj.weight: + device: cuda:0 + max: '1.263e-01' + mean: '3.587e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '3.761e+01' +network.model.decoder.layers.14.self_attn.v_proj.bias: + device: cuda:0 + max: '4.108e-02' + mean: '4.279e-04' + min: '-3.915e-02' + shape: + - 1024 + sum: '4.381e-01' +network.model.decoder.layers.14.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '6.315e-06' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '6.622e+00' +network.model.decoder.layers.14.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '9.48e-04' + min: '-1.285e-01' + shape: + - 1024 + sum: '9.707e-01' +network.model.decoder.layers.14.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.15.fc1.bias: + device: cuda:0 + max: '6.256e-02' + mean: '-2.178e-02' + min: '-7.373e-02' + shape: + - 4096 + sum: '-8.921e+01' +network.model.decoder.layers.15.fc1.weight: + device: cuda:0 + max: '1.262e-01' + mean: '-2.048e-04' + min: '-1.274e-01' + shape: + - 4096 + - 1024 + sum: '-8.590e+02' +network.model.decoder.layers.15.fc2.bias: + device: cuda:0 + max: '7.629e-02' + mean: '-2.647e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.711e-01' +network.model.decoder.layers.15.fc2.weight: + device: cuda:0 + max: '1.273e-01' + mean: '-1.300e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-5.454e+00' +network.model.decoder.layers.15.final_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '-2.09e-02' + min: '-1.271e-01' + shape: + - 1024 + sum: '-2.14e+01' +network.model.decoder.layers.15.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.15.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '5.291e-03' + min: '-8.069e-02' + shape: + - 1024 + sum: '5.418e+00' +network.model.decoder.layers.15.self_attn.k_proj.weight: + device: cuda:0 + max: '1.259e-01' + mean: '3.431e-05' + min: '-1.272e-01' + shape: + - 1024 + - 1024 + sum: '3.598e+01' +network.model.decoder.layers.15.self_attn.out_proj.bias: + device: cuda:0 + max: '6.873e-02' + mean: '2.003e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.051e-02' +network.model.decoder.layers.15.self_attn.out_proj.weight: + device: cuda:0 + max: '1.798e-01' + mean: '1.003e-06' + min: '-1.726e-01' + shape: + - 1024 + - 1024 + sum: '1.052e+00' +network.model.decoder.layers.15.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.456e-03' + min: '-1.242e-01' + shape: + - 1024 + sum: '1.491e+00' +network.model.decoder.layers.15.self_attn.q_proj.weight: + device: cuda:0 + max: '1.271e-01' + mean: '-2.108e-05' + min: '-1.259e-01' + shape: + - 1024 + - 1024 + sum: '-2.21e+01' +network.model.decoder.layers.15.self_attn.v_proj.bias: + device: cuda:0 + max: '4.312e-02' + mean: '-6.573e-04' + min: '-4.214e-02' + shape: + - 1024 + sum: '-6.731e-01' +network.model.decoder.layers.15.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '-1.231e-04' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-1.291e+02' +network.model.decoder.layers.15.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.033e-03' + min: '-1.627e-01' + shape: + - 1024 + sum: '1.058e+00' +network.model.decoder.layers.15.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.16.fc1.bias: + device: cuda:0 + max: '1.138e-01' + mean: '-2.057e-02' + min: '-8.105e-02' + shape: + - 4096 + sum: '-8.427e+01' +network.model.decoder.layers.16.fc1.weight: + device: cuda:0 + max: '1.261e-01' + mean: '-1.731e-04' + min: '-1.263e-01' + shape: + - 4096 + - 1024 + sum: '-7.259e+02' +network.model.decoder.layers.16.fc2.bias: + device: cuda:0 + max: '7.257e-02' + mean: '-1.059e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.085e-01' +network.model.decoder.layers.16.fc2.weight: + device: cuda:0 + max: '1.387e-01' + mean: '-4.515e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.894e+01' +network.model.decoder.layers.16.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.704e-02' + min: '-1.285e-01' + shape: + - 1024 + sum: '-1.745e+01' +network.model.decoder.layers.16.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.16.self_attn.k_proj.bias: + device: cuda:0 + max: '1.117e-01' + mean: '6.356e-03' + min: '-9.009e-02' + shape: + - 1024 + sum: '6.508e+00' +network.model.decoder.layers.16.self_attn.k_proj.weight: + device: cuda:0 + max: '1.27e-01' + mean: '-1.634e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '-1.713e+01' +network.model.decoder.layers.16.self_attn.out_proj.bias: + device: cuda:0 + max: '8.398e-02' + mean: '4.806e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.921e-02' +network.model.decoder.layers.16.self_attn.out_proj.weight: + device: cuda:0 + max: '1.553e-01' + mean: '-3.501e-06' + min: '-1.626e-01' + shape: + - 1024 + - 1024 + sum: '-3.671e+00' +network.model.decoder.layers.16.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.884e-04' + min: '-1.246e-01' + shape: + - 1024 + sum: '-1.929e-01' +network.model.decoder.layers.16.self_attn.q_proj.weight: + device: cuda:0 + max: '1.261e-01' + mean: '2.789e-06' + min: '-1.278e-01' + shape: + - 1024 + - 1024 + sum: '2.924e+00' +network.model.decoder.layers.16.self_attn.v_proj.bias: + device: cuda:0 + max: '4.462e-02' + mean: '-7.8e-04' + min: '-4.309e-02' + shape: + - 1024 + sum: '-7.987e-01' +network.model.decoder.layers.16.self_attn.v_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-9.28e-05' + min: '-1.259e-01' + shape: + - 1024 + - 1024 + sum: '-9.731e+01' +network.model.decoder.layers.16.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.252e-01' + mean: '1.154e-03' + min: '-2.112e-01' + shape: + - 1024 + sum: '1.182e+00' +network.model.decoder.layers.16.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.17.fc1.bias: + device: cuda:0 + max: '1.113e-01' + mean: '-2.007e-02' + min: '-7.483e-02' + shape: + - 4096 + sum: '-8.219e+01' +network.model.decoder.layers.17.fc1.weight: + device: cuda:0 + max: '1.27e-01' + mean: '-1.176e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-4.934e+02' +network.model.decoder.layers.17.fc2.bias: + device: cuda:0 + max: '6.415e-02' + mean: '2.448e-06' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.507e-03' +network.model.decoder.layers.17.fc2.weight: + device: cuda:0 + max: '1.431e-01' + mean: '-1.922e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-8.062e+00' +network.model.decoder.layers.17.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.363e-02' + min: '-1.307e-01' + shape: + - 1024 + sum: '-1.396e+01' +network.model.decoder.layers.17.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.17.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.524e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.609e+00' +network.model.decoder.layers.17.self_attn.k_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-6.266e-06' + min: '-1.268e-01' + shape: + - 1024 + - 1024 + sum: '-6.571e+00' +network.model.decoder.layers.17.self_attn.out_proj.bias: + device: cuda:0 + max: '8.557e-02' + mean: '7.932e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.123e-02' +network.model.decoder.layers.17.self_attn.out_proj.weight: + device: cuda:0 + max: '1.682e-01' + mean: '1.080e-05' + min: '-1.591e-01' + shape: + - 1024 + - 1024 + sum: '1.133e+01' +network.model.decoder.layers.17.self_attn.q_proj.bias: + device: cuda:0 + max: '1.081e-01' + mean: '8.627e-04' + min: '-1.006e-01' + shape: + - 1024 + sum: '8.834e-01' +network.model.decoder.layers.17.self_attn.q_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-1.448e-05' + min: '-1.262e-01' + shape: + - 1024 + - 1024 + sum: '-1.518e+01' +network.model.decoder.layers.17.self_attn.v_proj.bias: + device: cuda:0 + max: '4.285e-02' + mean: '4.112e-04' + min: '-4.175e-02' + shape: + - 1024 + sum: '4.211e-01' +network.model.decoder.layers.17.self_attn.v_proj.weight: + device: cuda:0 + max: '1.254e-01' + mean: '-1.06e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-1.111e+01' +network.model.decoder.layers.17.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.74e-04' + min: '-1.978e-01' + shape: + - 1024 + sum: '1.781e-01' +network.model.decoder.layers.17.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.18.fc1.bias: + device: cuda:0 + max: '6.793e-02' + mean: '-1.838e-02' + min: '-8.258e-02' + shape: + - 4096 + sum: '-7.527e+01' +network.model.decoder.layers.18.fc1.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.719e-04' + min: '-1.256e-01' + shape: + - 4096 + - 1024 + sum: '-7.209e+02' +network.model.decoder.layers.18.fc2.bias: + device: cuda:0 + max: '6.201e-02' + mean: '-3.286e-06' + min: '-1.06e-01' + shape: + - 1024 + sum: '-3.364e-03' +network.model.decoder.layers.18.fc2.weight: + device: cuda:0 + max: '1.271e-01' + mean: '2.113e-06' + min: '-1.885e-01' + shape: + - 1024 + - 4096 + sum: '8.863e+00' +network.model.decoder.layers.18.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.239e-02' + min: '-1.262e-01' + shape: + - 1024 + sum: '-1.268e+01' +network.model.decoder.layers.18.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.18.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '5.307e-03' + min: '-1.218e-01' + shape: + - 1024 + sum: '5.434e+00' +network.model.decoder.layers.18.self_attn.k_proj.weight: + device: cuda:0 + max: '1.26e-01' + mean: '1.154e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '1.210e+01' +network.model.decoder.layers.18.self_attn.out_proj.bias: + device: cuda:0 + max: '7.617e-02' + mean: '-8.257e-06' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.455e-03' +network.model.decoder.layers.18.self_attn.out_proj.weight: + device: cuda:0 + max: '1.453e-01' + mean: '-6.184e-06' + min: '-1.554e-01' + shape: + - 1024 + - 1024 + sum: '-6.484e+00' +network.model.decoder.layers.18.self_attn.q_proj.bias: + device: cuda:0 + max: '1.002e-01' + mean: '-2.302e-03' + min: '-1.179e-01' + shape: + - 1024 + sum: '-2.357e+00' +network.model.decoder.layers.18.self_attn.q_proj.weight: + device: cuda:0 + max: '1.274e-01' + mean: '-2.129e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '-2.233e+01' +network.model.decoder.layers.18.self_attn.v_proj.bias: + device: cuda:0 + max: '4.874e-02' + mean: '-1.296e-04' + min: '-4.315e-02' + shape: + - 1024 + sum: '-1.327e-01' +network.model.decoder.layers.18.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '-5.472e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-5.738e+01' +network.model.decoder.layers.18.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.729e-03' + min: '-1.528e-01' + shape: + - 1024 + sum: '1.771e+00' +network.model.decoder.layers.18.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.19.fc1.bias: + device: cuda:0 + max: '9.674e-02' + mean: '-1.617e-02' + min: '-7.123e-02' + shape: + - 4096 + sum: '-6.623e+01' +network.model.decoder.layers.19.fc1.weight: + device: cuda:0 + max: '1.276e-01' + mean: '-1.816e-04' + min: '-1.266e-01' + shape: + - 4096 + - 1024 + sum: '-7.616e+02' +network.model.decoder.layers.19.fc2.bias: + device: cuda:0 + max: '6.439e-02' + mean: '-2.292e-04' + min: '-7.587e-02' + shape: + - 1024 + sum: '-2.347e-01' +network.model.decoder.layers.19.fc2.weight: + device: cuda:0 + max: '1.273e-01' + mean: '6.639e-06' + min: '-1.782e-01' + shape: + - 1024 + - 4096 + sum: '2.785e+01' +network.model.decoder.layers.19.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-9.252e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-9.474e+00' +network.model.decoder.layers.19.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.19.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '7.829e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.017e+00' +network.model.decoder.layers.19.self_attn.k_proj.weight: + device: cuda:0 + max: '1.265e-01' + mean: '-2.187e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '-2.294e+01' +network.model.decoder.layers.19.self_attn.out_proj.bias: + device: cuda:0 + max: '6.445e-02' + mean: '2.324e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.380e-01' +network.model.decoder.layers.19.self_attn.out_proj.weight: + device: cuda:0 + max: '1.454e-01' + mean: '-5.801e-08' + min: '-1.431e-01' + shape: + - 1024 + - 1024 + sum: '-6.083e-02' +network.model.decoder.layers.19.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '-2.284e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.338e+00' +network.model.decoder.layers.19.self_attn.q_proj.weight: + device: cuda:0 + max: '1.276e-01' + mean: '8.971e-05' + min: '-1.281e-01' + shape: + - 1024 + - 1024 + sum: '9.406e+01' +network.model.decoder.layers.19.self_attn.v_proj.bias: + device: cuda:0 + max: '4.413e-02' + mean: '-1.693e-04' + min: '-4.315e-02' + shape: + - 1024 + sum: '-1.733e-01' +network.model.decoder.layers.19.self_attn.v_proj.weight: + device: cuda:0 + max: '1.249e-01' + mean: '-6.37e-05' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-6.679e+01' +network.model.decoder.layers.19.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.325e-03' + min: '-1.936e-01' + shape: + - 1024 + sum: '3.405e+00' +network.model.decoder.layers.19.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.2.fc1.bias: + device: cuda:0 + max: '7.135e-02' + mean: '-2.341e-02' + min: '-6.665e-02' + shape: + - 4096 + sum: '-9.591e+01' +network.model.decoder.layers.2.fc1.weight: + device: cuda:0 + max: '1.25e-01' + mean: '2.334e-04' + min: '-1.255e-01' + shape: + - 4096 + - 1024 + sum: '9.791e+02' +network.model.decoder.layers.2.fc2.bias: + device: cuda:0 + max: '7.172e-02' + mean: '3.129e-04' + min: '-7.66e-02' + shape: + - 1024 + sum: '3.204e-01' +network.model.decoder.layers.2.fc2.weight: + device: cuda:0 + max: '1.294e-01' + mean: '-1.695e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-7.109e+00' +network.model.decoder.layers.2.final_layer_norm.bias: + device: cuda:0 + max: '1.257e-01' + mean: '9.144e-03' + min: '-1.251e-01' + shape: + - 1024 + sum: '9.364e+00' +network.model.decoder.layers.2.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.2.self_attn.k_proj.bias: + device: cuda:0 + max: '6.384e-02' + mean: '8.869e-03' + min: '-6.445e-02' + shape: + - 1024 + sum: '9.082e+00' +network.model.decoder.layers.2.self_attn.k_proj.weight: + device: cuda:0 + max: '1.292e-01' + mean: '2.489e-05' + min: '-1.265e-01' + shape: + - 1024 + - 1024 + sum: '2.61e+01' +network.model.decoder.layers.2.self_attn.out_proj.bias: + device: cuda:0 + max: '1.234e-01' + mean: '3.411e-04' + min: '-8.948e-02' + shape: + - 1024 + sum: '3.493e-01' +network.model.decoder.layers.2.self_attn.out_proj.weight: + device: cuda:0 + max: '1.317e-01' + mean: '-6.495e-06' + min: '-1.283e-01' + shape: + - 1024 + - 1024 + sum: '-6.811e+00' +network.model.decoder.layers.2.self_attn.q_proj.bias: + device: cuda:0 + max: '1.249e-01' + mean: '9.792e-04' + min: '-1.255e-01' + shape: + - 1024 + sum: '1.003e+00' +network.model.decoder.layers.2.self_attn.q_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '1.202e-05' + min: '-1.271e-01' + shape: + - 1024 + - 1024 + sum: '1.260e+01' +network.model.decoder.layers.2.self_attn.v_proj.bias: + device: cuda:0 + max: '4.211e-02' + mean: '-9.478e-05' + min: '-3.799e-02' + shape: + - 1024 + sum: '-9.706e-02' +network.model.decoder.layers.2.self_attn.v_proj.weight: + device: cuda:0 + max: '1.234e-01' + mean: '3.971e-05' + min: '-1.171e-01' + shape: + - 1024 + - 1024 + sum: '4.164e+01' +network.model.decoder.layers.2.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.309e-01' + mean: '-1.911e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.957e+00' +network.model.decoder.layers.2.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.20.fc1.bias: + device: cuda:0 + max: '7.928e-02' + mean: '-1.524e-02' + min: '-7.220e-02' + shape: + - 4096 + sum: '-6.244e+01' +network.model.decoder.layers.20.fc1.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.853e-04' + min: '-1.271e-01' + shape: + - 4096 + - 1024 + sum: '-7.770e+02' +network.model.decoder.layers.20.fc2.bias: + device: cuda:0 + max: '6.787e-02' + mean: '-1.132e-04' + min: '-7.617e-02' + shape: + - 1024 + sum: '-1.159e-01' +network.model.decoder.layers.20.fc2.weight: + device: cuda:0 + max: '1.27e-01' + mean: '6.366e-06' + min: '-2.393e-01' + shape: + - 1024 + - 4096 + sum: '2.670e+01' +network.model.decoder.layers.20.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-9.149e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-9.369e+00' +network.model.decoder.layers.20.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.20.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.126e-02' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.153e+01' +network.model.decoder.layers.20.self_attn.k_proj.weight: + device: cuda:0 + max: '1.356e-01' + mean: '4.825e-05' + min: '-1.333e-01' + shape: + - 1024 + - 1024 + sum: '5.059e+01' +network.model.decoder.layers.20.self_attn.out_proj.bias: + device: cuda:0 + max: '6.512e-02' + mean: '-8.754e-05' + min: '-1.215e-01' + shape: + - 1024 + sum: '-8.964e-02' +network.model.decoder.layers.20.self_attn.out_proj.weight: + device: cuda:0 + max: '1.334e-01' + mean: '8.321e-06' + min: '-1.311e-01' + shape: + - 1024 + - 1024 + sum: '8.725e+00' +network.model.decoder.layers.20.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '-2.386e-03' + min: '-1.256e-01' + shape: + - 1024 + sum: '-2.444e+00' +network.model.decoder.layers.20.self_attn.q_proj.weight: + device: cuda:0 + max: '1.278e-01' + mean: '1.178e-07' + min: '-1.279e-01' + shape: + - 1024 + - 1024 + sum: '1.235e-01' +network.model.decoder.layers.20.self_attn.v_proj.bias: + device: cuda:0 + max: '4.395e-02' + mean: '-3.544e-04' + min: '-4.248e-02' + shape: + - 1024 + sum: '-3.629e-01' +network.model.decoder.layers.20.self_attn.v_proj.weight: + device: cuda:0 + max: '1.246e-01' + mean: '1.676e-06' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '1.757e+00' +network.model.decoder.layers.20.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.003e-03' + min: '-1.256e-01' + shape: + - 1024 + sum: '3.075e+00' +network.model.decoder.layers.20.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.21.fc1.bias: + device: cuda:0 + max: '8.362e-02' + mean: '-1.634e-02' + min: '-9.613e-02' + shape: + - 4096 + sum: '-6.693e+01' +network.model.decoder.layers.21.fc1.weight: + device: cuda:0 + max: '1.289e-01' + mean: '-1.814e-04' + min: '-1.299e-01' + shape: + - 4096 + - 1024 + sum: '-7.611e+02' +network.model.decoder.layers.21.fc2.bias: + device: cuda:0 + max: '9.045e-02' + mean: '5.474e-05' + min: '-7.306e-02' + shape: + - 1024 + sum: '5.605e-02' +network.model.decoder.layers.21.fc2.weight: + device: cuda:0 + max: '1.322e-01' + mean: '3.575e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '1.499e+00' +network.model.decoder.layers.21.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-5.773e-03' + min: '-1.249e-01' + shape: + - 1024 + sum: '-5.912e+00' +network.model.decoder.layers.21.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.21.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '9.81e-03' + min: '-1.318e-01' + shape: + - 1024 + sum: '1.005e+01' +network.model.decoder.layers.21.self_attn.k_proj.weight: + device: cuda:0 + max: '1.425e-01' + mean: '-2.337e-05' + min: '-1.454e-01' + shape: + - 1024 + - 1024 + sum: '-2.450e+01' +network.model.decoder.layers.21.self_attn.out_proj.bias: + device: cuda:0 + max: '7.263e-02' + mean: '-6.624e-05' + min: '-9.937e-02' + shape: + - 1024 + sum: '-6.783e-02' +network.model.decoder.layers.21.self_attn.out_proj.weight: + device: cuda:0 + max: '1.294e-01' + mean: '1.762e-06' + min: '-1.285e-01' + shape: + - 1024 + - 1024 + sum: '1.847e+00' +network.model.decoder.layers.21.self_attn.q_proj.bias: + device: cuda:0 + max: '1.257e-01' + mean: '-1.89e-03' + min: '-1.257e-01' + shape: + - 1024 + sum: '-1.935e+00' +network.model.decoder.layers.21.self_attn.q_proj.weight: + device: cuda:0 + max: '1.327e-01' + mean: '-1.882e-05' + min: '-1.31e-01' + shape: + - 1024 + - 1024 + sum: '-1.974e+01' +network.model.decoder.layers.21.self_attn.v_proj.bias: + device: cuda:0 + max: '4.669e-02' + mean: '-2.74e-04' + min: '-4.211e-02' + shape: + - 1024 + sum: '-2.806e-01' +network.model.decoder.layers.21.self_attn.v_proj.weight: + device: cuda:0 + max: '1.25e-01' + mean: '-7.892e-05' + min: '-1.249e-01' + shape: + - 1024 + - 1024 + sum: '-8.276e+01' +network.model.decoder.layers.21.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '3.155e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.231e+00' +network.model.decoder.layers.21.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.22.fc1.bias: + device: cuda:0 + max: '1.251e-01' + mean: '-1.548e-02' + min: '-1.254e-01' + shape: + - 4096 + sum: '-6.341e+01' +network.model.decoder.layers.22.fc1.weight: + device: cuda:0 + max: '1.278e-01' + mean: '-1.567e-04' + min: '-1.277e-01' + shape: + - 4096 + - 1024 + sum: '-6.574e+02' +network.model.decoder.layers.22.fc2.bias: + device: cuda:0 + max: '7.642e-02' + mean: '1.103e-04' + min: '-7.037e-02' + shape: + - 1024 + sum: '1.13e-01' +network.model.decoder.layers.22.fc2.weight: + device: cuda:0 + max: '1.279e-01' + mean: '1.737e-06' + min: '-1.288e-01' + shape: + - 1024 + - 4096 + sum: '7.287e+00' +network.model.decoder.layers.22.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-4.785e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-4.9e+00' +network.model.decoder.layers.22.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.22.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '6.801e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '6.964e+00' +network.model.decoder.layers.22.self_attn.k_proj.weight: + device: cuda:0 + max: '1.401e-01' + mean: '-8.573e-06' + min: '-1.409e-01' + shape: + - 1024 + - 1024 + sum: '-8.99e+00' +network.model.decoder.layers.22.self_attn.out_proj.bias: + device: cuda:0 + max: '7.709e-02' + mean: '-1.158e-05' + min: '-8.099e-02' + shape: + - 1024 + sum: '-1.186e-02' +network.model.decoder.layers.22.self_attn.out_proj.weight: + device: cuda:0 + max: '1.302e-01' + mean: '-1.088e-06' + min: '-1.293e-01' + shape: + - 1024 + - 1024 + sum: '-1.141e+00' +network.model.decoder.layers.22.self_attn.q_proj.bias: + device: cuda:0 + max: '1.013e-01' + mean: '-1.666e-03' + min: '-1.021e-01' + shape: + - 1024 + sum: '-1.706e+00' +network.model.decoder.layers.22.self_attn.q_proj.weight: + device: cuda:0 + max: '1.331e-01' + mean: '-2.958e-05' + min: '-1.338e-01' + shape: + - 1024 + - 1024 + sum: '-3.102e+01' +network.model.decoder.layers.22.self_attn.v_proj.bias: + device: cuda:0 + max: '4.211e-02' + mean: '5.506e-04' + min: '-4.501e-02' + shape: + - 1024 + sum: '5.638e-01' +network.model.decoder.layers.22.self_attn.v_proj.weight: + device: cuda:0 + max: '1.257e-01' + mean: '-2.981e-05' + min: '-1.25e-01' + shape: + - 1024 + - 1024 + sum: '-3.125e+01' +network.model.decoder.layers.22.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '7.961e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '8.152e-01' +network.model.decoder.layers.22.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.23.fc1.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.694e-03' + min: '-1.278e-01' + shape: + - 4096 + sum: '1.103e+01' +network.model.decoder.layers.23.fc1.weight: + device: cuda:0 + max: '2.107e-01' + mean: '8.400e-05' + min: '-2.146e-01' + shape: + - 4096 + - 1024 + sum: '3.523e+02' +network.model.decoder.layers.23.fc2.bias: + device: cuda:0 + max: '6.299e-02' + mean: '1.316e-03' + min: '-6.311e-02' + shape: + - 1024 + sum: '1.348e+00' +network.model.decoder.layers.23.fc2.weight: + device: cuda:0 + max: '2.5e-01' + mean: '1.024e-05' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '4.294e+01' +network.model.decoder.layers.23.final_layer_norm.bias: + device: cuda:0 + max: '7.251e-02' + mean: '9.345e-03' + min: '-7.196e-02' + shape: + - 1024 + sum: '9.57e+00' +network.model.decoder.layers.23.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.23.self_attn.k_proj.bias: + device: cuda:0 + max: '2.219e-01' + mean: '3.647e-03' + min: '-1.824e-01' + shape: + - 1024 + sum: '3.734e+00' +network.model.decoder.layers.23.self_attn.k_proj.weight: + device: cuda:0 + max: '1.294e-01' + mean: '-1.63e-05' + min: '-1.304e-01' + shape: + - 1024 + - 1024 + sum: '-1.709e+01' +network.model.decoder.layers.23.self_attn.out_proj.bias: + device: cuda:0 + max: '7.605e-02' + mean: '-1.183e-04' + min: '-6.47e-02' + shape: + - 1024 + sum: '-1.212e-01' +network.model.decoder.layers.23.self_attn.out_proj.weight: + device: cuda:0 + max: '2.5e-01' + mean: '-1.078e-05' + min: '-2.5e-01' + shape: + - 1024 + - 1024 + sum: '-1.130e+01' +network.model.decoder.layers.23.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-2.744e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.809e-01' +network.model.decoder.layers.23.self_attn.q_proj.weight: + device: cuda:0 + max: '1.338e-01' + mean: '2.096e-05' + min: '-1.337e-01' + shape: + - 1024 + - 1024 + sum: '2.197e+01' +network.model.decoder.layers.23.self_attn.v_proj.bias: + device: cuda:0 + max: '4.068e-02' + mean: '2.158e-05' + min: '-4.48e-02' + shape: + - 1024 + sum: '2.210e-02' +network.model.decoder.layers.23.self_attn.v_proj.weight: + device: cuda:0 + max: '1.267e-01' + mean: '6.273e-05' + min: '-1.256e-01' + shape: + - 1024 + - 1024 + sum: '6.577e+01' +network.model.decoder.layers.23.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.700e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.741e+00' +network.model.decoder.layers.23.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.3.fc1.bias: + device: cuda:0 + max: '8.453e-02' + mean: '-2.474e-02' + min: '-1.194e-01' + shape: + - 4096 + sum: '-1.013e+02' +network.model.decoder.layers.3.fc1.weight: + device: cuda:0 + max: '1.251e-01' + mean: '1.348e-04' + min: '-1.252e-01' + shape: + - 4096 + - 1024 + sum: '5.654e+02' +network.model.decoder.layers.3.fc2.bias: + device: cuda:0 + max: '7.086e-02' + mean: '1.769e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.811e-01' +network.model.decoder.layers.3.fc2.weight: + device: cuda:0 + max: '1.276e-01' + mean: '1.857e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '7.790e+00' +network.model.decoder.layers.3.final_layer_norm.bias: + device: cuda:0 + max: '1.254e-01' + mean: '6.555e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '6.712e+00' +network.model.decoder.layers.3.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.3.self_attn.k_proj.bias: + device: cuda:0 + max: '6.372e-02' + mean: '8.278e-03' + min: '-3.555e-02' + shape: + - 1024 + sum: '8.477e+00' +network.model.decoder.layers.3.self_attn.k_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.901e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-1.993e+01' +network.model.decoder.layers.3.self_attn.out_proj.bias: + device: cuda:0 + max: '1.240e-01' + mean: '1.084e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.11e-01' +network.model.decoder.layers.3.self_attn.out_proj.weight: + device: cuda:0 + max: '1.764e-01' + mean: '-1.601e-06' + min: '-1.614e-01' + shape: + - 1024 + - 1024 + sum: '-1.679e+00' +network.model.decoder.layers.3.self_attn.q_proj.bias: + device: cuda:0 + max: '1.248e-01' + mean: '-2.804e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-2.871e-01' +network.model.decoder.layers.3.self_attn.q_proj.weight: + device: cuda:0 + max: '1.266e-01' + mean: '-1.642e-05' + min: '-1.266e-01' + shape: + - 1024 + - 1024 + sum: '-1.721e+01' +network.model.decoder.layers.3.self_attn.v_proj.bias: + device: cuda:0 + max: '3.882e-02' + mean: '-9.93e-04' + min: '-4.312e-02' + shape: + - 1024 + sum: '-1.017e+00' +network.model.decoder.layers.3.self_attn.v_proj.weight: + device: cuda:0 + max: '1.216e-01' + mean: '-9.011e-05' + min: '-1.204e-01' + shape: + - 1024 + - 1024 + sum: '-9.449e+01' +network.model.decoder.layers.3.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.290e-01' + mean: '-4.648e-04' + min: '-1.259e-01' + shape: + - 1024 + sum: '-4.76e-01' +network.model.decoder.layers.3.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.4.fc1.bias: + device: cuda:0 + max: '7.648e-02' + mean: '-2.333e-02' + min: '-1.11e-01' + shape: + - 4096 + sum: '-9.556e+01' +network.model.decoder.layers.4.fc1.weight: + device: cuda:0 + max: '1.252e-01' + mean: '7.858e-05' + min: '-1.261e-01' + shape: + - 4096 + - 1024 + sum: '3.296e+02' +network.model.decoder.layers.4.fc2.bias: + device: cuda:0 + max: '6.671e-02' + mean: '6.644e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '6.803e-01' +network.model.decoder.layers.4.fc2.weight: + device: cuda:0 + max: '1.281e-01' + mean: '2.081e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '8.729e+00' +network.model.decoder.layers.4.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.551e-03' + min: '-1.259e-01' + shape: + - 1024 + sum: '2.613e+00' +network.model.decoder.layers.4.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.4.self_attn.k_proj.bias: + device: cuda:0 + max: '6.433e-02' + mean: '9.123e-03' + min: '-6.219e-02' + shape: + - 1024 + sum: '9.342e+00' +network.model.decoder.layers.4.self_attn.k_proj.weight: + device: cuda:0 + max: '1.298e-01' + mean: '3.159e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '3.312e+01' +network.model.decoder.layers.4.self_attn.out_proj.bias: + device: cuda:0 + max: '1.113e-01' + mean: '3.284e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '3.363e-01' +network.model.decoder.layers.4.self_attn.out_proj.weight: + device: cuda:0 + max: '1.307e-01' + mean: '5.154e-06' + min: '-1.296e-01' + shape: + - 1024 + - 1024 + sum: '5.404e+00' +network.model.decoder.layers.4.self_attn.q_proj.bias: + device: cuda:0 + max: '1.251e-01' + mean: '1.442e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '1.477e+00' +network.model.decoder.layers.4.self_attn.q_proj.weight: + device: cuda:0 + max: '1.277e-01' + mean: '-1.649e-06' + min: '-1.267e-01' + shape: + - 1024 + - 1024 + sum: '-1.729e+00' +network.model.decoder.layers.4.self_attn.v_proj.bias: + device: cuda:0 + max: '3.711e-02' + mean: '1.497e-04' + min: '-3.909e-02' + shape: + - 1024 + sum: '1.533e-01' +network.model.decoder.layers.4.self_attn.v_proj.weight: + device: cuda:0 + max: '1.139e-01' + mean: '6.411e-05' + min: '-1.227e-01' + shape: + - 1024 + - 1024 + sum: '6.722e+01' +network.model.decoder.layers.4.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.271e-01' + mean: '1.923e-04' + min: '-1.272e-01' + shape: + - 1024 + sum: '1.969e-01' +network.model.decoder.layers.4.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.5.fc1.bias: + device: cuda:0 + max: '9.772e-02' + mean: '-2.182e-02' + min: '-1.219e-01' + shape: + - 4096 + sum: '-8.94e+01' +network.model.decoder.layers.5.fc1.weight: + device: cuda:0 + max: '1.257e-01' + mean: '1.105e-04' + min: '-1.254e-01' + shape: + - 4096 + - 1024 + sum: '4.637e+02' +network.model.decoder.layers.5.fc2.bias: + device: cuda:0 + max: '6.384e-02' + mean: '9.162e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '9.382e-02' +network.model.decoder.layers.5.fc2.weight: + device: cuda:0 + max: '1.262e-01' + mean: '4.982e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '2.089e+00' +network.model.decoder.layers.5.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '4.158e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.258e-01' +network.model.decoder.layers.5.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.5.self_attn.k_proj.bias: + device: cuda:0 + max: '7.245e-02' + mean: '1.13e-02' + min: '-5.319e-02' + shape: + - 1024 + sum: '1.157e+01' +network.model.decoder.layers.5.self_attn.k_proj.weight: + device: cuda:0 + max: '1.263e-01' + mean: '-5.184e-05' + min: '-1.263e-01' + shape: + - 1024 + - 1024 + sum: '-5.436e+01' +network.model.decoder.layers.5.self_attn.out_proj.bias: + device: cuda:0 + max: '1.068e-01' + mean: '2.054e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.103e-01' +network.model.decoder.layers.5.self_attn.out_proj.weight: + device: cuda:0 + max: '1.582e-01' + mean: '2.069e-05' + min: '-1.821e-01' + shape: + - 1024 + - 1024 + sum: '2.169e+01' +network.model.decoder.layers.5.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-6.643e-04' + min: '-1.254e-01' + shape: + - 1024 + sum: '-6.802e-01' +network.model.decoder.layers.5.self_attn.q_proj.weight: + device: cuda:0 + max: '1.261e-01' + mean: '1.035e-05' + min: '-1.27e-01' + shape: + - 1024 + - 1024 + sum: '1.086e+01' +network.model.decoder.layers.5.self_attn.v_proj.bias: + device: cuda:0 + max: '4.800e-02' + mean: '5.821e-04' + min: '-4.202e-02' + shape: + - 1024 + sum: '5.960e-01' +network.model.decoder.layers.5.self_attn.v_proj.weight: + device: cuda:0 + max: '1.182e-01' + mean: '1.019e-05' + min: '-1.202e-01' + shape: + - 1024 + - 1024 + sum: '1.068e+01' +network.model.decoder.layers.5.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.263e-01' + mean: '-4.794e-04' + min: '-1.257e-01' + shape: + - 1024 + sum: '-4.909e-01' +network.model.decoder.layers.5.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.6.fc1.bias: + device: cuda:0 + max: '1.191e-01' + mean: '-2.029e-02' + min: '-9.454e-02' + shape: + - 4096 + sum: '-8.312e+01' +network.model.decoder.layers.6.fc1.weight: + device: cuda:0 + max: '1.282e-01' + mean: '1.416e-04' + min: '-1.27e-01' + shape: + - 4096 + - 1024 + sum: '5.939e+02' +network.model.decoder.layers.6.fc2.bias: + device: cuda:0 + max: '6.439e-02' + mean: '-1.532e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.569e-01' +network.model.decoder.layers.6.fc2.weight: + device: cuda:0 + max: '1.343e-01' + mean: '-3.220e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.351e+00' +network.model.decoder.layers.6.final_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.357e-04' + min: '-1.254e-01' + shape: + - 1024 + sum: '-1.389e-01' +network.model.decoder.layers.6.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.6.self_attn.k_proj.bias: + device: cuda:0 + max: '8.856e-02' + mean: '1.296e-02' + min: '-6.641e-02' + shape: + - 1024 + sum: '1.327e+01' +network.model.decoder.layers.6.self_attn.k_proj.weight: + device: cuda:0 + max: '1.300e-01' + mean: '1.62e-05' + min: '-1.300e-01' + shape: + - 1024 + - 1024 + sum: '1.698e+01' +network.model.decoder.layers.6.self_attn.out_proj.bias: + device: cuda:0 + max: '6.47e-02' + mean: '-1.618e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.657e-01' +network.model.decoder.layers.6.self_attn.out_proj.weight: + device: cuda:0 + max: '1.340e-01' + mean: '9.419e-06' + min: '-1.305e-01' + shape: + - 1024 + - 1024 + sum: '9.877e+00' +network.model.decoder.layers.6.self_attn.q_proj.bias: + device: cuda:0 + max: '1.256e-01' + mean: '2.037e-03' + min: '-1.257e-01' + shape: + - 1024 + sum: '2.086e+00' +network.model.decoder.layers.6.self_attn.q_proj.weight: + device: cuda:0 + max: '1.272e-01' + mean: '4.741e-06' + min: '-1.276e-01' + shape: + - 1024 + - 1024 + sum: '4.972e+00' +network.model.decoder.layers.6.self_attn.v_proj.bias: + device: cuda:0 + max: '4.633e-02' + mean: '3.225e-05' + min: '-4.407e-02' + shape: + - 1024 + sum: '3.303e-02' +network.model.decoder.layers.6.self_attn.v_proj.weight: + device: cuda:0 + max: '1.147e-01' + mean: '4.657e-05' + min: '-1.19e-01' + shape: + - 1024 + - 1024 + sum: '4.883e+01' +network.model.decoder.layers.6.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '-1.389e-06' + min: '-1.257e-01' + shape: + - 1024 + sum: '-1.423e-03' +network.model.decoder.layers.6.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.7.fc1.bias: + device: cuda:0 + max: '1.077e-01' + mean: '-2.155e-02' + min: '-1.226e-01' + shape: + - 4096 + sum: '-8.828e+01' +network.model.decoder.layers.7.fc1.weight: + device: cuda:0 + max: '1.284e-01' + mean: '1.858e-04' + min: '-1.311e-01' + shape: + - 4096 + - 1024 + sum: '7.793e+02' +network.model.decoder.layers.7.fc2.bias: + device: cuda:0 + max: '6.897e-02' + mean: '4.677e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '4.789e-02' +network.model.decoder.layers.7.fc2.weight: + device: cuda:0 + max: '1.459e-01' + mean: '-4.578e-07' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.92e+00' +network.model.decoder.layers.7.final_layer_norm.bias: + device: cuda:0 + max: '1.093e-01' + mean: '-1.554e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.591e+00' +network.model.decoder.layers.7.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.7.self_attn.k_proj.bias: + device: cuda:0 + max: '1.021e-01' + mean: '1.303e-02' + min: '-6.25e-02' + shape: + - 1024 + sum: '1.334e+01' +network.model.decoder.layers.7.self_attn.k_proj.weight: + device: cuda:0 + max: '1.323e-01' + mean: '1.285e-05' + min: '-1.333e-01' + shape: + - 1024 + - 1024 + sum: '1.348e+01' +network.model.decoder.layers.7.self_attn.out_proj.bias: + device: cuda:0 + max: '5.948e-02' + mean: '2.333e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.389e-01' +network.model.decoder.layers.7.self_attn.out_proj.weight: + device: cuda:0 + max: '1.316e-01' + mean: '-1.173e-06' + min: '-1.301e-01' + shape: + - 1024 + - 1024 + sum: '-1.230e+00' +network.model.decoder.layers.7.self_attn.q_proj.bias: + device: cuda:0 + max: '1.252e-01' + mean: '3.876e-03' + min: '-1.261e-01' + shape: + - 1024 + sum: '3.969e+00' +network.model.decoder.layers.7.self_attn.q_proj.weight: + device: cuda:0 + max: '1.272e-01' + mean: '-3.278e-06' + min: '-1.292e-01' + shape: + - 1024 + - 1024 + sum: '-3.437e+00' +network.model.decoder.layers.7.self_attn.v_proj.bias: + device: cuda:0 + max: '4.297e-02' + mean: '4.138e-04' + min: '-4.077e-02' + shape: + - 1024 + sum: '4.237e-01' +network.model.decoder.layers.7.self_attn.v_proj.weight: + device: cuda:0 + max: '1.183e-01' + mean: '-3.309e-05' + min: '-1.174e-01' + shape: + - 1024 + - 1024 + sum: '-3.47e+01' +network.model.decoder.layers.7.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.830e-04' + min: '-1.267e-01' + shape: + - 1024 + sum: '1.874e-01' +network.model.decoder.layers.7.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.8.fc1.bias: + device: cuda:0 + max: '6.335e-02' + mean: '-2.258e-02' + min: '-1.26e-01' + shape: + - 4096 + sum: '-9.249e+01' +network.model.decoder.layers.8.fc1.weight: + device: cuda:0 + max: '1.278e-01' + mean: '5.06e-05' + min: '-1.271e-01' + shape: + - 4096 + - 1024 + sum: '2.122e+02' +network.model.decoder.layers.8.fc2.bias: + device: cuda:0 + max: '6.818e-02' + mean: '-1.369e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '-1.402e-01' +network.model.decoder.layers.8.fc2.weight: + device: cuda:0 + max: '1.392e-01' + mean: '-4.149e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.740e+01' +network.model.decoder.layers.8.final_layer_norm.bias: + device: cuda:0 + max: '6.47e-02' + mean: '-3.244e-03' + min: '-1.252e-01' + shape: + - 1024 + sum: '-3.322e+00' +network.model.decoder.layers.8.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.8.self_attn.k_proj.bias: + device: cuda:0 + max: '9.65e-02' + mean: '1.109e-02' + min: '-6.247e-02' + shape: + - 1024 + sum: '1.136e+01' +network.model.decoder.layers.8.self_attn.k_proj.weight: + device: cuda:0 + max: '1.318e-01' + mean: '8.991e-06' + min: '-1.32e-01' + shape: + - 1024 + - 1024 + sum: '9.428e+00' +network.model.decoder.layers.8.self_attn.out_proj.bias: + device: cuda:0 + max: '6.317e-02' + mean: '-7.463e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-7.643e-02' +network.model.decoder.layers.8.self_attn.out_proj.weight: + device: cuda:0 + max: '1.306e-01' + mean: '6.679e-06' + min: '-1.327e-01' + shape: + - 1024 + - 1024 + sum: '7.003e+00' +network.model.decoder.layers.8.self_attn.q_proj.bias: + device: cuda:0 + max: '1.256e-01' + mean: '1.131e-05' + min: '-1.257e-01' + shape: + - 1024 + sum: '1.159e-02' +network.model.decoder.layers.8.self_attn.q_proj.weight: + device: cuda:0 + max: '1.311e-01' + mean: '-4.181e-07' + min: '-1.293e-01' + shape: + - 1024 + - 1024 + sum: '-4.384e-01' +network.model.decoder.layers.8.self_attn.v_proj.bias: + device: cuda:0 + max: '4.486e-02' + mean: '5.294e-04' + min: '-4.657e-02' + shape: + - 1024 + sum: '5.421e-01' +network.model.decoder.layers.8.self_attn.v_proj.weight: + device: cuda:0 + max: '1.242e-01' + mean: '1.489e-05' + min: '-1.243e-01' + shape: + - 1024 + - 1024 + sum: '1.561e+01' +network.model.decoder.layers.8.self_attn_layer_norm.bias: + device: cuda:0 + max: '1.25e-01' + mean: '1.027e-03' + min: '-1.254e-01' + shape: + - 1024 + sum: '1.052e+00' +network.model.decoder.layers.8.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.9.fc1.bias: + device: cuda:0 + max: '7.355e-02' + mean: '-2.086e-02' + min: '-8.301e-02' + shape: + - 4096 + sum: '-8.545e+01' +network.model.decoder.layers.9.fc1.weight: + device: cuda:0 + max: '1.256e-01' + mean: '2.51e-05' + min: '-1.265e-01' + shape: + - 4096 + - 1024 + sum: '1.053e+02' +network.model.decoder.layers.9.fc2.bias: + device: cuda:0 + max: '6.647e-02' + mean: '2.622e-04' + min: '-1.25e-01' + shape: + - 1024 + sum: '2.685e-01' +network.model.decoder.layers.9.fc2.weight: + device: cuda:0 + max: '1.256e-01' + mean: '-3.312e-06' + min: '-2.5e-01' + shape: + - 1024 + - 4096 + sum: '-1.389e+01' +network.model.decoder.layers.9.final_layer_norm.bias: + device: cuda:0 + max: '7.349e-02' + mean: '-8.035e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.227e+00' +network.model.decoder.layers.9.final_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.layers.9.self_attn.k_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '8.960e-03' + min: '-1.25e-01' + shape: + - 1024 + sum: '9.175e+00' +network.model.decoder.layers.9.self_attn.k_proj.weight: + device: cuda:0 + max: '1.346e-01' + mean: '4.302e-05' + min: '-1.346e-01' + shape: + - 1024 + - 1024 + sum: '4.511e+01' +network.model.decoder.layers.9.self_attn.out_proj.bias: + device: cuda:0 + max: '6.616e-02' + mean: '-8.681e-05' + min: '-1.25e-01' + shape: + - 1024 + sum: '-8.89e-02' +network.model.decoder.layers.9.self_attn.out_proj.weight: + device: cuda:0 + max: '1.497e-01' + mean: '-7.002e-06' + min: '-1.382e-01' + shape: + - 1024 + - 1024 + sum: '-7.342e+00' +network.model.decoder.layers.9.self_attn.q_proj.bias: + device: cuda:0 + max: '1.25e-01' + mean: '2.336e-03' + min: '-1.208e-01' + shape: + - 1024 + sum: '2.392e+00' +network.model.decoder.layers.9.self_attn.q_proj.weight: + device: cuda:0 + max: '1.344e-01' + mean: '-1.583e-05' + min: '-1.379e-01' + shape: + - 1024 + - 1024 + sum: '-1.66e+01' +network.model.decoder.layers.9.self_attn.v_proj.bias: + device: cuda:0 + max: '6.241e-02' + mean: '2.777e-04' + min: '-6.464e-02' + shape: + - 1024 + sum: '2.844e-01' +network.model.decoder.layers.9.self_attn.v_proj.weight: + device: cuda:0 + max: '1.131e-01' + mean: '-2.935e-05' + min: '-1.183e-01' + shape: + - 1024 + - 1024 + sum: '-3.077e+01' +network.model.decoder.layers.9.self_attn_layer_norm.bias: + device: cuda:0 + max: '7.812e-02' + mean: '9.632e-04' + min: '-1.255e-01' + shape: + - 1024 + sum: '9.864e-01' +network.model.decoder.layers.9.self_attn_layer_norm.weight: + device: cuda:0 + max: '1.e+00' + mean: '1.e+00' + min: '1.e+00' + shape: + - 1024 + sum: '1.024e+03' +network.model.decoder.project_in.weight: + device: cuda:0 + max: '1.305e-01' + mean: '3.482e-05' + min: '-1.318e-01' + shape: + - 1024 + - 512 + sum: '1.826e+01' +network.model.decoder.project_out.weight: + device: cuda:0 + max: '1.373e-01' + mean: '8.706e-05' + min: '-1.376e-01' + shape: + - 512 + - 1024 + sum: '4.564e+01'