# %%
"""
We have a bootstrapped sample data set with several momenta; each momentum case has 100 bootstrap samples and 11 time slices.

Correlation is determined when averaging samples
------------------------------------------------
1. If we average each momentum separately (over the sample axis, one momentum at a time), different momenta are treated as independent variables with no correlations between them.
2. If we average all momenta and time slices together, every (mom, tseq) pair is treated as a correlated variable.

A minimal toy sketch of this point follows the setup below.
"""
import gvar as gv
import numpy as np

# ignore the warning of log(0)
np.seterr(invalid='ignore')

samp_data = gv.load("data/samp_data_mom_mix.pkl")
n_mom = 7
n_t = 11
n_samp = 100
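# %%
#! A minimal, self-contained toy sketch (hypothetical data, not the real data set) of
#! how avg_data determines correlations: two strongly correlated series keep their
#! correlation only when they are averaged together.
if True:
    rng = np.random.default_rng(0)
    common = rng.normal(0, 1, 500)           # shared fluctuation
    toy_a = common + rng.normal(0, 0.1, 500)
    toy_b = common + rng.normal(0, 0.1, 500)
    # averaged separately: each avg_data call sees only one variable
    a_sep = gv.dataset.avg_data(toy_a)
    b_sep = gv.dataset.avg_data(toy_b)
    print(gv.evalcorr(np.array([a_sep, b_sep]))[0, 1])  # ~0: correlation is lost
    # averaged together: avg_data sees both variables sample by sample
    ab_col = gv.dataset.avg_data(np.array([toy_a, toy_b]).T)
    print(gv.evalcorr(ab_col)[0, 1])                    # ~1: correlation is preserved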
# %%
#! correlation is determined when averaging samples
# * avg each mom separately
if True:
    data_avg_sep = []
    for key in samp_data.keys():
        data_avg_sep.append(gv.dataset.avg_data(samp_data[key], bstrap=True))
    data_avg_sep = np.array(data_avg_sep)

    corr_sep = gv.evalcorr(data_avg_sep)
    print("\n shape of corr_sep: ", np.shape(corr_sep))
    print("\n correlation matrix between mom 0 and mom 1 when averaging separately: ")
    print(corr_sep[0, :, 1, :])  #? there is no correlation between different momenta
# * avg all mom and time slices together
if True:
    data_col = np.array([samp_data[key] for key in samp_data.keys()])  # shape (n_mom, n_samp, n_t)
    data_col = np.swapaxes(data_col, 0, 1)  # move the sample axis to the front: (n_samp, n_mom, n_t)
    data_col = list(data_col)
    data_avg_col = gv.dataset.avg_data(data_col, bstrap=True)

    corr_col = gv.evalcorr(data_avg_col)
    print("\n shape of corr_col: ", np.shape(corr_col))
    print("\n correlation matrix between mom 0 and mom 1 when averaging all together: ")
    print(corr_col[0, :, 1, :])  #? the correlation between different momenta is non-zero
# %%
#! show the correlation matrix of the average-all-together case
if True:
    import matplotlib.pyplot as plt

    data_avg_col_plot = data_avg_col.reshape(n_mom * n_t)
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    im = ax.imshow(gv.evalcorr(data_avg_col_plot))  # draw once and reuse the image handle
    fig.colorbar(im, ax=ax)  # add the colorbar
    ax.set_title("correlation matrix when averaging all together")
    plt.show()  #? the correlation matrix shows non-zero correlation between different momenta
# %%
"""
A gvar list can preserve all correlations as well as reconstruct distributions
------------------------------------------------------------------------------
If we have distributions with N samples, average them into a gvar list, and then reconstruct distributions from the means and correlations, the reconstructed distributions will be almost the same as the original ones.
In other words, a gvar list with correlations approximates the distributions. [The difference is below 1%.]
------------------------------------------------------------------------------
A gvar list assumes that all variables are Gaussian distributed. Why is that a good assumption? Because of the Central Limit Theorem (CLT): the distribution of the config means converges to a normal distribution, no matter what the original distribution of the configs is. Therefore, as long as we have enough configurations and do a bootstrap resampling, the distribution of the bs samples will be Gaussian.
"""
#! use a gvar list to reconstruct distributions and compare with the original distributions
import module.resampling as resamp

if True:
    data_test = np.array(samp_data["mom_0"])
    print("\n shape of data_test: ", np.shape(data_test))

    data_test_avg = gv.dataset.avg_data(data_test, bstrap=True)
    print("\n data_test_avg: ")
    print(data_test_avg)

    # * reconstruct distributions
    data_test_recon = resamp.gv_ls_to_samples_corr(data_test_avg, N_samp=100)

    # * compare the reconstructed distributions with the original distributions
    diff = (data_test - data_test_recon) / data_test
    print("\n normalized difference of the reconstructed distributions: ")
    print(diff)
    print("\n max difference: ")
    print(np.max(np.abs(diff)))  #? the largest difference is below 1%
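# %%
#! For reference, a hedged sketch of what a reconstruction like gv_ls_to_samples_corr
#! might do internally (an assumption, not the actual module.resampling implementation):
#! draw correlated Gaussian samples from the gvar means and the full covariance matrix.
def gv_ls_to_samples_sketch(gv_ls, N_samp):
    mean = gv.mean(gv_ls)
    cov = gv.evalcov(gv_ls)  # full covariance matrix, so all correlations are kept
    rng = np.random.default_rng(2)
    return rng.multivariate_normal(mean, cov, size=N_samp)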
# %%
"""
If we construct a gvar list with the complete correlation matrix, it behaves exactly like the distribution of bs samples in both linear and non-linear operations (including lsqfit; see "4_lsq_fit.py"), as long as you did the bootstrap resampling.
Therefore, you can choose between the two methods according to your situation: bs samples take more time but less memory, while a gvar list takes less time but may need more memory to store the large correlation matrix.
Let's check the behavior in linear and non-linear operations using fake data.
"""
#! generate two random data sets: sin with amplitude 1 and sin with amplitude 2, each with small Gaussian noise added
if True:
    N_conf = 100
    N_t = 10
    noise_amp = 0.1
    # bootstrap indices: each of the 100 rows picks N_conf configurations with replacement
    bs_seed = np.random.choice(N_conf, (100, N_conf), replace=True)

    #? In a 3pt / 2pt ratio, we expect the fluctuations to partly cancel on each config/sample, so we add a large fixed (shared) noise term to imitate this situation.
    noise_fix = np.random.normal(0, 0.1 + noise_amp, (N_conf, N_t))
    if_noise_fix = True

    # * generate two data sets
    t = np.arange(N_t)
    # sin with amplitude 1
    if if_noise_fix:
        data_1 = np.sin(t) + noise_fix + np.random.normal(0, noise_amp, (N_conf, N_t))
    else:
        data_1 = np.sin(t) + np.random.normal(0, noise_amp, (N_conf, N_t))
    print("\n shape of data_1: ", np.shape(data_1))

    # sin with amplitude 2
    if if_noise_fix:
        data_2 = 2 * np.sin(t) + noise_fix + np.random.normal(0, noise_amp, (N_conf, N_t))
    else:
        data_2 = 2 * np.sin(t) + np.random.normal(0, noise_amp, (N_conf, N_t))
    print("\n shape of data_2: ", np.shape(data_2))

    # * do the bootstrap resampling with the same seed for both data sets
    data_1 = resamp.bootstrap_with_seed(data_1, bs_seed, axis=0)
    data_2 = resamp.bootstrap_with_seed(data_2, bs_seed, axis=0)
    all_data_dic = {"1": data_1, "2": data_2}
    # * process operations after sample averaging with gvar
    print("\n >>> Linear and non-linear operations with gvar sample averaging:")
    all_data_gv = resamp.bs_dic_avg(all_data_dic)  # average all keys together to keep correlations
    data_1_gv = np.array(all_data_gv["1"])
    data_2_gv = np.array(all_data_gv["2"])

    # subtract data_1 from data_2
    subtraction_gv = data_2_gv - data_1_gv
    print("\n gvar subtraction: ")
    print(subtraction_gv)

    # divide data_2 by data_1
    division_gv = data_2_gv / data_1_gv
    print("\n gvar division: ")
    print(division_gv)

    # * process operations sample by sample
    print("\n >>> Linear and non-linear operations sample by sample:")
    # subtract data_1 from data_2
    subtraction_sample = data_2 - data_1
    print("\n bs sample subtraction: ")
    print(resamp.bs_ls_avg(subtraction_sample))  # * without bootstrap resampling, the error would need to be divided by sqrt(N_conf)
    # divide data_2 by data_1
    division_sample = data_2 / data_1
    print("\n bs sample division: ")
    print(resamp.bs_ls_avg(division_sample))  # * same note as above: without bootstrap, divide the error by sqrt(N_conf)
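# %%
#! Hedged sketch of what a helper like bootstrap_with_seed could look like (an
#! assumption about module.resampling, not its actual code): each row of the seed
#! matrix indexes configurations with replacement and is averaged into one bs sample.
def bootstrap_with_seed_sketch(data, seed, axis=0):
    return np.array([np.take(data, idx, axis=axis).mean(axis=axis) for idx in seed])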
# %%
#! plot the subtraction and division results of the two methods for comparison, using errorbar plots
if True:
    from module.plot_settings import *
    from module.general_plot_funcs import errorbar_ls_plot

    # * comparison plot of subtraction
    x_ls = [np.arange(N_t), np.arange(N_t) + 0.2]  # small offset so the two sets do not overlap
    y_ls = [gv.mean(subtraction_gv), gv.mean(resamp.bs_ls_avg(subtraction_sample))]
    yerr_ls = [gv.sdev(subtraction_gv), gv.sdev(resamp.bs_ls_avg(subtraction_sample))]
    errorbar_ls_plot(x_ls, y_ls, yerr_ls, label_ls=["gvar", "sample"], title="subtraction comparison", save=False)

    # * comparison plot of division
    x_ls = [np.arange(N_t), np.arange(N_t) + 0.2]
    y_ls = [gv.mean(division_gv), gv.mean(resamp.bs_ls_avg(division_sample))]
    yerr_ls = [gv.sdev(division_gv), gv.sdev(resamp.bs_ls_avg(division_sample))]
    errorbar_ls_plot(x_ls, y_ls, yerr_ls, label_ls=["gvar", "sample"], title="division comparison", save=False, ylim=[1, 3])  #? the two methods are consistent, even for non-linear operations
# %%