Skip to content

Commit

Permalink
Clean-up + pre-commit
Browse files Browse the repository at this point in the history
  • Loading branch information
achiefa authored and RoyStegeman committed Jan 13, 2025
1 parent 21bc618 commit bd47a55
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 125 deletions.
7 changes: 4 additions & 3 deletions nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
'''

import logging
import os

from filter_utils import Extractor
import numpy as np

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

if __name__ == "__main__":
    # Resolve the metadata file relative to this script, so that the filter
    # can be launched from any working directory.
    metadata_path = os.path.join(CURRENT_DIR, "metadata.yaml")

    # Total cross section: values read from the raw tables are multiplied by 1000.
    CMS_WCHARM_TOT = Extractor(metadata_path, "WPWM-TOT", mult_factor=1000)
    CMS_WCHARM_TOT.generate_data()

    # Ratio observable: values are used as they appear in the raw tables.
    CMS_WCHARM_RATIO = Extractor(metadata_path, "WPWM-RATIO", mult_factor=1.0)
    CMS_WCHARM_RATIO.generate_data()
233 changes: 111 additions & 122 deletions nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import functools
import logging
import os

import numpy as np
import yaml
Expand All @@ -8,28 +10,29 @@
yaml.add_representer(float, prettify_float)

MW2 = 80.385**2
CMSLUMI13 = 2.5

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
ART_LABEL = 'art_corr'
STAT_LABEL = 'stat_uncorr'
TABLE_TOKEN = 'Table'


class Extractor:
"""
Extracts kinematics, central data, and uncertainties for a given dataset
Parameters
----------
metadata_file: str
Path to the metadata file
observable: str
The name of the observable for which the data is extracted. The name must
be listed in the metadata file.
"""

def __init__(self, metadata_file, observable, mult_factor=1):

"""
Parameters
----------
metadata_file: str
Path to the metadata file
observable: str
The name of the observable for which the data is extracted. The name
must be listed in the metadata file.
mult_factor: float
Multiplication factor to apply to the central data points. This is
useful to convert the data in the metadata file to the desired
units.
"""
# Open metadata and select process
with open(metadata_file, 'r') as file:
metadata = yaml.safe_load(file)
Expand All @@ -44,69 +47,51 @@ def __init__(self, metadata_file, observable, mult_factor=1):
if self.metadata is None:
raise Exception(f"{observable} is not listed in the metadata file.")

# Initialise dict of tables
self.tables = {}
self.observable = observable
self.mult_factor = mult_factor
self.kin_labels = self.metadata['kinematic_coverage']
self.ndata = self.metadata['ndata']

def __retrieve_table(self, table_id):
@functools.cache
def _retrieve_table(self, table_id):
"""
Implementation of the lazy loading for the tables. If the table
is loaded for the first time, it is stored into an internal
container of the class, so that it will not be loaded each time.
When called, this functions checks if the table has already been stored
and, if that is the case, returns the stored table.
Implementation of the loading for the table.
Parameters
----------
table_id: int
Index that specifies the table
Index that specifies the table.
Return
------
The table specified by `table_id`. If not previously loaded, it is also
stored into the internal container for future use.
The table specified by `table_id`.
"""
try:
table = self.tables[str(table_id)]
except KeyError:
logging.debug(
f'Table {table_id} has not already been used or stored.' f' Storing the table...'
)
with open(f'./rawdata/{TABLE_TOKEN}{table_id}.yaml', 'r') as tab:
tab_dict = yaml.safe_load(tab)
self.tables[str(table_id)] = tab_dict
table = tab_dict
return table
with open(f'{CURRENT_DIR}/rawdata/{TABLE_TOKEN}{table_id}.yaml') as tab:
tab_dict = yaml.safe_load(tab)
return tab_dict

def __extract_kinematics(self, table: dict):
def _generate_kinematics(self):
"""
Extracts the kinematic variables of the single differential
distribution given a table.
For each bin, it computes the max, min, and mid value of the transverse
momentum of the boson.
Parameters
----------
table: dict
Dictionary containing the bins in the transverse momentum
The function generates the kinematics by reading and processing it from
the referenced table. Kinematics is processed in the format of a list of
dictionaries. The keys in each dictionary specify the label (i.e. name)
for the kinematic variables. For this dataset, they are 'abs_eta' and 'm_W2'.
The labels are taken from the metadata file. The corresponding values are
'min', 'mid', and 'max'.
For this dataset, 'm_W2' is used in the computation of the (x,Q2)-map and
does not have any active role in the fit. For that reason, every bin has the
same value. Moreover, only the mid value is used.
"""
logging.info(f"Generating kinematics for CMS_{self.observable}...")

Return
------
List of bins containing min, max, and mid values for each of the kinematic
observables listed in the `kinematic_coverage` of the metadata file.
table_ID = self.metadata["tables"][0]
tab_dict = self._retrieve_table(table_ID)

"""
data = table['independent_variables'][0]
label = self.kin_labels
data = tab_dict['independent_variables'][0]
label = self.metadata['kinematic_coverage']
kinematics = []
for bin in data['values']:
abs_eta_min = bin['low']
abs_eta_max = bin['high']
for eta_bin in data['values']:
abs_eta_max = eta_bin['high']
abs_eta_min = eta_bin['low']
kin_bin = {
label[0]: {
'min': abs_eta_min,
Expand All @@ -116,61 +101,47 @@ def __extract_kinematics(self, table: dict):
label[1]: {'min': None, 'mid': MW2, 'max': None},
}
kinematics.append(kin_bin)
return kinematics

def generate_kinematics(self):
"""
Function that generates the kinematics by looping over all the
tables specified in the metadata file. The resulting kinematics
is then saved to a yaml file. It relies on the method
`__extract_kinematics`.
"""

logging.info(f"Generating kinematics for ATLAS_{self.observable}...")

# Initialise kinematics list
kinematics = []
ndata = 0
table = self.metadata["tables"][0]
tab_dict = self.__retrieve_table(table)
kin = self.__extract_kinematics(tab_dict)
kinematics = np.concatenate([kinematics, kin])
ndata += len(kin)

# Check number of data agrees with metadata
try:
assert self.metadata['ndata'] is not None
assert self.metadata['ndata'] == ndata
except AssertionError as e:
logging.warning(
f"The number of data in the metafile is either wrong or unspecified."
f" The correct number is {ndata}. Please, update the metafile."
ndata = len(kinematics)
if not self.metadata['ndata'] == ndata:
raise ValueError(
f"Mismatch in 'ndata': expected {self.metadata['ndata']}, but got {ndata}"
)
return
return kinematics.tolist()
self.ndata = ndata
return kinematics

def generate_data_and_unc(self, mult_factor=1.0):
def _generate_data_and_unc(self):
"""
Same as `generate_kinematics`, but for central data points.
Return a list with central data points and two additional lists with the corresponding
statistical and systematic uncertainties. For this dataset, uncertainties are always
symmetric. Uncertainties are given as absolute values.
Note that, for the total x-sec, the correlation matrix is provided. The corresponding
covariance matrix is constructed in `_generate_covmat`.
"""
logging.info(f"Generating central data for CMS_{self.observable}...")
dat_central = []
stat_unc = []
asy_sys_unc = []
table = self.metadata['tables'][0]
tab_dict = self.__retrieve_table(table)
table_ID = self.metadata['tables'][0]
tab_dict = self._retrieve_table(table_ID)

# Select data with pT > 25 GeV
tab_dict = tab_dict['dependent_variables'][0]['values']

# Loop over bins
for rap_bin in tab_dict:
dat_central.append(rap_bin['value'] * mult_factor)
stat_unc.append(rap_bin['errors'][0]['symerror'] * mult_factor)
asy_sys_unc.append(rap_bin['errors'][1]['symerror'] * mult_factor)
dat_central.append(rap_bin['value'] * self.mult_factor)
stat_unc.append(rap_bin['errors'][0]['symerror'] * self.mult_factor)
asy_sys_unc.append(rap_bin['errors'][1]['symerror'] * self.mult_factor)
return dat_central, stat_unc, asy_sys_unc

def __build_unc_definitions(self):
def _build_unc_definitions(self):
"""
Build the dictionary containing the definitions of the uncertainties to be
used in the uncertainty data file.
"""
unc_definitions = {}

# Statistical uncertainty
Expand All @@ -196,9 +167,22 @@ def __build_unc_definitions(self):

return unc_definitions

def generate_covmat(self, diag_uncs=None):
table = self.metadata["tables"][1]
tab_dict = self.__retrieve_table(table)
def _generate_covmat(self, diag_uncs):
"""
Generate the covariance matrix for the total x-sec. This function requires
the diagonal systematic uncertainties as argument. The diagonal uncertainties
are used to construct the covariance matrix from the correlation matrix stored
in the HepData table.
Note that such a correlation matrix exists for the total x-sec only, while the
ratio observable does not provide this information.
"""
if not self.observable == 'WPWM-TOT':
raise ValueError(
"The construction of the covariance matrix is defined for the total x-sec only."
)
table_ID = self.metadata["tables"][1]
tab_dict = self._retrieve_table(table_ID)
matlist = tab_dict['dependent_variables'][0]['values']
matlist = [d['value'] for d in matlist]
covmat = np.zeros((self.ndata, self.ndata))
Expand All @@ -208,64 +192,69 @@ def generate_covmat(self, diag_uncs=None):
return covmat

def generate_data(self):
    """
    Collect central data, kinematics, and uncertainties and save them into
    yaml files next to this module.

    The file names are taken from the metadata entries `kinematics.file`,
    `data_central`, and the first element of `data_uncertainties`.

    For 'WPWM-TOT', the systematic uncertainties are turned into a set of
    artificial uncertainties through the eigen decomposition of the covariance
    matrix returned by `_generate_covmat`. For 'WPWM-RATIO', the systematic
    uncertainty of each bin is stored as a single uncertainty.

    Raises
    ------
    ValueError
        If the observable is neither 'WPWM-TOT' nor 'WPWM-RATIO', or if
        `_generate_kinematics` detects a mismatch in the number of data points.
    """
    # Get central data and kinematics. The kinematics must be generated before
    # the covariance matrix, because `_generate_kinematics` sets `self.ndata`,
    # which `_generate_covmat` uses to size the matrix.
    central_data, stat_unc, sys_unc = self._generate_data_and_unc()
    kinematics = self._generate_kinematics()

    # Uncertainty definitions
    unc_definitions = self._build_unc_definitions()

    if self.observable == 'WPWM-TOT':
        # Generate covmat and perform eigen decomposition. Column k of
        # `art_unc` is the k-th eigenvector scaled by sqrt of its eigenvalue.
        # NOTE(review): the covariance matrix is presumably symmetric, in which
        # case `np.linalg.eigh` would guarantee real output. `eig` is kept here
        # so that the ordering of the generated uncertainties does not change.
        covmat = self._generate_covmat(sys_unc)
        eigvals, eigvecs = np.linalg.eig(covmat)
        art_unc = np.sqrt(eigvals) * eigvecs

        sys_artificial = []
        for data_idx, stat in enumerate(stat_unc):
            unc_dict = {STAT_LABEL: stat}
            for sys_idx, art_sys in enumerate(art_unc[data_idx, :], start=1):
                unc_dict[f'{ART_LABEL}_{sys_idx}'] = float(art_sys)
            sys_artificial.append(unc_dict)

    elif self.observable == 'WPWM-RATIO':
        # One statistical and one systematic uncertainty per bin
        sys_artificial = [
            {STAT_LABEL: stat, ART_LABEL: syst} for stat, syst in zip(stat_unc, sys_unc)
        ]

    else:
        # Fail before any file is written instead of dumping an empty list of
        # uncertainties for an observable this filter does not know about.
        raise ValueError(f"Uncertainties are not implemented for observable {self.observable}.")

    # All metadata entries are looked up before the first file is written, so
    # a missing entry does not leave a partially updated set of files behind.
    outputs = (
        ("kinematics", self.metadata['kinematics']['file'], {'bins': kinematics}),
        ("central data", self.metadata['data_central'], {'data_central': central_data}),
        (
            "uncertainties",
            self.metadata['data_uncertainties'][0],
            {'definitions': unc_definitions, 'bins': sys_artificial},
        ),
    )
    for label, file_name, content in outputs:
        logging.info(f"Dumping {label} to file...")
        with open(os.path.join(CURRENT_DIR, file_name), 'w') as file:
            yaml.dump(content, file, sort_keys=False)
        logging.info("Done!")

0 comments on commit bd47a55

Please sign in to comment.