Merge pull request #2245 from NNPDF/new_CMS_WCHARM_7TEV_WPWM

implementation of CMS_WCHARM_7TEV in the new format
NNPDF · Jan 14, 2025 · 31730da · 31730da
2 parents 9d67f04 + bd47a55
commit 31730da
Show file tree

Hide file tree

Showing 12 changed files with 675 additions and 142 deletions.
diff --git a/...S_WCHARM_7TEV/data_legacy_WPWM-RATIO.yaml → ...data/CMS_WCHARM_7TEV/data_WPWM-RATIO.yaml b/...S_WCHARM_7TEV/data_legacy_WPWM-RATIO.yaml → ...data/CMS_WCHARM_7TEV/data_WPWM-RATIO.yaml
diff --git a/...CMS_WCHARM_7TEV/data_legacy_WPWM-TOT.yaml → ...ondata/CMS_WCHARM_7TEV/data_WPWM-TOT.yaml b/...CMS_WCHARM_7TEV/data_legacy_WPWM-TOT.yaml → ...ondata/CMS_WCHARM_7TEV/data_WPWM-TOT.yaml
diff --git a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py
@@ -0,0 +1,19 @@
+'''
+Filter script for CMS_WCHARM_7TEV
+'''
+
+import logging
+import os
+
+from filter_utils import Extractor
+
+logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+if __name__ == "__main__":
+    CMS_WCHARM_TOT = Extractor(f"{CURRENT_DIR}/metadata.yaml", "WPWM-TOT", mult_factor=1000)
+    CMS_WCHARM_TOT.generate_data()
+
+    CMS_WCHARM_RATIO = Extractor(f"{CURRENT_DIR}/metadata.yaml", "WPWM-RATIO", mult_factor=1.0)
+    CMS_WCHARM_RATIO.generate_data()
diff --git a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py
@@ -0,0 +1,260 @@
+import functools
+import logging
+import os
+
+import numpy as np
+import yaml
+
+from nnpdf_data.filter_utils.utils import prettify_float
+
+yaml.add_representer(float, prettify_float)
+
+MW2 = 80.385**2
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+ART_LABEL = 'art_corr'
+STAT_LABEL = 'stat_uncorr'
+TABLE_TOKEN = 'Table'
+
+
+class Extractor:
+
+    def __init__(self, metadata_file, observable, mult_factor=1):
+        """
+        Parameters
+        ----------
+        metadata_file: str
+            Path to the metadata file
+        observable: str
+            The name of the observable for which the data is extracted. The name
+            must be listed in the metadata file.
+        mult_factor: float
+            Multiplication factor to apply to the central data points. This is
+            useful to convert the data in the metadata file to the desired
+            units.
+        """
+        # Open metadata and select process
+        with open(metadata_file, 'r') as file:
+            metadata = yaml.safe_load(file)
+            self.metadata = next(
+                (
+                    md
+                    for md in metadata["implemented_observables"]
+                    if md['observable_name'] == observable
+                ),
+                None,
+            )
+            if self.metadata is None:
+                raise Exception(f"{observable} is not listed in the metadata file.")
+
+        self.observable = observable
+        self.mult_factor = mult_factor
+
+    @functools.cache
+    def _retrieve_table(self, table_id):
+        """
+        Implementation of the loading for the table.
+
+        Parameters
+        ----------
+        table_id: int
+          Index that specifies the table.
+
+        Return
+        ------
+        The table specified by `table_id`.
+        """
+        with open(f'{CURRENT_DIR}/rawdata/{TABLE_TOKEN}{table_id}.yaml') as tab:
+            tab_dict = yaml.safe_load(tab)
+        return tab_dict
+
+    def _generate_kinematics(self):
+        """
+        The function generates the kinematics by reading and processing it from
+        the referenced table. Kinematics is processed in the format of a list of
+        dictionaries. The keys in each dictionaries specify the label (i.e. name)
+        for the kinematic variables. For this dataset, they are 'abs_eta' and 'm_W2'.
+        The labels are taken from the matadata file. The corresponding values are
+        'min', 'mid', and 'max'.
+
+        For this dataset, 'm_W2' is used in the computation of the (x,Q2)-map and
+        does not have any active role in the fit. For that reason, every bin has the
+        same value. Moreover, only the mid value is used.
+        """
+        logging.info(f"Generating kinematics for CMS_{self.observable}...")
+
+        table_ID = self.metadata["tables"][0]
+        tab_dict = self._retrieve_table(table_ID)
+
+        data = tab_dict['independent_variables'][0]
+        label = self.metadata['kinematic_coverage']
+        kinematics = []
+        for eta_bin in data['values']:
+            abs_eta_max = eta_bin['high']
+            abs_eta_min = eta_bin['low']
+            kin_bin = {
+                label[0]: {
+                    'min': abs_eta_min,
+                    'mid': (abs_eta_max + abs_eta_min) / 2,
+                    'max': abs_eta_max,
+                },
+                label[1]: {'min': None, 'mid': MW2, 'max': None},
+            }
+            kinematics.append(kin_bin)
+
+        # Check number of data agrees with metadata
+        ndata = len(kinematics)
+        if not self.metadata['ndata'] == ndata:
+            raise ValueError(
+                f"Mismatch in 'ndata': expected {self.metadata['ndata']}, but got {ndata}"
+            )
+        self.ndata = ndata
+        return kinematics
+
+    def _generate_data_and_unc(self):
+        """
+        Return a list with central data points and two additional lists with the corresponding
+        statistical and systematic uncertainties. For this dataset, uncertainties are always
+        symmetric. Uncertainties are given as absolute values.
+
+        Note that, for the total x-sec, the correlation matrix is provided. The corresponding
+        covariance matrix is constructed in `_generate_covmat`.
+        """
+        logging.info(f"Generating central data for CMS_{self.observable}...")
+        dat_central = []
+        stat_unc = []
+        asy_sys_unc = []
+        table_ID = self.metadata['tables'][0]
+        tab_dict = self._retrieve_table(table_ID)
+
+        # Select data with pT > 25 GeV
+        tab_dict = tab_dict['dependent_variables'][0]['values']
+
+        # Loop over bins
+        for rap_bin in tab_dict:
+            dat_central.append(rap_bin['value'] * self.mult_factor)
+            stat_unc.append(rap_bin['errors'][0]['symerror'] * self.mult_factor)
+            asy_sys_unc.append(rap_bin['errors'][1]['symerror'] * self.mult_factor)
+        return dat_central, stat_unc, asy_sys_unc
+
+    def _build_unc_definitions(self):
+        """
+        Build the dictionary containing the definitions of the uncertainties to be
+        used in the uncertainty data file.
+        """
+        unc_definitions = {}
+
+        # Statistical uncertainty
+        unc_definitions[STAT_LABEL] = {
+            'description': f'Statistical uncertainty',
+            'treatment': 'ADD',
+            'type': 'UNCORR',
+        }
+
+        if self.observable == 'WPWM-RATIO':
+            unc_definitions['ART_LABEL'] = {
+                'description': f'Correlated systematic uncertainty',
+                'treatment': 'MULT',
+                'type': 'CORR',
+            }
+        elif self.observable == 'WPWM-TOT':
+            for idx in range(self.ndata):
+                unc_definitions[f'{ART_LABEL}_{idx+1}'] = {
+                    'description': f'Correlated systematic uncertainty {idx+1}',
+                    'treatment': 'ADD',
+                    'type': 'CORR',
+                }
+
+        return unc_definitions
+
+    def _generate_covmat(self, diag_uncs):
+        """
+        Generate the covariance matrix for the total x-sec. This function requires
+        the diagonal systematic uncertainties as argument. The diagonal uncertainties
+        are used to construct the covariance matrix from the correlation matrix stored
+        in the HepData table.
+
+        Note that such a correlation matrix exists for the total x-sec only, while the
+        ratio observable does not provide this information.
+        """
+        if not self.observable == 'WPWM-TOT':
+            raise ValueError(
+                "The construction of the covariance matrix is defined for the total x-sec only."
+            )
+        table_ID = self.metadata["tables"][1]
+        tab_dict = self._retrieve_table(table_ID)
+        matlist = tab_dict['dependent_variables'][0]['values']
+        matlist = [d['value'] for d in matlist]
+        covmat = np.zeros((self.ndata, self.ndata))
+        for i in range(self.ndata):
+            for j in range(self.ndata):
+                covmat[i, j] = matlist[i + self.ndata * j] * diag_uncs[i] * diag_uncs[j]
+        return covmat
+
+    def generate_data(self):
+        """
+        The function collects central data, kinematics, and uncertainties ans save them
+        into yaml files.
+
+        The systematic uncertainties are given as percentages relative the central data point.
+        The absolute value of the uncertainty is obtained from the central data point before
+        the shifts are applied.
+        """
+        # Get central data and kinematics
+        central_data, stat_unc, sys_unc = self._generate_data_and_unc()
+        kinematics = self._generate_kinematics()
+
+        # Uncertainty definitions
+        unc_definitions = self._build_unc_definitions()
+        sys_artificial = []  # Initialize vector of artificial uncertainties
+
+        if self.observable == 'WPWM-TOT':
+            # Generate covmat and perform eigen decomposition
+            covmat = self._generate_covmat(sys_unc)
+            eigvals, eigvecs = np.linalg.eig(covmat)
+            art_unc = np.sqrt(eigvals) * eigvecs
+
+            # Loop over bins
+            for data_idx in range(len(central_data)):
+                # Statistical uncertainty
+                unc_dict = {STAT_LABEL: stat_unc[data_idx]}
+
+                # Artificial systematic uncertainties
+                for sys_idx, art_sys in enumerate(art_unc[data_idx, :]):
+                    unc_dict[f'{ART_LABEL}_{sys_idx+1}'] = float(art_sys)
+
+                # Append to list
+                sys_artificial.append(unc_dict)
+
+        elif self.observable == 'WPWM-RATIO':
+            for data_idx in range(len(central_data)):
+                # Statistical uncertainty
+                unc_dict = {STAT_LABEL: stat_unc[data_idx]}
+
+                # Systematic uncertainty
+                unc_dict[f'{ART_LABEL}'] = sys_unc[data_idx]
+                sys_artificial.append(unc_dict)
+
+        # Save kinematics into file
+        logging.info("Dumping kinematics to file...")
+        kinematics_yaml = {'bins': kinematics}
+        kins_file_name = self.metadata['kinematics']['file']
+        with open(CURRENT_DIR + '/' + kins_file_name, 'w') as file:
+            yaml.dump(kinematics_yaml, file, sort_keys=False)
+        logging.info("Done!")
+
+        # Save central data into file
+        logging.info("Dumping kinematics to file...")
+        dat_central_yaml = {'data_central': central_data}
+        data_file_name = self.metadata['data_central']
+        with open(CURRENT_DIR + '/' + data_file_name, 'w') as file:
+            yaml.dump(dat_central_yaml, file, sort_keys=False)
+        logging.info("Done!")
+
+        # Save unertainties
+        logging.info("Dumping kinematics to file...")
+        uncertainties_yaml = {'definitions': unc_definitions, 'bins': sys_artificial}
+        unc_file_name = self.metadata['data_uncertainties'][0]
+        with open(CURRENT_DIR + '/' + unc_file_name, 'w') as file:
+            yaml.dump(uncertainties_yaml, file, sort_keys=False)
+        logging.info("Done!")
diff --git a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/kinematics_WPWM-RATIO.yaml b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/kinematics_WPWM-RATIO.yaml
@@ -1,61 +1,41 @@
 bins:
-- k1:
-    min: null
+- abs_eta:
+    min: 0.0
     mid: 0.175
-    max: null
-  k2:
-    min: null
-    mid: 6463.838404
-    max: null
-  k3:
-    min: null
-    mid: 7000.0
-    max: null
-- k1:
-    min: null
-    mid: 0.525
-    max: null
-  k2:
+    max: 0.35
+  m_W2:
     min: null
-    mid: 6463.838404
+    mid: 6.46174823e+03
     max: null
-  k3:
+- abs_eta:
+    min: 0.35
+    mid: 5.25000000e-01
+    max: 0.7
+  m_W2:
     min: null
-    mid: 7000.0
+    mid: 6.46174823e+03
     max: null
-- k1:
-    min: null
+- abs_eta:
+    min: 0.7
     mid: 0.9
-    max: null
-  k2:
+    max: 1.1
+  m_W2:
     min: null
-    mid: 6463.838404
+    mid: 6.46174823e+03
     max: null
-  k3:
-    min: null
-    mid: 7000.0
-    max: null
-- k1:
-    min: null
+- abs_eta:
+    min: 1.1
     mid: 1.35
-    max: null
-  k2:
-    min: null
-    mid: 6463.838404
-    max: null
-  k3:
+    max: 1.6
+  m_W2:
     min: null
-    mid: 7000.0
+    mid: 6.46174823e+03
     max: null
-- k1:
-    min: null
+- abs_eta:
+    min: 1.6
     mid: 1.85
-    max: null
-  k2:
-    min: null
-    mid: 6463.838404
-    max: null
-  k3:
+    max: 2.1
+  m_W2:
     min: null
-    mid: 7000.0
+    mid: 6.46174823e+03
     max: null