diff --git a/.DS_Store b/.DS_Store
index c297da6a..f6092697 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/4j2b_horCR.yaml b/4j2b_horCR.yaml
deleted file mode 100644
index 9e6cb989..00000000
--- a/4j2b_horCR.yaml
+++ /dev/null
@@ -1,231 +0,0 @@
-independent_variables:
-  - header:
-      name: sample
-    values:
-      - value: 4j2b CR bin 1
-      - value: 4j2b CR bin 2
-      - value: 4j2b CR bin 3
-      - value: 4j2b CR bin 4
-      - value: 4j2b CR bin 5
-      - value: 4j2b CR bin 6
-      - value: 4j2b CR bin 7
-      - value: 4j2b CR bin 8
-      - value: 4j2b CR bin 9
-      - value: 4j2b CR bin 10
-      - value: 4j2b CR bin 11
-
-dependent_variables:
-  - header:
-      name: W+jets
-    values:
-      - value: 167.53
-        errors:
-          - symerror: 71.38
-      - value: 83.77
-        errors:
-          - symerror: 107.65
-      - value: 502.60
-        errors:
-          - symerror: 180.89
-      - value: 125.65
-        errors:
-          - symerror: 168.99
-      - value: 209.42
-        errors:
-          - symerror: 72.83
-      - value: 125.65
-        errors:
-          - symerror: 68.64
-      - value: 125.65
-        errors:
-          - symerror: 98.56
-      - value: 0.00
-      - value: 125.65
-        errors:
-          - symerror: 87.34
-      - value: 83.77
-        errors:
-          - symerror: 65.34
-      - value: 0.00
-        errors:
-          - symerror: 38.74
-  - header:
-      name: single top, s-channel
-    values:
-      - value: 2.43
-        errors:
-          - symerror: 0.58
-      - value: 2.43
-        errors:
-          - symerror: 0.65
-      - value: 2.68
-        errors:
-          - symerror: 0.74
-      - value: 2.10
-        errors:
-          - symerror: 0.72
-      - value: 1.92
-        errors:
-          - symerror: 0.64
-      - value: 1.48
-        errors:
-          - symerror: 0.56
-      - value: 0.90
-        errors:
-          - symerror: 0.39
-      - value: 0.69
-        errors:
-          - symerror: 0.33
-      - value: 0.58
-        errors:
-          - symerror: 0.24
-      - value: 0.51
-        errors:
-          - symerror: 0.21
-      - value: 0.35
-        errors:
-          - symerror: 0.16
-  - header:
-      name: single top, t-channel
-    values:
-      - value: 93.54
-        errors:
-          - symerror: 22.49
-      - value: 130.55
-        errors:
-          - symerror: 30.51
-      - value: 125.16
-        errors:
-          - symerror: 34.37
-      - value: 113.72
-        errors:
-          - symerror: 31.08
-      - value: 76.04
-        errors:
-          - symerror: 23.60
-      - value: 80.08
-        errors:
-          - symerror: 26.88
-      - value: 53.16
-        errors:
-          - symerror: 17.97
-      - value: 45.76
-        errors:
-          - symerror: 16.91
-      - value: 34.99
-        errors:
-          - symerror: 14.34
-      - value: 30.95
-        errors:
-          - symerror: 11.01
-      - value: 35.66
-        errors:
-          - symerror: 14.05
-  - header:
-      name: tW
-    values:
-      - value: 305.89
-        errors:
-          - symerror: 66.61
-      - value: 453.69
-        errors:
-          - symerror: 118.41
-      - value: 321.00
-        errors:
-          - symerror: 88.65
-      - value: 256.92
-        errors:
-          - symerror: 77.19
-      - value: 180.45
-        errors:
-          - symerror: 61.59
-      - value: 143.27
-        errors:
-          - symerror: 50.40
-      - value: 117.28
-        errors:
-          - symerror: 44.33
-      - value: 80.10
-        errors:
-          - symerror: 33.23
-      - value: 64.99
-        errors:
-          - symerror: 26.78
-      - value: 58.34
-        errors:
-          - symerror: 24.93
-      - value: 43.83
-        errors:
-          - symerror: 21.73
-  - header:
-      name: ttbar
-    values:
-      - value: 8025.68
-        errors:
-          - symerror: 2228.73
-      - value: 14588.12
-        errors:
-          - symerror: 4091.35
-      - value: 7870.49
-        errors:
-          - symerror: 2729.98
-      - value: 4968.02
-        errors:
-          - symerror: 1916.02
-      - value: 3508.46
-        errors:
-          - symerror: 1512.30
-      - value: 2597.63
-        errors:
-          - symerror: 1246.67
-      - value: 1965.77
-        errors:
-          - symerror: 920.95
-      - value: 1320.99
-        errors:
-          - symerror: 819.50
-      - value: 1090.04
-        errors:
-          - symerror: 614.33
-      - value: 829.54
-        errors:
-          - symerror: 465.73
-      - value: 666.96
-        errors:
-          - symerror: 405.96
-  - header:
-      name: total
-    values:
-      - value: 8595.07
-        errors:
-          - symerror: 2331.90
-      - value: 15258.55
-        errors:
-          - symerror: 4264.57
-      - value: 8821.93
-        errors:
-          - symerror: 2914.95
-      - value: 5466.41
-        errors:
-          - symerror: 2060.48
-      - value: 3976.29
-        errors:
-          - symerror: 1627.32
-      - value: 2948.11
-        errors:
-          - symerror: 1326.71
-      - value: 2262.76
-        errors:
-          - symerror: 1002.38
-      - value: 1447.53
-        errors:
-          - symerror: 850.66
-      - value: 1316.25
-        errors:
-          - symerror: 691.95
-      - value: 1003.11
-        errors:
-          - symerror: 512.32
-      - value: 746.80
-        errors:
-          - symerror: 435.96
\ No newline at end of file
diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
index 44f6458b..8b4f8bd9 100644
--- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
+++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
@@ -901,31 +901,6 @@
     "utils.file_output.save_histograms(all_histograms['hist_dict'], \"histograms.root\")\n",
     "\n",
     "if USE_INFERENCE:\n",
-    "    utils.file_output.save_histograms(all_histograms['ml_hist_dict'],\n",
-    "                                      fileset,\n",
-    "                                      \"histograms_ml.root\",\n",
-    "                                      utils.config[\"ml\"][\"FEATURE_NAMES\"],\n",
-    "                                      add_offset=True)\n",
-    "    #Ok, here supposely needs to be the \n",
-    "    # REANA platform runs each workflow in an independent workspace"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7a53282f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Merging step\n",
-    "import uproot\n",
-    "#LIST_OF_FILES_PER_SAMPLE is a list of 9 files containing histograms per sample \n",
-    "\n",
-    "with uproot.recreate(\"histograms_merged.root\") as f_out:\n",
-    "    for h_file in LIST_OF_FILES_PER_SAMPLE:\n",
-    "        with uproot.open(\"h_file\") as f_per_sample:\n",
-    "            for key, value in f_per_sample.items():\n",
-    "                f_out[key] = value "
+    "    utils.file_output.save_histograms(all_histograms['ml_hist_dict'], \"histograms_ml.root\", add_offset=True)"
    ]
   },
@@ -974,7 +949,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "code",
    "execution_count": 16,
    "id": "09a83712",
@@ -1190,7 +1164,7 @@
    "We have a channel for each ML observable:"
    ]
   },
-{
+  {
    "cell_type": "code",
    "execution_count": 22,
    "id": "d8e36bac",
@@ -1332,7 +1306,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if utils.config[\"preservation\"][\"HEP_DATA\"] == False:\n",
+    "if utils.config[\"preservation\"][\"HEP_DATA\"] == True:\n",
     "    import utils.hepdata\n",
     "    #Submission of model prediction\n",
     "    utils.hepdata.submission_hep_data(model, model_prediction, \"hepdata_model\")\n",
   },
   {
-   "attachments": {},
-
    "cell_type": "markdown",
    "id": "a2ce2d14",
    "metadata": {},
diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
index 7a8b85c9..58cb3846 100644
--- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
+++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
@@ -25,7 +25,7 @@
 
 # ### Imports: setting up our environment
 
-# In[1]:
+
 
 
 import logging
@@ -70,7 +70,7 @@
 # 
 # The input files are all in the 1–3 GB range.
 
-# In[2]:
+
 
 
 ### GLOBAL CONFIGURATION
@@ -103,7 +103,7 @@
 # 
 # During the processing step, machine learning is used to calculate one of the variables used for this analysis. The models used are trained separately in the `jetassignment_training.ipynb` notebook. Jets in the events are assigned to labels corresponding with their parent partons using a boosted decision tree (BDT). More information about the model and training can be found within that notebook. 
 
-# In[3]:
+
 
 
 class TtbarAnalysis(processor.ProcessorABC):
@@ -353,7 +353,7 @@ def postprocess(self, accumulator):
 # 
 # Here, we gather all the required information about the files we want to process: paths to the files and asociated metadata.
 
-# In[4]:
+
 
 
 fileset = utils.file_input.construct_fileset(
@@ -372,7 +372,7 @@ def postprocess(self, accumulator):
 # 
 # Define the func_adl query to be used for the purpose of extracting columns and filtering.
 
-# In[5]:
+
 
 
 def get_query(source):
@@ -458,7 +458,7 @@ def get_query(source):
 # 
 # Using the queries created with `func_adl`, we are using `ServiceX` to read the CMS Open Data files to build cached files with only the specific event information as dictated by the query.
 
-# In[6]:
+
 
 
 if USE_SERVICEX:
@@ -495,7 +495,7 @@ def get_query(source):
 # 
 # When `USE_SERVICEX` is false, the input files need to be processed during this step as well.
 
-# In[7]:
+
 
 
 NanoAODSchema.warn_missing_crossrefs = False  # silences warnings about branches we will not use here
@@ -535,7 +535,7 @@ def get_query(source):
 
 print(f"\nexecution took {exec_time:.2f} seconds")
 
-# In[8]:
+
 
 
 # track metrics
@@ -546,7 +546,7 @@ def get_query(source):
 # Let's have a look at the data we obtained.
 # We built histograms in two phase space regions, for multiple physics processes and systematic variations.
 
-# In[9]:
+
 
 
 import utils.plotting  # noqa: E402
@@ -558,7 +558,7 @@ def get_query(source):
 plt.title("$\geq$ 4 jets, 1 b-tag")
 plt.xlabel("$H_T$ [GeV]");
 
-# In[10]:
+
 
 
 all_histograms["hist_dict"]["4j2b"][:, :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1,edgecolor="grey")
@@ -574,7 +574,7 @@ def get_query(source):
 # 
 # We are making of [UHI](https://uhi.readthedocs.io/) here to re-bin.
 
-# In[11]:
+
 
 
 # b-tagging variations
@@ -587,7 +587,7 @@ def get_query(source):
 plt.xlabel("$H_T$ [GeV]")
 plt.title("b-tagging variations");
 
-# In[12]:
+
 
 
 # jet energy scale variations
@@ -598,7 +598,7 @@ def get_query(source):
 plt.xlabel("$m_{bjj}$ [Gev]")
 plt.title("Jet energy variations");
 
-# In[13]:
+
 
 
 # ML inference variables
@@ -626,7 +626,7 @@ def get_query(source):
 # We'll save everything to disk for subsequent usage.
 # This also builds pseudo-data by combining events from the various simulation setups we have processed.
 
-# In[14]:
+
 
 
 utils.file_output.save_histograms(all_histograms['hist_dict'], "histograms.root")
@@ -643,7 +643,7 @@ def get_query(source):
 # A statistical model has been defined in `config.yml`, ready to be used with our output.
 # We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.
 
-# In[15]:
+
 
 
 import utils.rebinning  # noqa: E402
@@ -659,14 +659,14 @@ def get_query(source):
 # We can inspect the workspace with `pyhf`, or use `pyhf` to perform inference.
 
-# In[16]:
+
 
 
 !pyhf inspect workspace.json | head -n 20
 
 
 # Let's try out what we built: the next cell will perform a maximum likelihood fit of our statistical model to the pseudodata we built.
 
-# In[17]:
+
 
 
 model, data = cabinetry.model_utils.model_and_data(ws)
@@ -678,7 +678,7 @@ def get_query(source):
 
 # For this pseudodata, what is the resulting ttbar cross-section divided by the Standard Model prediction?
 
-# In[18]:
+
 
 
 poi_index = model.config.poi_index
@@ -687,7 +687,7 @@ def get_query(source):
 # Let's also visualize the model before and after the fit, in both the regions we are using.
 # The binning here corresponds to the binning used for the fit. 
 
-# In[19]:
+
 
 
 model_prediction = cabinetry.model_utils.prediction(model)
@@ -699,7 +699,7 @@ def get_query(source):
 # ### ML Validation
 # We can further validate our results by applying the above fit to different ML observables and checking for good agreement.
 
-# In[20]:
+
 
 
 # load the ml workspace (uses the ml observable instead of previous method)
@@ -717,7 +717,7 @@ def get_query(source):
 
     cabinetry.workspace.save(ws_pruned, "workspace_ml.json")
 
-# In[21]:
+
 
 
 if USE_INFERENCE:
@@ -725,27 +725,26 @@ def get_query(source):
 
 # We have a channel for each ML observable:
 
-# In[22]:
 
 !pyhf inspect workspace_ml.json | head -n 20
 
 
-# In[23]:
+
 
 
 # obtain model prediction before and after fit
 if USE_INFERENCE:
-    model_prediction_ml = cabinetry.model_utils.prediction(model_ml)
+    model_prediction = cabinetry.model_utils.prediction(model_ml)
     fit_results_mod = cabinetry.model_utils.match_fit_results(model_ml, fit_results)
     model_prediction_postfit = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)
 
-# In[24]:
+
 
 
 if USE_INFERENCE:
-    utils.plotting.plot_data_mc(model_prediction_ml, model_prediction_postfit, data_ml, config_ml)
+    utils.plotting.plot_data_mc(model_prediction, model_prediction_postfit, data_ml, config_ml)
+
 
-# In[ ]:
 
 
 if utils.config["preservation"]["HEP_DATA"] == True:
@@ -754,7 +753,6 @@ def get_query(source):
     utils.hepdata.submission_hep_data(model, model_prediction, "hepdata_model")
     #Submission of model_ml prediction
     utils.hepdata.submission_hep_data(model_ml, model_prediction_ml,"hepdata_model_ml")
-
 
 # ### What is next?
 # 
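Reviewer note on the merging cell deleted above (notebook id `7a53282f`): the removed code had two bugs worth recording in case the step is reintroduced for REANA, where each workflow stage runs in an independent workspace and the per-stage histogram files therefore need to be merged afterwards. First, `uproot.open("h_file")` opened the literal filename `"h_file"` instead of the loop variable; second, `LIST_OF_FILES_PER_SAMPLE` was never defined. A minimal corrected sketch, assuming a hypothetical `list_of_files_per_sample` list of per-sample ROOT files whose histogram keys do not overlap across files:

```python
# Sketch of a corrected merging step (not part of this patch).
import uproot

# Hypothetical placeholder: one histogram file per sample, produced by
# independent workflow steps.
list_of_files_per_sample = ["histograms_sample1.root", "histograms_sample2.root"]

with uproot.recreate("histograms_merged.root") as f_out:
    for h_file in list_of_files_per_sample:
        with uproot.open(h_file) as f_per_sample:  # pass the variable, not the string "h_file"
            for key, value in f_per_sample.items():
                # strip ROOT's cycle suffix (";1") so the merged file gets clean key names
                f_out[key.split(";")[0]] = value
```

Separately: this patch renames `model_prediction_ml` to `model_prediction` in the "obtain model prediction before and after fit" cell, but the unchanged HEP_DATA block still calls `utils.hepdata.submission_hep_data(model_ml, model_prediction_ml, "hepdata_model_ml")`, which will now raise a `NameError` when HEP_DATA preservation is enabled; the rename also overwrites the nominal-fit `model_prediction` that the `"hepdata_model"` submission expects.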