From 2bcc68ba0af245c31817b82018b6f2be7f79701d Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Fri, 9 Feb 2024 14:31:42 +0100 Subject: [PATCH] Fixing N-dimensional histogram entry ordering --- examples/reading_scikihep_histograms.ipynb | 381 ----------- examples/reading_scikithep_histograms.ipynb | 689 ++++++++++++++++++++ hepdata_lib/hist_utils.py | 63 +- 3 files changed, 716 insertions(+), 417 deletions(-) delete mode 100644 examples/reading_scikihep_histograms.ipynb create mode 100644 examples/reading_scikithep_histograms.ipynb diff --git a/examples/reading_scikihep_histograms.ipynb b/examples/reading_scikihep_histograms.ipynb deleted file mode 100644 index 7334b5cd..00000000 --- a/examples/reading_scikihep_histograms.ipynb +++ /dev/null @@ -1,381 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Reading histograms\n", - "\n", - "For the new python-based frameworks, another common format would needs\n", - "translation are histogram in the\n", - "[`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/). The functions in\n", - "the `hepdata_lib.hist_utils` will help you with that, and this notebook will\n", - "demonstrate how to do that.\n", - "\n", - "As explained in the [Getting started notebook](Getting_started.ipynb), a\n", - "`Submission` needs to exist or be created. Here, we'll just create one without\n", - "any additional information.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome to JupyROOT 6.28/04\n" - ] - } - ], - "source": [ - "from hepdata_lib import Submission\n", - "\n", - "submission = Submission()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The common use-case for `scikit-hep` histograms is to allow for after-the-fact\n", - "slicing and grouping from a primary histogram. Let us first generate a fake\n", - "histogram that may appear in common histograms, as well as a common slicing\n", - "routine\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Hist(\n", - " StrCategory(['data', 'QCD', 'ttbar'], name='dataset'),\n", - " IntCategory([-1, 0, 4, 5], name='flavor'),\n", - " Regular(60, -3, 3, name='eta'),\n", - " Regular(20, 0, 500, name='pt'),\n", - " storage=Weight()) # Sum: WeightedSum(value=221221, variance=123802) (WeightedSum(value=256973, variance=143935) with flow)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import hist\n", - "import numpy as np\n", - "\n", - "rng = np.random.default_rng(seed=123_456_789)\n", - "\n", - "h = hist.Hist(\n", - " hist.axis.StrCategory([\"data\", \"QCD\", \"ttbar\"], name=\"dataset\"),\n", - " hist.axis.IntCategory([-1, 0, 4, 5], name=\"flavor\"),\n", - " hist.axis.Regular(60, -3, +3, name=\"eta\"),\n", - " hist.axis.Regular(20, 0, 500, name=\"pt\"),\n", - " storage=hist.storage.Weight(),\n", - ")\n", - "\n", - "h.fill( ## For mock data\n", - " dataset=\"data\",\n", - " flavor=-1,\n", - " eta=rng.normal(0, 2.0, size=123_456),\n", - " pt=rng.exponential(100, size=123_456),\n", - ")\n", - "h.fill( ## For Mock QCD\n", - " dataset=\"QCD\",\n", - " flavor=rng.choice([0, 4, 5], size=1_000_000, p=[0.8, 0.15, 0.05]),\n", - " eta=rng.normal(0.0, 2.0, size=1_000_000),\n", - " pt=rng.exponential(100, size=1_000_000),\n", - " weight=0.123456 * 2 * rng.random(size=1_000_000),\n", - ")\n", - "h.fill( ## For mock ttbar\n", - " dataset=\"ttbar\",\n", - " flavor=rng.choice([0, 4, 5], size=1_000_000, p=[0.45, 0.1, 0.45]),\n", - " eta=rng.normal(0.0, 1.5, size=1_000_000),\n", - " pt=rng.exponential(200, size=1_000_000),\n", - " weight=0.01 * 2 * rng.random(size=1_000_000),\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example of manual processing to 1D array\n", - "\n", - "Let use create a simple slicing routine to get the various histograms of\n", - "interest, then use the most general function, the\n", - "`hepdata_lib.hist_utils.read_hist` method, to create arrays that will be\n", - "compatible with `hepdata_lib.Variable` declaration.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'hist_value': array([27405., 21382., 16585., 12740., 10069., 7878., 6007., 4678.,\n", - " 3666., 2903., 2333., 1734., 1352., 1048., 851., 634.,\n", - " 485., 401., 294., 230.]),\n", - " 'hist_variance': array([27405., 21382., 16585., 12740., 10069., 7878., 6007., 4678.,\n", - " 3666., 2903., 2333., 1734., 1352., 1048., 851., 634.,\n", - " 485., 401., 294., 230.]),\n", - " 'pt': array([( 0., 25.), ( 25., 50.), ( 50., 75.), ( 75., 100.),\n", - " (100., 125.), (125., 150.), (150., 175.), (175., 200.),\n", - " (200., 225.), (225., 250.), (250., 275.), (275., 300.),\n", - " (300., 325.), (325., 350.), (350., 375.), (375., 400.),\n", - " (400., 425.), (425., 450.), (450., 475.), (475., 500.)],\n", - " dtype=[('f0', ' Dict[str, numpy.ndarray]: @@ -15,38 +15,28 @@ def read_hist(histo: hist.Hist, flow: bool = False) -> Dict[str, numpy.ndarray]: can be used for hepdata_lib Variable and Uncertainty declaration. For all axes define in the histogram, a `hepdata_lib.Variable` with - `is_independent=True` should be declared. The `values` of this variable will - be stored in the return dictionary following the axes names. + `is_independent=True` should be declared. The `values` of this variable + will be stored in the return dictionary following the axes names. - Overflow and underflow bin will be handled using a single flag for all axes, - so be sure to declare/modify histogram axes properties according to your - needs. + Overflow and underflow bin will be handled using a single flag for all + axes, so be sure to declare/modify histogram axes properties according to + your needs. The storage content will be returned as is, so additional uncertainty processing will need to be handled by the user using the return values. """ - axes_entries = [_get_histaxis_array(ax, flow=flow) for ax in histo.axes] + axes_entries = [_get_histaxis_array(ax, flow=flow) for ax in reversed(histo.axes)] axes_entries = numpy.meshgrid(*axes_entries) - ## Getting axes return values + # Getting axes return values readout = { - ax.name: axes_entries[idx].flatten() for idx, ax in enumerate(histo.axes) + ax.name: axes_entries[idx].flatten() + for idx, ax in enumerate(reversed(histo.axes)) } - ## Getting the histogram return values + # Getting the histogram return values view = histo.view(flow=flow) - _storage_keys = { - hist.storage.Weight: ["value", "variance"], - hist.storage.Mean: ["value", "count", "_sum_of_deltas_squared"], - hist.storage.WeightedMean: [ - "value", - "sum_of_weights", - "sum_of_weights_squared", - "_sum_of_weighted_deltas_squared", - ], - } - if view.dtype.names is None: # Single value storages readout["hist_value"] = view.flatten() else: @@ -61,17 +51,17 @@ def _get_histaxis_array(axis, flow: bool) -> numpy.ndarray: Given an axis array, return the bin entries and a numpy array. For continuous axes, the return will be a Nx2 array of bin edge pairs. For - categorical axes, the return will be a N array of bin content values. If the - flow is set to true, the function will also add the overflow/underflow bins - according to the settings found in axis.traits. For categorical axis, this - will include an extra `"__UNDETERMINED__"` entry (for StrCategory) or an +1 - entry (for IntCategory). + categorical axes, the return will be a N array of bin content values. If + the flow is set to true, the function will also add the overflow/underflow + bins according to the settings found in axis.traits. For categorical axis, + this will include an extra `"__UNDETERMINED__"` entry (for StrCategory) or + an +1 entry (for IntCategory). """ - ## Getting the entries as a simple list + # Getting the entries as a simple list entries = list(axis) - ## Adding overflow bin + # Adding overflow bin if flow and axis.traits.overflow: if isinstance(axis, hist.axis.StrCategory): entries.append("__UNDETERMINED__") @@ -82,14 +72,14 @@ def _get_histaxis_array(axis, flow: bool) -> numpy.ndarray: else: entries.append((axis.edges[-1], numpy.inf)) - ## Adding underflow bin + # Adding underflow bin if flow and axis.traits.underflow: - if isinstance(axis,hist.axis.Integer): + if isinstance(axis, hist.axis.Integer): entries = [-numpy.inf] + entries else: entries = [(-numpy.inf, axis.edges[0])] + entries - ## Converting to numpy array + # Converting to numpy array if axis.traits.continuous: entries = numpy.array(entries, dtype="f,f") else: @@ -114,7 +104,8 @@ def hist_as_variable( The `uncertainty` is a dictionary defining how uncertainties should be defined. Dictionary keys are used as the name of the uncertainty, while the - value defines how the uncertainty should be constructed. This can either be: + value defines how the uncertainty should be constructed. This can either + be: - `str`: either "poisson_asym" or "poisson_sym", indicating to extract Poisson uncertainty directly from the histogram values. (Either the @@ -185,8 +176,8 @@ def _make_poisson_unc_array( ) -> numpy.ndarray: """ Given the results of `read_hist`, extract the Poisson uncertainty using - hist.intervals. Automatically detecting the histogram storage type to handle - weighted uncertainties + hist.intervals. Automatically detecting the histogram storage type to + handle weighted uncertainties """ if symmetric: if "hist_variance" not in readout.keys():