diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index d7d82ce0..3d2727d0 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.14.0
+current_version = 0.14.2
 commit = True
 tag = True
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 45a4e603..0d8044e6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -47,17 +47,18 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Setup Micromamba environment
-        uses: mamba-org/provision-with-micromamba@v15
+        uses: mamba-org/setup-micromamba@v1
         with:
-          environment-file: false
           environment-name: ci
-          extra-specs: |
+          create-args: >-
             python=${{ matrix.python-version }}
             root=${{ matrix.root-version }}
             imagemagick
             ghostscript
             pip
-          channels: conda-forge
+          condarc: |
+            channels:
+              - conda-forge
 
       - name: ROOT info
         run: |
@@ -102,9 +103,9 @@ jobs:
 
       - name: Upload notebooks
         if: ${{ always() }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          name: notebooks py3
+          name: notebooks-${{ matrix.root-version }}-${{ matrix.python-version }}-${{ matrix.os }} py3-${{ matrix.root-version }}-${{ matrix.python-version }}-${{ matrix.os }}
           path: examples/*.html
 
       - name: Run pylint
diff --git a/README.md b/README.md
index 3adb2353..cd89e6d1 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ If you are not sure about your Python environment, please also see below how to
 
 ## Getting started
 
-For using `hepdata_lib`, you don't even need to install it, but can use the [binder](https://mybinder.org/) or [SWAN](https://swan.cern.ch/) (CERN-only) services using one of the buttons below and following the instructions in the notebook with name [Getting_started](examples/Getting_started.ipynb):
+To use `hepdata_lib`, you don't even need to install it: you can use the [binder](https://mybinder.org/) or [SWAN](https://swan.cern.ch/) (CERN-only) services via one of the buttons below:
 
 [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/HEPData/hepdata_lib/main?filepath=examples/Getting_started.ipynb) [![SWAN](https://swanserver.web.cern.ch/swanserver/images/badge_swan_white_150.png)](https://cern.ch/swanserver/cgi-bin/go/?projurl=https://github.com/HEPData/hepdata_lib.git)
diff --git a/docs/conf.py b/docs/conf.py
index e4fc3f55..ea64d692 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -39,9 +39,9 @@ def __getattr__(cls, name):
 author = 'Andreas Albert, Clemens Lange'
 
 # The short X.Y version
-version = '0.14.0'
+version = '0.14.2'
 # The full version, including alpha/beta/rc tags
-release = '0.14.0'
+release = '0.14.2'
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/dev.rst b/docs/dev.rst
index c39ee778..f4f4bcb0 100644
--- a/docs/dev.rst
+++ b/docs/dev.rst
@@ -121,4 +121,17 @@ Analysing the code
     pylint hepdata_lib/*.py
     pylint tests/*.py --rcfile=tests/pylintrc
 
-These commands are run by GitHub Actions, so you should first check locally that no issues are flagged.
\ No newline at end of file
+These commands are run by GitHub Actions, so you should first check locally that no issues are flagged.
+
+
+Making a release
+----------------
+
+After making a new release available on `PyPI`_, a `JIRA`_ issue (`example`_) should be opened to request that
+``hepdata_lib`` be upgraded in future `LCG Releases`_ used by `SWAN`_.
+
+.. _PyPI: https://pypi.org/project/hepdata-lib/
+.. _JIRA: https://its.cern.ch/jira/projects/SPI/
+.. _example: https://its.cern.ch/jira/browse/SPI-2507
+.. _LCG Releases: https://lcginfo.cern.ch/pkg/hepdata_lib/
+.. _SWAN: http://swan.cern.ch/
\ No newline at end of file
diff --git a/docs/setup.rst b/docs/setup.rst
index a93b7b57..cda8762d 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -34,6 +34,21 @@ On LXPLUS and many other local computing sites with CVMFS and Apptainer (previou
 This opens a new shell with ``hepdata_lib``, ROOT, and Python 3 available. Your home directory and most other user directories on the machine on which you execute Apptainer will also be accessible from within this shell.
 
+Using SWAN
+++++++++++
+
+`SWAN`_ requires a CERN account. ``hepdata_lib`` should already be installed in most recent `LCG Releases`_ used by
+SWAN. The latest LCG Release might not contain the latest ``hepdata_lib`` version. The `LCG Nightly`_, which may
+contain a more recent ``hepdata_lib`` version, can be used by selecting the "Bleeding Edge" software stack in the
+SWAN configuration. Alternatively, you can upgrade ``hepdata_lib`` by adding a local installation path to the
+``$PYTHONPATH`` in a startup script specified as the "Environment script" in the SWAN configuration (see
+`Install packages in CERNBox`_). Then execute ``!pip install hepdata_lib --user --upgrade`` in your Jupyter notebook
+to upgrade ``hepdata_lib`` to the latest version.
+
+.. _SWAN: http://swan.cern.ch/
+.. _LCG Releases: https://lcginfo.cern.ch/pkg/hepdata_lib/
+.. _LCG Nightly: https://lcginfo.cern.ch/#nightlies
+.. _Install packages in CERNBox: https://swan.docs.cern.ch/advanced/install_packages/
 
 .. _sec-setup-developers:
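Not part of the diff itself: to make the SWAN upgrade path above concrete, here is a minimal sketch of a notebook cell you could run after the ``pip`` upgrade to confirm which installation is picked up. The printed values are illustrative.

```python
# Run in a SWAN notebook cell after the upgrade described above.
# Confirms which hepdata_lib installation is on the import path.
import hepdata_lib

print("hepdata_lib version:", hepdata_lib.__version__)  # e.g. 0.14.2
print("loaded from:", hepdata_lib.__file__)  # user area vs. LCG release path
```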
diff --git a/docs/usage.rst b/docs/usage.rst
index 3ffb3292..9c046e24 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -351,4 +351,12 @@ After creating the Uncertainty objects, the only additional step is to attach the
     variable.add_uncertainty(unc1)
     variable.add_uncertainty(unc2)
 
-
+See `Uncertainties`_ for more guidance. In particular, note that ``hepdata_lib`` will omit the ``errors`` key from the
+YAML output if all uncertainties are zero for a particular bin, printing the warning message "Note that bins with zero
+content should preferably be omitted completely from the HEPData table". A legitimate use case is where there are
+multiple dependent variables and a (different) subset of the bins has missing content for some dependent variables.
+In this case, the uncertainties should be set to zero for the missing bins, together with a non-numeric central value such as ``'-'``.
+The warning message can be suppressed by passing the optional argument ``zero_uncertainties_warning=False`` when
+defining an instance of the ``Variable`` class.
+
+.. _`Uncertainties`: https://hepdata-submission.readthedocs.io/en/latest/data_yaml.html#uncertainties
\ No newline at end of file
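Not part of the diff: a short sketch of the pattern the ``docs/usage.rst`` text above describes. The constructor arguments mirror those exercised in ``tests/test_uncertainty.py`` later in this diff; the variable and uncertainty names are illustrative.

```python
from hepdata_lib import Variable, Uncertainty

# The third bin has no content for this dependent variable: give it a
# non-numeric central value and zero uncertainty, and silence the warning.
var = Variable("testvar", is_binned=False, values=[1, 2, '-', 4],
               zero_uncertainties_warning=False)
unc = Uncertainty("total", is_symmetric=True)
unc.values = [1.0, 1.5, 0, 2.5]
var.add_uncertainty(unc)
# var.make_dict() omits the 'errors' key for the third bin only.
```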
diff --git a/examples/Getting_started.ipynb b/examples/Getting_started.ipynb
index 2a636653..0934890a 100644
--- a/examples/Getting_started.ipynb
+++ b/examples/Getting_started.ipynb
@@ -6,15 +6,7 @@
    "source": [
     "# Getting started with hepdata_lib\n",
     "\n",
-    "The following instructions and examples should get you started to get your analysis into [HEPData](https://hepdata.net) using `hepdata_lib`. Please also refer to the [documentation](http://hepdata-lib.readthedocs.io/). While you can also run `hepdata_lib` on your local computer, you can use the [binder](https://mybinder.org/) or [SWAN](http://swan.cern.ch/) services in the browser. Mind that SWAN is only available for people with a CERN account.\n",
-    "\n",
-    "## SWAN-specific instructions\n",
-    "\n",
-    "For SWAN, if you haven't done so already, open up a new terminal (going back to the project tab and clicking the terminal icon in the top right), and enter the following to install `hepdata_lib`:\n",
-    "```\n",
-    "pip install --user hepdata_lib\n",
-    "```\n",
-    "Then go back to the notebook."
+    "The following instructions and examples should help you get your analysis into [HEPData](https://hepdata.net) using `hepdata_lib`. Please also refer to the [documentation](http://hepdata-lib.readthedocs.io/). While you can run `hepdata_lib` on your local computer, you can also use the [binder](https://mybinder.org/) or [SWAN](http://swan.cern.ch/) services in the browser. Note that SWAN is only available for people with a CERN account."
    ]
   },
   {
@@ -35,12 +27,14 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Welcome to JupyROOT 6.26/06\n"
+      "Welcome to JupyROOT 6.26/06\n",
+      "hepdata_lib version 0.14.1\n"
      ]
     }
    ],
    "source": [
-    "import hepdata_lib"
+    "import hepdata_lib\n",
+    "print(\"hepdata_lib version\", hepdata_lib.__version__)"
    ]
   },
   {
diff --git a/examples/reading_scikihep_histograms.ipynb b/examples/reading_scikihep_histograms.ipynb
deleted file mode 100644
index 7334b5cd..00000000
--- a/examples/reading_scikihep_histograms.ipynb
+++ /dev/null
@@ -1,381 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Reading histograms\n",
-    "\n",
-    "For the new python-based frameworks, another common format would needs\n",
-    "translation are histogram in the\n",
-    "[`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/). The functions in\n",
-    "the `hepdata_lib.hist_utils` will help you with that, and this notebook will\n",
-    "demonstrate how to do that.\n",
-    "\n",
-    "As explained in the [Getting started notebook](Getting_started.ipynb), a\n",
-    "`Submission` needs to exist or be created. Here, we'll just create one without\n",
-    "any additional information.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Welcome to JupyROOT 6.28/04\n"
-     ]
-    }
-   ],
-   "source": [
-    "from hepdata_lib import Submission\n",
-    "\n",
-    "submission = Submission()\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The common use-case for `scikit-hep` histograms is to allow for after-the-fact\n",
-    "slicing and grouping from a primary histogram. Let us first generate a fake\n",
-    "histogram that may appear in common histograms, as well as a common slicing\n",
-    "routine\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Hist(\n",
-       "  StrCategory(['data', 'QCD', 'ttbar'], name='dataset'),\n",
-       "  IntCategory([-1, 0, 4, 5], name='flavor'),\n",
-       "  Regular(60, -3, 3, name='eta'),\n",
-       "  Regular(20, 0, 500, name='pt'),\n",
-       "  storage=Weight()) # Sum: WeightedSum(value=221221, variance=123802) (WeightedSum(value=256973, variance=143935) with flow)"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import hist\n",
-    "import numpy as np\n",
-    "\n",
-    "rng = np.random.default_rng(seed=123_456_789)\n",
-    "\n",
-    "h = hist.Hist(\n",
-    "    hist.axis.StrCategory([\"data\", \"QCD\", \"ttbar\"], name=\"dataset\"),\n",
-    "    hist.axis.IntCategory([-1, 0, 4, 5], name=\"flavor\"),\n",
-    "    hist.axis.Regular(60, -3, +3, name=\"eta\"),\n",
-    "    hist.axis.Regular(20, 0, 500, name=\"pt\"),\n",
-    "    storage=hist.storage.Weight(),\n",
-    ")\n",
-    "\n",
-    "h.fill(  ## For mock data\n",
-    "    dataset=\"data\",\n",
-    "    flavor=-1,\n",
-    "    eta=rng.normal(0, 2.0, size=123_456),\n",
-    "    pt=rng.exponential(100, size=123_456),\n",
-    ")\n",
-    "h.fill(  ## For Mock QCD\n",
-    "    dataset=\"QCD\",\n",
-    "    flavor=rng.choice([0, 4, 5], size=1_000_000, p=[0.8, 0.15, 0.05]),\n",
-    "    eta=rng.normal(0.0, 2.0, size=1_000_000),\n",
-    "    pt=rng.exponential(100, size=1_000_000),\n",
-    "    weight=0.123456 * 2 * rng.random(size=1_000_000),\n",
-    ")\n",
-    "h.fill(  ## For mock ttbar\n",
-    "    dataset=\"ttbar\",\n",
-    "    flavor=rng.choice([0, 4, 5], size=1_000_000, p=[0.45, 0.1, 0.45]),\n",
-    "    eta=rng.normal(0.0, 1.5, size=1_000_000),\n",
-    "    pt=rng.exponential(200, size=1_000_000),\n",
-    "    weight=0.01 * 2 * rng.random(size=1_000_000),\n",
-    ")\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Example of manual processing to 1D array\n",
-    "\n",
-    "Let use create a simple slicing routine to get the various histograms of\n",
-    "interest, then use the most general function, the\n",
-    "`hepdata_lib.hist_utils.read_hist` method, to create arrays that will be\n",
-    "compatible with `hepdata_lib.Variable` declaration.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'hist_value': array([27405., 21382., 16585., 12740., 10069.,  7878.,  6007.,  4678.,\n",
-      "         3666.,  2903.,  2333.,  1734.,  1352.,  1048.,   851.,   634.,\n",
-      "          485.,   401.,   294.,   230.]),\n",
-      " 'hist_variance': array([27405., 21382., 16585., 12740., 10069.,  7878.,  6007.,  4678.,\n",
-      "         3666.,  2903.,  2333.,  1734.,  1352.,  1048.,   851.,   634.,\n",
-      "          485.,   401.,   294.,   230.]),\n",
-      " 'pt': array([(  0.,  25.), ( 25.,  50.), ( 50.,  75.), ( 75., 100.),\n",
-      "       (100., 125.), (125., 150.), (150., 175.), (175., 200.),\n",
-      "       (200., 225.), (225., 250.), (250., 275.), (275., 300.),\n",
-      "       (300., 325.), (325., 350.), (350., 375.), (375., 400.),\n",
-      "       (400., 425.), (425., 450.), (450., 475.), (475., 500.)],\n",
-      "      dtype=[('f0', '
diff --git a/hepdata_lib/hist_utils.py b/hepdata_lib/hist_utils.py
--- a/hepdata_lib/hist_utils.py
+++ b/hepdata_lib/hist_utils.py
@@ -15,38 +15,28 @@ def read_hist(histo: hist.Hist, flow: bool = False) -> Dict[str, numpy.ndarray]:
     can be used for hepdata_lib Variable and Uncertainty declaration.
 
     For all axes define in the histogram, a `hepdata_lib.Variable` with
-    `is_independent=True` should be declared. The `values` of this variable will
-    be stored in the return dictionary following the axes names.
+    `is_independent=True` should be declared. The `values` of this variable
+    will be stored in the return dictionary following the axes names.
 
-    Overflow and underflow bin will be handled using a single flag for all axes,
-    so be sure to declare/modify histogram axes properties according to your
-    needs.
+    Overflow and underflow bins will be handled using a single flag for all
+    axes, so be sure to declare/modify histogram axes properties according to
+    your needs.
 
     The storage content will be returned as is, so additional uncertainty
     processing will need to be handled by the user using the return values.
     """
-    axes_entries = [_get_histaxis_array(ax, flow=flow) for ax in histo.axes]
+    axes_entries = [_get_histaxis_array(ax, flow=flow) for ax in reversed(histo.axes)]
     axes_entries = numpy.meshgrid(*axes_entries)
 
-    ## Getting axes return values
+    # Getting axes return values
     readout = {
-        ax.name: axes_entries[idx].flatten() for idx, ax in enumerate(histo.axes)
+        ax.name: axes_entries[idx].flatten()
+        for idx, ax in enumerate(reversed(histo.axes))
     }
 
-    ## Getting the histogram return values
+    # Getting the histogram return values
     view = histo.view(flow=flow)
 
-    _storage_keys = {
-        hist.storage.Weight: ["value", "variance"],
-        hist.storage.Mean: ["value", "count", "_sum_of_deltas_squared"],
-        hist.storage.WeightedMean: [
-            "value",
-            "sum_of_weights",
-            "sum_of_weights_squared",
-            "_sum_of_weighted_deltas_squared",
-        ],
-    }
-
     if view.dtype.names is None:  # Single value storages
         readout["hist_value"] = view.flatten()
     else:
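Not part of the diff: a minimal sketch of how `read_hist` is used, illustrating the axis/value alignment that the `reversed(histo.axes)` change above is meant to guarantee. The axis names and binning here are made up.

```python
import hist
from hepdata_lib.hist_utils import read_hist

h2 = hist.Hist(
    hist.axis.Regular(2, 0.0, 2.0, name="x"),
    hist.axis.Regular(3, 0.0, 3.0, name="y"),
    storage=hist.storage.Weight(),
)
readout = read_hist(h2)
# readout["x"] and readout["y"] are flattened arrays of bin-edge pairs,
# aligned entry by entry with readout["hist_value"] and
# readout["hist_variance"], in the same order as h2.view().flatten().
```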
@@ -61,17 +51,17 @@ def _get_histaxis_array(axis, flow: bool) -> numpy.ndarray:
     Given an axis array, return the bin entries and a numpy array.
 
     For continuous axes, the return will be a Nx2 array of bin edge pairs. For
-    categorical axes, the return will be a N array of bin content values. If the
-    flow is set to true, the function will also add the overflow/underflow bins
-    according to the settings found in axis.traits. For categorical axis, this
-    will include an extra `"__UNDETERMINED__"` entry (for StrCategory) or an +1
-    entry (for IntCategory).
+    categorical axes, the return will be a N array of bin content values. If
+    the flow is set to true, the function will also add the overflow/underflow
+    bins according to the settings found in axis.traits. For categorical axis,
+    this will include an extra `"__UNDETERMINED__"` entry (for StrCategory) or
+    an +1 entry (for IntCategory).
     """
 
-    ## Getting the entries as a simple list
+    # Getting the entries as a simple list
     entries = list(axis)
 
-    ## Adding overflow bin
+    # Adding overflow bin
     if flow and axis.traits.overflow:
         if isinstance(axis, hist.axis.StrCategory):
             entries.append("__UNDETERMINED__")
@@ -82,14 +72,14 @@
         else:
             entries.append((axis.edges[-1], numpy.inf))
 
-    ## Adding underflow bin
+    # Adding underflow bin
     if flow and axis.traits.underflow:
-        if isinstance(axis,hist.axis.Integer):
+        if isinstance(axis, hist.axis.Integer):
             entries = [-numpy.inf] + entries
         else:
             entries = [(-numpy.inf, axis.edges[0])] + entries
 
-    ## Converting to numpy array
+    # Converting to numpy array
     if axis.traits.continuous:
         entries = numpy.array(entries, dtype="f,f")
     else:
@@ -114,7 +104,8 @@ def hist_as_variable(
 
     The `uncertainty` is a dictionary defining how uncertainties should be
     defined. Dictionary keys are used as the name of the uncertainty, while the
-    value defines how the uncertainty should be constructed. This can either be:
+    value defines how the uncertainty should be constructed. This can either
+    be:
 
     - `str`: either "poisson_asym" or "poisson_sym", indicating to extract
       Poisson uncertainty directly from the histogram values. (Either the
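Not part of the diff: a hedged sketch of the `uncertainty` mapping documented in the `hist_as_variable` docstring above. Only the string form is shown since it is the one spelled out there; the positional arguments (variable name, histogram) are assumptions, not taken from this diff.

```python
from hepdata_lib.hist_utils import hist_as_variable

# `h` is some hist.Hist, e.g. built as in the notebook shown earlier.
# The variable name "pt" is illustrative.
var = hist_as_variable(
    "pt", h,
    uncertainty={
        "stat": "poisson_sym",  # symmetric Poisson from the bin contents
    },
)
```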
@@ -185,8 +176,8 @@ def _make_poisson_unc_array(
 ) -> numpy.ndarray:
     """
     Given the results of `read_hist`, extract the Poisson uncertainty using
-    hist.intervals. Automatically detecting the histogram storage type to handle
-    weighted uncertainties
+    hist.intervals. The histogram storage type is detected automatically to
+    handle weighted uncertainties.
     """
     if symmetric:
         if "hist_variance" not in readout.keys():
diff --git a/setup.py b/setup.py
index 6a7d19a9..c2699e17 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 
 setup(
     name='hepdata_lib',
-    version='0.14.0',
+    version='0.14.2',
     description='Library for getting your data into HEPData',
     long_description=LONG_DESCRIPTION,
     long_description_content_type='text/markdown',
diff --git a/tests/test_uncertainty.py b/tests/test_uncertainty.py
index 3dd70a12..e121e3bf 100644
--- a/tests/test_uncertainty.py
+++ b/tests/test_uncertainty.py
@@ -47,7 +47,7 @@ def test_scale_values(self):
 
     def test_set_values_from_intervals(self):
         '''Test behavior of Uncertainy.test_set_values_from_intervals function'''
-        # Dummy central values and variatons relative to central value
+        # Dummy central values and variations relative to central value
         npoints = 100
         values = list(range(0, npoints, 1))
         uncertainty = [(-random.uniform(0, 1), random.uniform(0, 1))
@@ -91,3 +91,27 @@ def test_mixed_uncertainties(self):
         pattern = ['symerror', 'asymerror', 'asymerror', 'symerror']
         self.assertTrue((list(dictionary['values'][i]['errors'][0].keys())[
             0], value) for i, value in enumerate(pattern))
+
+    def test_zero_uncertainties(self):
+        '''Test cases where a data point has zero uncertainties'''
+
+        # Asymmetric uncertainties
+        var = Variable("testvar", is_binned=False, values=[1, 2, 3, 4])
+        unc = Uncertainty("fake_unc", is_symmetric=False)
+        unc.values = [(-1, 1), (-1.5, 2), (0, 0), (-2.5, 2.5)]
+        var.add_uncertainty(unc)
+        dictionary = var.make_dict()
+        # Check that 'errors' key is missing only if zero uncertainties
+        self.assertTrue(all('errors' in dictionary['values'][i] for i in [0, 1, 3]))
+        self.assertTrue('errors' not in dictionary['values'][2])
+
+        # Symmetric uncertainties (and use "zero_uncertainties_warning=False" option)
+        var = Variable("testvar", is_binned=False, values=[1, 2, 3, 4],
+                       zero_uncertainties_warning=False)
+        unc = Uncertainty("fake_unc", is_symmetric=True)
+        unc.values = [1, 1.5, 0, 2.5]
+        var.add_uncertainty(unc)
+        dictionary = var.make_dict()
+        # Check that 'errors' key is missing only if zero uncertainties
+        self.assertTrue(all('errors' in dictionary['values'][i] for i in [0, 1, 3]))
+        self.assertTrue('errors' not in dictionary['values'][2])
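Not part of the diff: relating to the `_make_poisson_unc_array` docstring touched above, a hedged sketch of the asymmetric Poisson extraction it describes, using `hist.intervals`. The helper name is hypothetical and the handling is illustrative, not the library's implementation.

```python
import numpy
import hist.intervals

def poisson_unc_pairs(readout):
    """Illustrative only: (down, up) offsets from the central values."""
    values = readout["hist_value"]
    # Weighted storages carry a variance; fall back to the values
    # themselves for plain (unweighted) counts.
    variances = readout.get("hist_variance", values)
    lower, upper = hist.intervals.poisson_interval(values, variances)
    return numpy.stack([lower - values, upper - values], axis=-1)
```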