From 206c76978ed66a04636538c8ff751d1e6e6b09b2 Mon Sep 17 00:00:00 2001 From: Bart van Beusekom Date: Mon, 29 Apr 2024 13:00:44 +0200 Subject: [PATCH] Address review comments --- CITATION.cff | 43 ++++++++++++++++++++++++++++++ README.md | 14 +++++++++- docs/v6-crosstab-py/references.rst | 11 ++++++++ docs/v6-crosstab-py/usage.rst | 7 ++--- v6-crosstab-py/central.py | 15 ++++++----- v6-crosstab-py/partial.py | 28 ++++++++++++++++--- 6 files changed, 103 insertions(+), 15 deletions(-) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..f0d39e7 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,43 @@ +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + +cff-version: 1.2.0 +title: vantage6 contingency table algorithm +message: >- + If you use this software, please cite it using the + metadata from this file. +type: software +authors: + - given-names: Bart + family-names: Beusekom + name-particle: van + email: b.vanbeusekom@iknl.nl + affiliation: IKNL + orcid: 'https://orcid.org/0000-0002-2183-2901' + - given-names: Frank + family-names: Martin + email: f.martin@iknl.nl + affiliation: IKNL + orcid: 'https://orcid.org/0000-0002-5897-1569' + - given-names: Hasan + family-names: Alradhi + email: h.alradhi@iknl.nl + affiliation: IKNL + orcid: 'https://orcid.org/0000-0001-7887-3926' +# identifiers: +# - type: doi +# value: 10.5281/zenodo.7221216 +# description: DOI of the code published on zenodo +repository-code: 'https://github.com/vantage6/v6-crosstab-py' +url: 'https://vantage6.ai/' +abstract: >+ + An algorithm that calculates a contingency table for a given dataset. + +keywords: + - crosstable + - contingency table + - data analysis + - vantage6 + - privacy enhancing technology + - personal health train +license: MIT diff --git a/README.md b/README.md index 6f8e5bf..6452fb8 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +

+
+ vantage6 +

# v6-crosstab-py @@ -11,6 +15,14 @@ The base code for this algorithm has been created via the [v6-algorithm-template](https://github.com/vantage6/v6-algorithm-template) template generator. +## Docker image + +The Docker image that contains this algorithm can be retrieved with: + +``` +docker pull harbor2.vantage6.ai/algorithms/crosstab +``` + ## Dockerizing your algorithm To finally run your algorithm on the vantage6 infrastructure, you need to @@ -20,7 +32,7 @@ create a Docker image of your algorithm. The easiest way to create a Docker image is to use the GitHub Actions pipeline to automatically build and push the Docker image. All that you need to do is push a -commit to the ``main`` branch. +commit to the `main` branch. ### Manually diff --git a/docs/v6-crosstab-py/references.rst b/docs/v6-crosstab-py/references.rst index 8281e57..6756aab 100644 --- a/docs/v6-crosstab-py/references.rst +++ b/docs/v6-crosstab-py/references.rst @@ -1,6 +1,9 @@ References ========== +Cite this implementation +------------------------ + This particular algorithm has not been published yet. If you use this code in your research, please cite the following paper: @@ -8,3 +11,11 @@ research, please cite the following paper: leArninG infrastructurE for Secure Insight eXchange." *AMIA annual symposium proceedings.* Vol. 2020. American Medical Informatics Association, 2020. `[link] `_ + +Used sources +------------ + +This implementation has been inspired by earlier implementations of the same algorithm: + +- https://github.com/IKNL/vantage6-algorithms/tree/crosstab +- https://github.com/IKNL/v6-starter-crosstabulation-py diff --git a/docs/v6-crosstab-py/usage.rst b/docs/v6-crosstab-py/usage.rst index 3ec22a3..a41f0fd 100644 --- a/docs/v6-crosstab-py/usage.rst +++ b/docs/v6-crosstab-py/usage.rst @@ -4,21 +4,22 @@ How to use Input arguments --------------- -Input arguments ---------------- - .. list-table:: :widths: 20 80 :header-rows: 1 * - Argument + - Type - Description * - ``results_col`` + - Column name (string) - The column whose categories will be the columns of the contingency table. * - ``group_cols`` + - List of column names (list of strings) - One or more columns whose categories, or combinations of categories, will be the rows of the contingency table. * - ``organizations_to_include`` + - List of integers - Which organizations to include in the computation. Python client example diff --git a/v6-crosstab-py/central.py b/v6-crosstab-py/central.py index c99dc05..0c6e74a 100644 --- a/v6-crosstab-py/central.py +++ b/v6-crosstab-py/central.py @@ -91,20 +91,21 @@ def _aggregate_results(results: dict, group_cols: list[str]) -> pd.DataFrame: """ # The results are pandas dictionaries converted to JSON. Convert them back and # then add them together to get the final partial_df. - results = [pd.read_json(StringIO(result)) for result in results] - - # set group cols as index - for idx, df in enumerate(results): - results[idx] = df.set_index(group_cols) + partial_dfs = [] + for result in results: + df = pd.read_json(StringIO(result)) + # set group cols as index + df.set_index(group_cols, inplace=True) + partial_dfs.append(df) # Get all unique values for the result column - all_result_levels = list(set([col for df in results for col in df.columns])) + all_result_levels = list(set([col for df in partial_dfs for col in df.columns])) # The partial results are already in the form of a contingency table, but they # contain ranges (e.g. "0-5"). These are converted to two columns: one for the # minimum value and one for the maximum value. converted_results = [] - for partial_df in results: + for partial_df in partial_dfs: # expand the ranges to min and max values orig_columns = partial_df.columns for col in orig_columns: diff --git a/v6-crosstab-py/partial.py b/v6-crosstab-py/partial.py index aacc763..5273b79 100644 --- a/v6-crosstab-py/partial.py +++ b/v6-crosstab-py/partial.py @@ -14,6 +14,10 @@ from vantage6.algorithm.tools.util import info, warn, error from vantage6.algorithm.tools.decorators import data from vantage6.algorithm.tools.util import get_env_var +from vantage6.algorithm.tools.exceptions import ( + EnvironmentVariableError, + PrivacyThresholdViolation, +) from .globals import ( DEFAULT_PRIVACY_THRESHOLD, @@ -27,7 +31,7 @@ def partial_crosstab( df: pd.DataFrame, results_col: str, group_cols: list[str], -) -> Any: +) -> str: """ Decentral part of the algorithm @@ -39,6 +43,16 @@ def partial_crosstab( The column for which counts are calculated group_cols : list[str] List of one or more columns to group the data by. + + Returns + ------- + str + The contingency table as a JSON string. + + Raises + ------ + PrivacyThresholdViolation + The privacy threshold is not met by any values in the contingency table. """ # get environment variables with privacy settings # pylint: disable=invalid-name @@ -78,7 +92,7 @@ def partial_crosstab( non_na_crosstab_df.index.get_level_values(col) != "N/A" ] if not (non_na_crosstab_df >= PRIVACY_THRESHOLD).any().any(): - raise ValueError( + raise PrivacyThresholdViolation( "No values in the contingency table are higher than the privacy threshold " f"of {PRIVACY_THRESHOLD}. Please check if you submitted categorical " "variables - if you did, there may simply not be enough data at this node." @@ -129,20 +143,26 @@ def _do_prestart_privacy_checks( The privacy threshold value. allow_zero : bool The flag indicating whether zero values are allowed. + + Raises + ------ + EnvironmentVariableError + The environment variables set by the node are not compatible. + """ minimum_rows_total = _convert_envvar_to_int( "CROSSTAB_MINIMUM_ROWS_TOTAL", DEFAULT_MINIMUM_ROWS_TOTAL ) if privacy_threshold == 0 and not allow_zero: - raise ValueError( + raise EnvironmentVariableError( "Privacy threshold is set to 0, but zero values are not allowed. This " "directly contradicts each other - please change one of the settings." ) # Check if dataframe contains enough rows if len(df) < minimum_rows_total: - raise ValueError( + raise PrivacyThresholdViolation( f"Dataframe contains less than {minimum_rows_total} rows. Refusing to " "handle this computation, as it may lead to privacy issues." )