Merge pull request #7 from vantage6/change/review-comments

Address review comments
vantage6 · May 8, 2024 · ec2eb6c · ec2eb6c
2 parents fbc73db + 206c769
commit ec2eb6c
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 15 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,43 @@
+# This CITATION.cff file was generated with cffinit.
+# Visit https://bit.ly/cffinit to generate yours today!
+
+cff-version: 1.2.0
+title: vantage6 contingency table algorithm
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - given-names: Bart
+    family-names: Beusekom
+    name-particle: van
+    email: [email protected]
+    affiliation: IKNL
+    orcid: 'https://orcid.org/0000-0002-2183-2901'
+  - given-names: Frank
+    family-names: Martin
+    email: [email protected]
+    affiliation: IKNL
+    orcid: 'https://orcid.org/0000-0002-5897-1569'
+  - given-names: Hasan
+    family-names: Alradhi
+    email: [email protected]
+    affiliation: IKNL
+    orcid: 'https://orcid.org/0000-0001-7887-3926'
+# identifiers:
+#   - type: doi
+#     value: 10.5281/zenodo.7221216
+#     description: DOI of the code published on zenodo
+repository-code: 'https://github.com/vantage6/v6-crosstab-py'
+url: 'https://vantage6.ai/'
+abstract: >+
+  An algorithm that calculates a contingency table for a given dataset.
+
+keywords:
+  - crosstable
+  - contingency table
+  - data analysis
+  - vantage6
+  - privacy enhancing technology
+  - personal health train
+license: MIT
diff --git a/README.md b/README.md
@@ -1,3 +1,7 @@
+<h1 align="center">
+  <br>
+  <a href="https://vantage6.ai"><img src="https://github.com/IKNL/guidelines/blob/master/resources/logos/vantage6.png?raw=true" alt="vantage6" width="350"></a>
+</h1>
 
 # v6-crosstab-py
 
@@ -11,6 +15,14 @@ The base code for this algorithm has been created via the
 [v6-algorithm-template](https://github.com/vantage6/v6-algorithm-template)
 template generator.
 
+## Docker image
+
+The Docker image that contains this algorithm can be retrieved with:
+
+```
+docker pull harbor2.vantage6.ai/algorithms/crosstab
+```
+
 ## Dockerizing your algorithm
 
 To finally run your algorithm on the vantage6 infrastructure, you need to
@@ -20,7 +32,7 @@ create a Docker image of your algorithm.
 
 The easiest way to create a Docker image is to use the GitHub Actions pipeline to
 automatically build and push the Docker image. All that you need to do is push a
-commit to the ``main`` branch.
+commit to the `main` branch.
 
 ### Manually
 

diff --git a/docs/v6-crosstab-py/references.rst b/docs/v6-crosstab-py/references.rst
@@ -1,10 +1,21 @@
 References
 ==========
 
+Cite this implementation
+------------------------
+
 This particular algorithm has not been published yet. If you use this code in your
 research, please cite the following paper:
 
 1. Moncada-Torres, Arturo, et al. "VANTAGE6: an open source priVAcy preserviNg federaTed
    leArninG infrastructurE for Secure Insight eXchange." *AMIA annual symposium proceedings.*
    Vol. 2020. American Medical Informatics Association, 2020.
    `[link] <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8075508/>`_
+
+Used sources
+------------
+
+This implementation has been inspired by earlier implementations of the same algorithm:
+
+- https://github.com/IKNL/vantage6-algorithms/tree/crosstab
+- https://github.com/IKNL/v6-starter-crosstabulation-py
diff --git a/docs/v6-crosstab-py/usage.rst b/docs/v6-crosstab-py/usage.rst
@@ -4,21 +4,22 @@ How to use
 Input arguments
 ---------------
 
-Input arguments
----------------
-
 .. list-table::
    :widths: 20 80
    :header-rows: 1
 
    * - Argument
+     - Type
      - Description
    * - ``results_col``
+     - Column name (string)
      - The column whose categories will be the columns of the contingency table.
    * - ``group_cols``
+     - List of column names (list of strings)
      - One or more columns whose categories, or combinations of categories, will be the
        rows of the contingency table.
    * - ``organizations_to_include``
+     - List of integers
      - Which organizations to include in the computation.
 
 Python client example

diff --git a/v6-crosstab-py/central.py b/v6-crosstab-py/central.py
@@ -91,20 +91,21 @@ def _aggregate_results(results: dict, group_cols: list[str]) -> pd.DataFrame:
     """
     # The results are pandas DataFrames converted to JSON. Convert them back and
     # then add them together to get the final partial_df.
-    results = [pd.read_json(StringIO(result)) for result in results]
-
-    # set group cols as index
-    for idx, df in enumerate(results):
-        results[idx] = df.set_index(group_cols)
+    partial_dfs = []
+    for result in results:
+        df = pd.read_json(StringIO(result))
+        # set group cols as index
+        df.set_index(group_cols, inplace=True)
+        partial_dfs.append(df)
 
     # Get all unique values for the result column
-    all_result_levels = list(set([col for df in results for col in df.columns]))
+    all_result_levels = list(set([col for df in partial_dfs for col in df.columns]))
 
     # The partial results are already in the form of a contingency table, but they
     # contain ranges (e.g. "0-5"). These are converted to two columns: one for the
     # minimum value and one for the maximum value.
     converted_results = []
-    for partial_df in results:
+    for partial_df in partial_dfs:
         # expand the ranges to min and max values
         orig_columns = partial_df.columns
         for col in orig_columns:

diff --git a/v6-crosstab-py/partial.py b/v6-crosstab-py/partial.py
@@ -14,6 +14,10 @@
 from vantage6.algorithm.tools.util import info, warn, error
 from vantage6.algorithm.tools.decorators import data
 from vantage6.algorithm.tools.util import get_env_var
+from vantage6.algorithm.tools.exceptions import (
+    EnvironmentVariableError,
+    PrivacyThresholdViolation,
+)
 
 from .globals import (
     DEFAULT_PRIVACY_THRESHOLD,
@@ -27,7 +31,7 @@ def partial_crosstab(
     df: pd.DataFrame,
     results_col: str,
     group_cols: list[str],
-) -> Any:
+) -> str:
     """
     Decentral part of the algorithm
 
@@ -39,6 +43,16 @@ def partial_crosstab(
         The column for which counts are calculated
     group_cols : list[str]
         List of one or more columns to group the data by.
+
+    Returns
+    -------
+    str
+        The contingency table as a JSON string.
+
+    Raises
+    ------
+    PrivacyThresholdViolation
+        The privacy threshold is not met by any values in the contingency table.
     """
     # get environment variables with privacy settings
     # pylint: disable=invalid-name
@@ -78,7 +92,7 @@ def partial_crosstab(
             non_na_crosstab_df.index.get_level_values(col) != "N/A"
         ]
     if not (non_na_crosstab_df >= PRIVACY_THRESHOLD).any().any():
-        raise ValueError(
+        raise PrivacyThresholdViolation(
             "No values in the contingency table are higher than the privacy threshold "
             f"of {PRIVACY_THRESHOLD}. Please check if you submitted categorical "
             "variables - if you did, there may simply not be enough data at this node."
@@ -129,20 +143,26 @@ def _do_prestart_privacy_checks(
         The privacy threshold value.
     allow_zero : bool
         The flag indicating whether zero values are allowed.
+
+    Raises
+    ------
+    EnvironmentVariableError
+        The environment variables set by the node are not compatible.
+
     """
     minimum_rows_total = _convert_envvar_to_int(
         "CROSSTAB_MINIMUM_ROWS_TOTAL", DEFAULT_MINIMUM_ROWS_TOTAL
     )
 
     if privacy_threshold == 0 and not allow_zero:
-        raise ValueError(
+        raise EnvironmentVariableError(
             "Privacy threshold is set to 0, but zero values are not allowed. This "
             "directly contradicts each other - please change one of the settings."
         )
 
     # Check if dataframe contains enough rows
     if len(df) < minimum_rows_total:
-        raise ValueError(
+        raise PrivacyThresholdViolation(
             f"Dataframe contains less than {minimum_rows_total} rows. Refusing to "
             "handle this computation, as it may lead to privacy issues."
         )