Merge pull request #535 from UW-GAC/feature/phenotype-inventory-workflow-input

Generate input for the phenotype inventory workflow
UW-GAC · Apr 29, 2024 · 4ae379d · 4ae379d
2 parents 13ae966 + f0c2327
commit 4ae379d
Show file tree

Hide file tree

Showing 8 changed files with 612 additions and 6 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
       - id: isort
 
   - repo: https://github.com/PyCQA/flake8
-    rev: 3.9.2
+    rev: 7.0.0
     hooks:
       - id: flake8
         args: ['--config=setup.cfg']

diff --git a/add_phenotype_inventory_input_example_data.py b/add_phenotype_inventory_input_example_data.py
@@ -0,0 +1,50 @@
+# Temporary script to create some test data.
+# Run with: python manage.py shell < add_phenotype_inventory_input_example_data.py
+
+from anvil_consortium_manager.tests.factories import (
+    ManagedGroupFactory,
+    WorkspaceGroupSharingFactory,
+)
+
+from primed.cdsa.tests.factories import CDSAWorkspaceFactory
+from primed.dbgap.tests.factories import dbGaPWorkspaceFactory
+from primed.miscellaneous_workspaces.tests.factories import OpenAccessWorkspaceFactory
+from primed.primed_anvil.tests.factories import StudyFactory
+
+# Study that is attached to both the dbGaP and the open access workspaces.
+fhs = StudyFactory.create(short_name="FHS", full_name="Framingham Heart Study")
+
+# dbGaP workspace linked to the study through its study accession.
+workspace_dbgap = dbGaPWorkspaceFactory.create(
+    dbgap_study_accession__dbgap_phs=7,
+    dbgap_study_accession__studies=[fhs],
+    dbgap_version=33,
+    dbgap_participant_set=12,
+    dbgap_consent_code=1,
+    dbgap_consent_abbreviation="HMB",
+    workspace__name="DBGAP_FHS_v33_p12_HMB",
+)
+
+# CDSA workspace; the factory creates its own study with the given short name.
+workspace_cdsa = CDSAWorkspaceFactory.create(
+    study__short_name="MESA",
+    workspace__name="CDSA_MESA_HMB",
+)
+
+# Open access workspace; its study is attached after creation.
+workspace_open_access = OpenAccessWorkspaceFactory.create(
+    workspace__name="OPEN_ACCESS_FHS",
+)
+workspace_open_access.studies.add(fhs)
+
+# Share every example workspace with the PRIMED_ALL group.
+primed_all = ManagedGroupFactory.create(name="PRIMED_ALL")
+for example_workspace in (workspace_dbgap, workspace_cdsa, workspace_open_access):
+    WorkspaceGroupSharingFactory.create(
+        workspace=example_workspace.workspace,
+        group=primed_all,
+    )
diff --git a/primed/primed_anvil/helpers.py b/primed/primed_anvil/helpers.py
@@ -1,6 +1,9 @@
+from itertools import groupby
+
 import pandas as pd
-from anvil_consortium_manager.models import WorkspaceGroupSharing
-from django.db.models import Exists, F, OuterRef, Value
+from anvil_consortium_manager.models import ManagedGroup, WorkspaceGroupSharing
+from django.db.models import CharField, Exists, F, OuterRef, Value
+from django.db.models.functions import Concat
 
 from primed.cdsa.models import CDSAWorkspace
 from primed.dbgap.models import dbGaPWorkspace
@@ -127,3 +130,96 @@ def get_summary_table_data():
     # Convert to a list of dictionaries for passing to the django-tables2 table.
     data = data.to_dict(orient="records")
     return data
+
+
+def get_workspaces_for_phenotype_inventory():
+    """Get input to the primed-phenotype-inventory workflow.
+
+    This function generates the input for the "workspaces" field of the primed-phenotype-inventory workflow. Only
+    workspaces that have been shared with the PRIMED_ALL group are included.
+    See dockstore link: https://dockstore.org/workflows/github.com/UW-GAC/primed-inventory-workflows/primed_phenotype_inventory:main?tab=info
+
+    The "workspaces" field has the format:
+    {
+        "billing-project-1/workspace-1": "study1, study2",
+        "billing-project-2/workspace-2": "study3",
+        ...
+    }
+
+    Returns:
+        A dictionary mapping "<billing project name>/<workspace name>" to a string of the workspace's study short
+        names, sorted and joined with ", ". A workspace with no associated study maps to an empty string.
+
+    Raises:
+        ManagedGroup.DoesNotExist: if there is no ManagedGroup named "PRIMED_ALL".
+    """  # noqa: E501
+
+    # primed-all group. We will need this to determine if the workspace is shared with PRIMED_ALL.
+    primed_all = ManagedGroup.objects.get(name="PRIMED_ALL")
+
+    # Each workspace type reaches its studies through a different lookup path.
+    dbgap_workspaces = _get_phenotype_inventory_rows(
+        dbGaPWorkspace, "dbgap_study_accession__studies__short_name", primed_all
+    )
+    cdsa_workspaces = _get_phenotype_inventory_rows(
+        CDSAWorkspace, "study__short_name", primed_all
+    )
+    open_access_workspaces = _get_phenotype_inventory_rows(
+        OpenAccessWorkspace, "studies__short_name", primed_all
+    )
+
+    # Combine all querysets and process into the expected output for the AnVIL workflow.
+    workspaces = dbgap_workspaces.union(cdsa_workspaces).union(open_access_workspaces)
+
+    def workspace_key(row):
+        return row["workspace_name"]
+
+    # groupby only merges *adjacent* rows with the same key, and a union queryset has no guaranteed
+    # ordering. Sort first so that a workspace with several studies always ends up in a single group
+    # instead of a later group silently overwriting an earlier one.
+    rows = sorted(workspaces, key=workspace_key)
+
+    workflow_input = {}
+    for workspace_name, group in groupby(rows, workspace_key):
+        # A workspace without any study yields a single row with a null study name; report it as "".
+        study_names = [row["study_names"] or "" for row in group]
+        workflow_input[workspace_name] = ", ".join(sorted(study_names))
+
+    return workflow_input
+
+
+def _get_phenotype_inventory_rows(model, study_lookup, group):
+    """Return (workspace_name, study_names) rows for workspaces of one type shared with a group.
+
+    Args:
+        model: The workspace data model to query (e.g., dbGaPWorkspace).
+        study_lookup: Field lookup path from ``model`` to the short name of its study or studies.
+        group: The ManagedGroup that a workspace must be shared with to be included.
+
+    Returns:
+        A values() queryset with one row per (workspace, study) pair, containing the keys
+        "workspace_name" ("<billing project name>/<workspace name>") and "study_names" (a single
+        study short name).
+    """
+    return (
+        model.objects.filter(
+            # Just those that are shared with the group.
+            workspace__workspacegroupsharing__group=group,
+        )
+        .annotate(
+            workspace_name=Concat(
+                F("workspace__billing_project__name"),
+                Value("/"),
+                F("workspace__name"),
+                output_field=CharField(),
+            ),
+            study_names=F(study_lookup),
+        )
+        .values(
+            "workspace_name",
+            "study_names",
+        )
+    )