Skip to content

Commit

Permalink
Merge pull request #6 from belowlab/refactor
Browse files Browse the repository at this point in the history
Refactor
  • Loading branch information
jtb324 authored Feb 23, 2024
2 parents 192dbb4 + d741aab commit 750acb9
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 106 deletions.
158 changes: 84 additions & 74 deletions drive/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,86 +383,96 @@ def redo_clustering(

redo_vs = ibd_vs[ibd_vs.idnum.isin(network.haplotypes)]

# We are going to generate a new Networks object using the redo graph
redo_networks = ClusterHandler.generate_graph(redopd, redo_vs)
# redo_networks = ClusterHandler.generate_graph(redopd)
# performing the random walk
redo_walktrap_clusters = self.random_walk(redo_networks)

# If only one cluster is found
if len(redo_walktrap_clusters.sizes()) == 1:
# creates an empty dataframe with these columns
clst_conn = DataFrame(columns=["idnum", "conn", "conn.N", "TP"])
# iterate over each member id
# for idnum in network.haplotypes:
for idnum in network.members:
conn = sum(
list(
map(
lambda x: 1 / x,
redopd.loc[
(redopd["idnum1"] == idnum)
| (redopd["idnum2"] == idnum)
]["cm"],
# If the redopd or redo_vs is empty it causes strange behavior and the code will
# usually fail. The desired behavior is for the program to tell the user that
# the graph could not be constructed and then for it to move on.
if not redopd.empty and not redo_vs.empty:
# We are going to generate a new Networks object using the redo graph
redo_networks = ClusterHandler.generate_graph(redopd, redo_vs)
# redo_networks = ClusterHandler.generate_graph(redopd)
# performing the random walk
redo_walktrap_clusters = self.random_walk(redo_networks)

# If only one cluster is found
if len(redo_walktrap_clusters.sizes()) == 1:
# creates an empty dataframe with these columns
clst_conn = DataFrame(columns=["idnum", "conn", "conn.N", "TP"])
# iterate over each member id
# for idnum in network.haplotypes:
for idnum in network.members:
conn = sum(
list(
map(
lambda x: 1 / x,
redopd.loc[
(redopd["idnum1"] == idnum)
| (redopd["idnum2"] == idnum)
]["cm"],
)
)
)
conn_idnum = list(
redopd.loc[(redopd["idnum1"] == idnum)]["idnum2"]
) + list(redopd.loc[(redopd["idnum2"] == idnum)]["idnum1"])
conn_tp = len(
redopd.loc[
redopd["idnum1"].isin(conn_idnum)
& redopd["idnum2"].isin(conn_idnum)
].index
)
# assert 1 == 0
if len(conn_idnum) == 1:
connTP = 1
else:
try:
connTP = conn_tp / (
len(conn_idnum) * (len(conn_idnum) - 1) / 2
)
except ZeroDivisionError:
raise ZeroDivisionError(
f"There was a zero division error encountered when looking at the network with the id {idnum}" # noqa: E501
) # noqa: E501

clst_conn.loc[idnum] = [idnum, conn, len(conn_idnum), connTP]
rmID = list(
clst_conn.loc[
(
clst_conn["conn.N"]
> (self.segment_dist_threshold * len(network.members))
)
& (clst_conn["TP"] < self.minimum_connected_thres)
& (
clst_conn["conn"]
> sorted(clst_conn["conn"], reverse=True)[
int(self.hub_threshold * len(network.members))
]
)
]["idnum"]
)
conn_idnum = list(
redopd.loc[(redopd["idnum1"] == idnum)]["idnum2"]
) + list(redopd.loc[(redopd["idnum2"] == idnum)]["idnum1"])
conn_tp = len(
redopd.loc[
redopd["idnum1"].isin(conn_idnum)
& redopd["idnum2"].isin(conn_idnum)
].index

redopd = redopd.loc[
(~redopd["idnum1"].isin(rmID)) & (~redopd["idnum2"].isin(rmID))
]
redo_graph = self.generate_graph(
redopd,
)
# assert 1 == 0
if len(conn_idnum) == 1:
connTP = 1
else:
try:
connTP = conn_tp / (len(conn_idnum) * (len(conn_idnum) - 1) / 2)
except ZeroDivisionError:
raise ZeroDivisionError(
f"There was a zero division error encountered when looking at the network with the id {idnum}" # noqa: E501
) # noqa: E501

clst_conn.loc[idnum] = [idnum, conn, len(conn_idnum), connTP]
rmID = list(
clst_conn.loc[
(
clst_conn["conn.N"]
> (self.segment_dist_threshold * len(network.members))
)
& (clst_conn["TP"] < self.minimum_connected_thres)
& (
clst_conn["conn"]
> sorted(clst_conn["conn"], reverse=True)[
int(self.hub_threshold * len(network.members))
]
)
]["idnum"]
# redo_g = ig.Graph.DataFrame(redopd, directed=False)
redo_walktrap_clusters = self.random_walk(redo_graph)
# redo_walktrap = ig.Graph.community_walktrap(
# redo_g, weights="cm", steps=self.random_walk_step_size
# )
# redo_walktrap_clusters = redo_walktrap.as_clustering()

# Filter to the clusters that are larger than the minimum size
allclst = self.filter_cluster_size(redo_walktrap_clusters.sizes())

self.gather_cluster_info(
redo_networks, allclst, redo_walktrap_clusters, original_id
)

redopd = redopd.loc[
(~redopd["idnum1"].isin(rmID)) & (~redopd["idnum2"].isin(rmID))
]
redo_graph = self.generate_graph(
redopd,
else:
logger.debug(
f"A graph was not able to be generated when we attempted to recluster the network: {original_id}. This error probably indicates that none of the {len(network.haplotypes)} individuals in that specific network shared IBD segments with one another."
)
# redo_g = ig.Graph.DataFrame(redopd, directed=False)
redo_walktrap_clusters = self.random_walk(redo_graph)
# redo_walktrap = ig.Graph.community_walktrap(
# redo_g, weights="cm", steps=self.random_walk_step_size
# )
# redo_walktrap_clusters = redo_walktrap.as_clustering()

# Filter to the clusters that are larger than the minimum size
allclst = self.filter_cluster_size(redo_walktrap_clusters.sizes())

self.gather_cluster_info(
redo_networks, allclst, redo_walktrap_clusters, original_id
)


def cluster(
Expand Down
12 changes: 9 additions & 3 deletions drive/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from datetime import datetime
from pathlib import Path
import argparse
from typing import Optional
from rich_argparse import RichHelpFormatter


Expand Down Expand Up @@ -200,6 +199,13 @@ def main() -> None:
help="Threshold to filter the network length to remove hub individuals. (default: %(default)s)",
)

parser.add_argument(
"--phenotype-name",
default=None,
type=str,
help="If the user wishes to only run 1 phenotype from a file with multiple phenotypes they can prioritize a column that has the phenotype name. The name must match with the column.",
)

parser.add_argument(
"--hub-threshold",
default=0.01,
Expand All @@ -220,7 +226,7 @@ def main() -> None:
type=bool,
default=True,
action=argparse.BooleanOptionalAction,
help="whether or not the user wishes the program to automically recluster based on things lik hub threshold, max network size and how connected the graph is. ", # noqa: E501
help="whether or not the user wishes the program to automically recluster based on things like hub threshold, max network size and how connected the graph is. ", # noqa: E501
)

parser.add_argument(
Expand Down Expand Up @@ -279,7 +285,7 @@ def main() -> None:
# if the user has provided a phenotype file then we will determine case/control/
# exclusion counts. Otherwise we return an empty dictionary
if args.cases:
with PhenotypeFileParser(args.cases) as phenotype_file:
with PhenotypeFileParser(args.cases, args.phenotype_name) as phenotype_file:
phenotype_counts, cohort_ids = phenotype_file.parse_cases_and_controls()

logger.info(
Expand Down
9 changes: 8 additions & 1 deletion drive/filters/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,14 @@ def load_file(
if not ibd_file.is_file():
raise FileNotFoundError(f"The file, {ibd_file}, was not found")

input_file_chunks = read_csv(ibd_file, sep="\t", header=None, chunksize=100_100)
# we need to make sure that the id columns read in as strings no matter what
input_file_chunks = read_csv(
ibd_file,
sep="\t",
header=None,
chunksize=100_100,
dtype={indices.id1_indx: str, indices.id2_indx: str},
)

return cls(input_file_chunks, indices, target_gene)

Expand Down
98 changes: 71 additions & 27 deletions drive/utilities/parser/case_file_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import gzip
from logging import Logger
from pathlib import Path
from typing import Dict, List, Set, Tuple, TypeVar, Union
from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union

from drive.log import CustomLogger

Expand All @@ -18,19 +18,26 @@ class PhenotypeFileParser:
"""Parser used to read in the phenotype file. This will allow use to account for
different delimiters in files as well as catch errors."""

def __init__(self, filepath: Union[Path, str]) -> None:
def __init__(
self, filepath: Union[Path, str], phenotype_name: Optional[str] = None
) -> None:
"""Initialize the PhenotypeFileParser class.
Parameters
----------
filepath : Path | str
filepath to the phenotype file that has case control status for individuals
phenotype_name : str
Phenotype name that can be used to specify a specific column in a
phenotype matrix if the user only wants to focus on 1 phenotype.
Raises
------
FileNotFoundError
"""
self.individuals: List[str] = []
self.specific_phenotype: str = phenotype_name
# we are going to make sure the filepath variable is a
# PosixPath
filepath = Path(filepath)
Expand Down Expand Up @@ -140,28 +147,33 @@ def _determine_status(

# go through each value in the file
for indx, value in enumerate(line[1:]):
phenotype_mapping = phenotype_indx.get(indx)

if value == "1" or value == "1.0":
phenotype_dict[phenotype_mapping]["cases"].add(grid_id)
elif value == "0" or value == "0.0":
phenotype_dict[phenotype_mapping]["controls"].add(grid_id)
# we are going to excluded on values na, n/a, -1, -1.
# 0, "", " " to try to catch different values
elif value.lower() in ["na", "n/a", "-1", "-1.0", " ", ""]:
phenotype_dict[phenotype_mapping]["excluded"].add(grid_id)
else:
logger.warning(
f"The status for individual, {grid_id}, was not recognized. The status found in the file was {value} for phenotype {phenotype_mapping}. This individual will be added to the exclusion list but it is recommended that the user checks to ensure that this is not a typo in the phenotype file." # noqa: E501
)
phenotype_dict[phenotype_mapping]["excluded"].add(grid_id)
# If the user only wants to specify a specific column then there is a
# possibility of the indx number not being in the phenotype indx. We can
# use the walrus operator to make sure that we only attempt to
# determine the case control status for the phenotype if the .get function
# returns a value and not None
if phenotype_mapping := phenotype_indx.get(indx):
if value == "1" or value == "1.0":
phenotype_dict[phenotype_mapping]["cases"].add(grid_id)
elif value == "0" or value == "0.0":
phenotype_dict[phenotype_mapping]["controls"].add(grid_id)
# we are going to exclude on the values na, n/a, -1, -1.
# 0, "", " " to try to catch different values
elif value.lower() in ["na", "n/a", "-1", "-1.0", " ", ""]:
phenotype_dict[phenotype_mapping]["excluded"].add(grid_id)
else:
logger.warning(
f"The status for individual, {grid_id}, was not recognized. The status found in the file was {value} for phenotype {phenotype_mapping}. This individual will be added to the exclusion list but it is recommended that the user checks to ensure that this is not a typo in the phenotype file." # noqa: E501
)
phenotype_dict[phenotype_mapping]["excluded"].add(grid_id)

logger.debug(f"Phenotype dictionary after adding individuals: {phenotype_dict}")

def _create_phenotype_dictionary(
self,
header_line: str,
) -> Tuple[Dict[str, Dict[str, Set[str]]], Dict[int, str], str]:
"""Function that will generate a dictionary where the keys are
phenotypes and the values list of cases/exclusions/controls
phenotypes and the values are lists of the cases/exclusions/controls
Parameters
----------
Expand All @@ -178,6 +190,14 @@ def _create_phenotype_dictionary(
a dictionary that maps the index of the phenotype in the
header line to the phenotype name. The third element is the
separator string
Raises
------
ValueError
This function will raise a Value error if the value 'grid' or 'grids'
(uppercase or lowercase) is not found in the header line. The function
can also raise a value error if the user tries to specify a phenotype
name that is not in the header line.
"""

# determining what the appropriate separator should be
Expand All @@ -203,17 +223,41 @@ def _create_phenotype_dictionary(
# controls
phenotype_dict = {}

# build each dictionary
for indx, phenotype in enumerate(split_line_phenotypes):
phenotype_indx[indx] = phenotype
phenotype_dict[phenotype] = {
"cases": set(),
"controls": set(),
"excluded": set(),
}
# build a dictionary for each phenotype of the cases and the controls.
# If the user has specified a phenotype, we will only add that phenotype
# and we will add its corresponding index to the phenotype_indx dictionary
# This block will catch a value error if the user provides a phenotype name
# that is not in the file
if self.specific_phenotype:
try:
indx = split_line_phenotypes.index(self.specific_phenotype)
except ValueError:
logger.critical(
f"The value {self.specific_phenotype} was not found in the phenotype file's columns. Please make sure the phenotype name is spelled exactly as it appears in the phenotype file."
)
raise ValueError(
f"The value {self.specific_phenotype} was not found in the phenotype file."
)
else:
phenotype_indx[indx] = self.specific_phenotype
phenotype_dict[self.specific_phenotype] = {
"cases": set(),
"controls": set(),
"excluded": set(),
}
else:
for indx, phenotype in enumerate(split_line_phenotypes):
phenotype_indx[indx] = phenotype
phenotype_dict[phenotype] = {
"cases": set(),
"controls": set(),
"excluded": set(),
}

logger.debug(f"Phenotype index dictionary:\n {phenotype_indx}")
logger.debug(f"Phenotype counts dictionary: \n {phenotype_dict}")
logger.debug(
f"Phenotype counts dictionary has {len(phenotype_dict.keys())} PheCodes as keys after creation."
)
return phenotype_dict, phenotype_indx, separator

def parse_cases_and_controls(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
name = "drive-ibd"

version = "2.2.0"
version = "2.3.3"
description = "cli tool to identify networks of individuals who share IBD segments overlapping loci of interest"
authors = ["James Baker <[email protected]>", "Hung-Hsin Chen <[email protected]>", "Jennifer E. Below <[email protected]>"]
maintainers = ["James Baker <[email protected]>", "Hung-Hsin Chen <[email protected]>"]
Expand Down

0 comments on commit 750acb9

Please sign in to comment.