diff --git a/drive/cluster/cluster.py b/drive/cluster/cluster.py index 54697ff..ca04248 100644 --- a/drive/cluster/cluster.py +++ b/drive/cluster/cluster.py @@ -383,86 +383,96 @@ def redo_clustering( redo_vs = ibd_vs[ibd_vs.idnum.isin(network.haplotypes)] - # We are going to generate a new Networks object using the redo graph - redo_networks = ClusterHandler.generate_graph(redopd, redo_vs) - # redo_networks = ClusterHandler.generate_graph(redopd) - # performing the random walk - redo_walktrap_clusters = self.random_walk(redo_networks) - - # If only one cluster is found - if len(redo_walktrap_clusters.sizes()) == 1: - # creates an empty dataframe with these columns - clst_conn = DataFrame(columns=["idnum", "conn", "conn.N", "TP"]) - # iterate over each member id - # for idnum in network.haplotypes: - for idnum in network.members: - conn = sum( - list( - map( - lambda x: 1 / x, - redopd.loc[ - (redopd["idnum1"] == idnum) - | (redopd["idnum2"] == idnum) - ]["cm"], + # If the redopd or redo_vs is empty it causes strange behavior and the code will + # usually fail. The desired behavior is for the program to tell teh user that + # the graph could not be constructed and then for it to move on. + if not redopd.empty and not redo_vs.empty: + # We are going to generate a new Networks object using the redo graph + redo_networks = ClusterHandler.generate_graph(redopd, redo_vs) + # redo_networks = ClusterHandler.generate_graph(redopd) + # performing the random walk + redo_walktrap_clusters = self.random_walk(redo_networks) + + # If only one cluster is found + if len(redo_walktrap_clusters.sizes()) == 1: + # creates an empty dataframe with these columns + clst_conn = DataFrame(columns=["idnum", "conn", "conn.N", "TP"]) + # iterate over each member id + # for idnum in network.haplotypes: + for idnum in network.members: + conn = sum( + list( + map( + lambda x: 1 / x, + redopd.loc[ + (redopd["idnum1"] == idnum) + | (redopd["idnum2"] == idnum) + ]["cm"], + ) ) ) + conn_idnum = list( + redopd.loc[(redopd["idnum1"] == idnum)]["idnum2"] + ) + list(redopd.loc[(redopd["idnum2"] == idnum)]["idnum1"]) + conn_tp = len( + redopd.loc[ + redopd["idnum1"].isin(conn_idnum) + & redopd["idnum2"].isin(conn_idnum) + ].index + ) + # assert 1 == 0 + if len(conn_idnum) == 1: + connTP = 1 + else: + try: + connTP = conn_tp / ( + len(conn_idnum) * (len(conn_idnum) - 1) / 2 + ) + except ZeroDivisionError: + raise ZeroDivisionError( + f"There was a zero division error encountered when looking at the network with the id {idnum}" # noqa: E501 + ) # noqa: E501 + + clst_conn.loc[idnum] = [idnum, conn, len(conn_idnum), connTP] + rmID = list( + clst_conn.loc[ + ( + clst_conn["conn.N"] + > (self.segment_dist_threshold * len(network.members)) + ) + & (clst_conn["TP"] < self.minimum_connected_thres) + & ( + clst_conn["conn"] + > sorted(clst_conn["conn"], reverse=True)[ + int(self.hub_threshold * len(network.members)) + ] + ) + ]["idnum"] ) - conn_idnum = list( - redopd.loc[(redopd["idnum1"] == idnum)]["idnum2"] - ) + list(redopd.loc[(redopd["idnum2"] == idnum)]["idnum1"]) - conn_tp = len( - redopd.loc[ - redopd["idnum1"].isin(conn_idnum) - & redopd["idnum2"].isin(conn_idnum) - ].index + + redopd = redopd.loc[ + (~redopd["idnum1"].isin(rmID)) & (~redopd["idnum2"].isin(rmID)) + ] + redo_graph = self.generate_graph( + redopd, ) - # assert 1 == 0 - if len(conn_idnum) == 1: - connTP = 1 - else: - try: - connTP = conn_tp / (len(conn_idnum) * (len(conn_idnum) - 1) / 2) - except ZeroDivisionError: - raise ZeroDivisionError( - f"There was a zero division error encountered when looking at the network with the id {idnum}" # noqa: E501 - ) # noqa: E501 - - clst_conn.loc[idnum] = [idnum, conn, len(conn_idnum), connTP] - rmID = list( - clst_conn.loc[ - ( - clst_conn["conn.N"] - > (self.segment_dist_threshold * len(network.members)) - ) - & (clst_conn["TP"] < self.minimum_connected_thres) - & ( - clst_conn["conn"] - > sorted(clst_conn["conn"], reverse=True)[ - int(self.hub_threshold * len(network.members)) - ] - ) - ]["idnum"] + # redo_g = ig.Graph.DataFrame(redopd, directed=False) + redo_walktrap_clusters = self.random_walk(redo_graph) + # redo_walktrap = ig.Graph.community_walktrap( + # redo_g, weights="cm", steps=self.random_walk_step_size + # ) + # redo_walktrap_clusters = redo_walktrap.as_clustering() + + # Filter to the clusters that are llarger than the minimum size + allclst = self.filter_cluster_size(redo_walktrap_clusters.sizes()) + + self.gather_cluster_info( + redo_networks, allclst, redo_walktrap_clusters, original_id ) - - redopd = redopd.loc[ - (~redopd["idnum1"].isin(rmID)) & (~redopd["idnum2"].isin(rmID)) - ] - redo_graph = self.generate_graph( - redopd, + else: + logger.debug( + f"A graph was not able to be generated when we attempted to recluster the network: {original_id}. This error probably indicates that there were There were none of the {len(network.haplotypes)} individuals in that specific network that shared ibd segments with one another." ) - # redo_g = ig.Graph.DataFrame(redopd, directed=False) - redo_walktrap_clusters = self.random_walk(redo_graph) - # redo_walktrap = ig.Graph.community_walktrap( - # redo_g, weights="cm", steps=self.random_walk_step_size - # ) - # redo_walktrap_clusters = redo_walktrap.as_clustering() - - # Filter to the clusters that are llarger than the minimum size - allclst = self.filter_cluster_size(redo_walktrap_clusters.sizes()) - - self.gather_cluster_info( - redo_networks, allclst, redo_walktrap_clusters, original_id - ) def cluster( diff --git a/drive/drive.py b/drive/drive.py index 87d12c7..c1ce91d 100755 --- a/drive/drive.py +++ b/drive/drive.py @@ -3,7 +3,6 @@ from datetime import datetime from pathlib import Path import argparse -from typing import Optional from rich_argparse import RichHelpFormatter @@ -200,6 +199,13 @@ def main() -> None: help="Threshold to filter the network length to remove hub individuals. (default: %(default)s)", ) + parser.add_argument( + "--phenotype-name", + default=None, + type=str, + help="If the user wishes to only run 1 phenotype from a file with multiple phenotypes they can prioritize a column that has the phenotype name. The name must match with the column.", + ) + parser.add_argument( "--hub-threshold", default=0.01, @@ -220,7 +226,7 @@ def main() -> None: type=bool, default=True, action=argparse.BooleanOptionalAction, - help="whether or not the user wishes the program to automically recluster based on things lik hub threshold, max network size and how connected the graph is. ", # noqa: E501 + help="whether or not the user wishes the program to automically recluster based on things like hub threshold, max network size and how connected the graph is. ", # noqa: E501 ) parser.add_argument( @@ -279,7 +285,7 @@ def main() -> None: # if the user has provided a phenotype file then we will determine case/control/ # exclusion counts. Otherwise we return an empty dictionary if args.cases: - with PhenotypeFileParser(args.cases) as phenotype_file: + with PhenotypeFileParser(args.cases, args.phenotype_name) as phenotype_file: phenotype_counts, cohort_ids = phenotype_file.parse_cases_and_controls() logger.info( diff --git a/drive/filters/filter.py b/drive/filters/filter.py index 4909a26..2c2826c 100644 --- a/drive/filters/filter.py +++ b/drive/filters/filter.py @@ -68,7 +68,14 @@ def load_file( if not ibd_file.is_file(): raise FileNotFoundError(f"The file, {ibd_file}, was not found") - input_file_chunks = read_csv(ibd_file, sep="\t", header=None, chunksize=100_100) + # we need to make sure that the id columns read in as strings no matter what + input_file_chunks = read_csv( + ibd_file, + sep="\t", + header=None, + chunksize=100_100, + dtype={indices.id1_indx: str, indices.id2_indx: str}, + ) return cls(input_file_chunks, indices, target_gene) diff --git a/drive/utilities/parser/case_file_parser.py b/drive/utilities/parser/case_file_parser.py index 71110bb..56956cc 100644 --- a/drive/utilities/parser/case_file_parser.py +++ b/drive/utilities/parser/case_file_parser.py @@ -4,7 +4,7 @@ import gzip from logging import Logger from pathlib import Path -from typing import Dict, List, Set, Tuple, TypeVar, Union +from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union from drive.log import CustomLogger @@ -18,7 +18,9 @@ class PhenotypeFileParser: """Parser used to read in the phenotype file. This will allow use to account for different delimiters in files as well as catch errors.""" - def __init__(self, filepath: Union[Path, str]) -> None: + def __init__( + self, filepath: Union[Path, str], phenotype_name: Optional[str] = None + ) -> None: """Initialize the PhenotypeFileParser class. Parameters @@ -26,11 +28,16 @@ def __init__(self, filepath: Union[Path, str]) -> None: filepath : Path | str filepath to the phenotype file that has case control status for individuals + phenotype_name : str + Phenotype name that can be used specify a specific column in a + phenotype matrix if the user only wants ot focus on 1 phenotype. + Raises ------ FileNotFoundError """ self.individuals: List[str] = [] + self.specific_phenotype: str = phenotype_name # we are going to make sure the filepath variable is a # PosixPath filepath = Path(filepath) @@ -140,28 +147,33 @@ def _determine_status( # go through each value in the file for indx, value in enumerate(line[1:]): - phenotype_mapping = phenotype_indx.get(indx) - - if value == "1" or value == "1.0": - phenotype_dict[phenotype_mapping]["cases"].add(grid_id) - elif value == "0" or value == "0.0": - phenotype_dict[phenotype_mapping]["controls"].add(grid_id) - # we are going to excluded on values na, n/a, -1, -1. - # 0, "", " " to try to catch different values - elif value.lower() in ["na", "n/a", "-1", "-1.0", " ", ""]: - phenotype_dict[phenotype_mapping]["excluded"].add(grid_id) - else: - logger.warning( - f"The status for individual, {grid_id}, was not recognized. The status found in the file was {value} for phenotype {phenotype_mapping}. This individual will be added to the exclusion list but it is recommended that the user checks to ensure that this is not a typo in the phenotype file." # noqa: E501 - ) - phenotype_dict[phenotype_mapping]["excluded"].add(grid_id) + # If the user only wants to specify a specific column then there is a + # possible of the indx number not being in the phenotype indx. We can + # use the walrus operator to make sure that we only attempt to + # determine the case control status for the phenotype if .get function # returns a value and not none + if phenotype_mapping := phenotype_indx.get(indx): + if value == "1" or value == "1.0": + phenotype_dict[phenotype_mapping]["cases"].add(grid_id) + elif value == "0" or value == "0.0": + phenotype_dict[phenotype_mapping]["controls"].add(grid_id) + # we are going to excluded on values na, n/a, -1, -1. + # 0, "", " " to try to catch different values + elif value.lower() in ["na", "n/a", "-1", "-1.0", " ", ""]: + phenotype_dict[phenotype_mapping]["excluded"].add(grid_id) + else: + logger.warning( + f"The status for individual, {grid_id}, was not recognized. The status found in the file was {value} for phenotype {phenotype_mapping}. This individual will be added to the exclusion list but it is recommended that the user checks to ensure that this is not a typo in the phenotype file." # noqa: E501 + ) + phenotype_dict[phenotype_mapping]["excluded"].add(grid_id) + + logger.debug(f"Phenotype dictionary after adding individuals: {phenotype_dict}") def _create_phenotype_dictionary( self, header_line: str, ) -> Tuple[Dict[str, Dict[str, Set[str]]], Dict[int, str], str]: """Function that will generate a dictionary where the keys are - phenotypes and the values list of cases/exclusions/controls + phenotypes and the values are lists of the cases/exclusions/controls Parameters ---------- @@ -178,6 +190,14 @@ def _create_phenotype_dictionary( a dictionary that maps the index of the phenotype in the header line to the phenotype name. The third element is the separator string + + Raises + ------ + ValueError + This function will raise a Value error if the value 'grid' or 'grids' + (uppercase or lowercase) is not found in the header line. The function + can also raise a value error if the user tries to specify a phenotype + name that is not in the header line. """ # determining what the appropriate separator should be @@ -203,17 +223,41 @@ def _create_phenotype_dictionary( # controls phenotype_dict = {} - # build each dictionary - for indx, phenotype in enumerate(split_line_phenotypes): - phenotype_indx[indx] = phenotype - phenotype_dict[phenotype] = { - "cases": set(), - "controls": set(), - "excluded": set(), - } + # build a dictionary for each phenotype of the cases and the controls. + # If the user has specified a phenotype, we will only add that phenotype + # and we will add its corresponding index to the phenotype_indx dictionary + # This block will catch a value error if the user provides a phenotype name + # that is not in the file + if self.specific_phenotype: + try: + indx = split_line_phenotypes.index(self.specific_phenotype) + except ValueError: + logger.critical( + f"The value {self.specific_phenotype} was not found in one of the phenotype column files. Please make sure you spelled the phenotype name the exact same as it is in the phenotype file." + ) + raise ValueError( + "The value {self.specific_phenotype} was not found in the phenotype file." + ) + else: + phenotype_indx[indx] = self.specific_phenotype + phenotype_dict[self.specific_phenotype] = { + "cases": set(), + "controls": set(), + "excluded": set(), + } + else: + for indx, phenotype in enumerate(split_line_phenotypes): + phenotype_indx[indx] = phenotype + phenotype_dict[phenotype] = { + "cases": set(), + "controls": set(), + "excluded": set(), + } logger.debug(f"Phenotype index dictionary:\n {phenotype_indx}") - logger.debug(f"Phenotype counts dictionary: \n {phenotype_dict}") + logger.debug( + f"Phenotype counts dictionary has {len(phenotype_dict.keys())} PheCodes as keys after creation." + ) return phenotype_dict, phenotype_indx, separator def parse_cases_and_controls( diff --git a/pyproject.toml b/pyproject.toml index cfa35f7..801dd59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "drive-ibd" -version = "2.2.0" +version = "2.3.3" description = "cli tool to identify networks of individuals who share IBD segments overlapping loci of interest" authors = ["James Baker ", "Hung-Hsin Chen ", "Jennifer E. Below "] maintainers = ["James Baker ", "Hung-Hsin Chen "]