Skip to content

Commit

Permalink
Merge pull request #6 from belowlab/refactor
Browse files Browse the repository at this point in the history
Refactor
  • Loading branch information
jtb324 authored Feb 23, 2024
2 parents 192dbb4 + d741aab commit 750acb9
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 106 deletions.
158 changes: 84 additions & 74 deletions drive/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,86 +383,96 @@ def redo_clustering(

redo_vs = ibd_vs[ibd_vs.idnum.isin(network.haplotypes)]

# We are going to generate a new Networks object using the redo graph
redo_networks = ClusterHandler.generate_graph(redopd, redo_vs)
# redo_networks = ClusterHandler.generate_graph(redopd)
# performing the random walk
redo_walktrap_clusters = self.random_walk(redo_networks)

# If only one cluster is found
if len(redo_walktrap_clusters.sizes()) == 1:
# creates an empty dataframe with these columns
clst_conn = DataFrame(columns=["idnum", "conn", "conn.N", "TP"])
# iterate over each member id
# for idnum in network.haplotypes:
for idnum in network.members:
conn = sum(
list(
map(
lambda x: 1 / x,
redopd.loc[
(redopd["idnum1"] == idnum)
| (redopd["idnum2"] == idnum)
]["cm"],
# If the redopd or redo_vs is empty it causes strange behavior and the code will
# usually fail. The desired behavior is for the program to tell the user that
# the graph could not be constructed and then for it to move on.
if not redopd.empty and not redo_vs.empty:
# We are going to generate a new Networks object using the redo graph
redo_networks = ClusterHandler.generate_graph(redopd, redo_vs)
# redo_networks = ClusterHandler.generate_graph(redopd)
# performing the random walk
redo_walktrap_clusters = self.random_walk(redo_networks)

# If only one cluster is found
if len(redo_walktrap_clusters.sizes()) == 1:
# creates an empty dataframe with these columns
clst_conn = DataFrame(columns=["idnum", "conn", "conn.N", "TP"])
# iterate over each member id
# for idnum in network.haplotypes:
for idnum in network.members:
conn = sum(
list(
map(
lambda x: 1 / x,
redopd.loc[
(redopd["idnum1"] == idnum)
| (redopd["idnum2"] == idnum)
]["cm"],
)
)
)
conn_idnum = list(
redopd.loc[(redopd["idnum1"] == idnum)]["idnum2"]
) + list(redopd.loc[(redopd["idnum2"] == idnum)]["idnum1"])
conn_tp = len(
redopd.loc[
redopd["idnum1"].isin(conn_idnum)
& redopd["idnum2"].isin(conn_idnum)
].index
)
# assert 1 == 0
if len(conn_idnum) == 1:
connTP = 1
else:
try:
connTP = conn_tp / (
len(conn_idnum) * (len(conn_idnum) - 1) / 2
)
except ZeroDivisionError:
raise ZeroDivisionError(
f"There was a zero division error encountered when looking at the network with the id {idnum}" # noqa: E501
) # noqa: E501

clst_conn.loc[idnum] = [idnum, conn, len(conn_idnum), connTP]
rmID = list(
clst_conn.loc[
(
clst_conn["conn.N"]
> (self.segment_dist_threshold * len(network.members))
)
& (clst_conn["TP"] < self.minimum_connected_thres)
& (
clst_conn["conn"]
> sorted(clst_conn["conn"], reverse=True)[
int(self.hub_threshold * len(network.members))
]
)
]["idnum"]
)
conn_idnum = list(
redopd.loc[(redopd["idnum1"] == idnum)]["idnum2"]
) + list(redopd.loc[(redopd["idnum2"] == idnum)]["idnum1"])
conn_tp = len(
redopd.loc[
redopd["idnum1"].isin(conn_idnum)
& redopd["idnum2"].isin(conn_idnum)
].index

redopd = redopd.loc[
(~redopd["idnum1"].isin(rmID)) & (~redopd["idnum2"].isin(rmID))
]
redo_graph = self.generate_graph(
redopd,
)
# assert 1 == 0
if len(conn_idnum) == 1:
connTP = 1
else:
try:
connTP = conn_tp / (len(conn_idnum) * (len(conn_idnum) - 1) / 2)
except ZeroDivisionError:
raise ZeroDivisionError(
f"There was a zero division error encountered when looking at the network with the id {idnum}" # noqa: E501
) # noqa: E501

clst_conn.loc[idnum] = [idnum, conn, len(conn_idnum), connTP]
rmID = list(
clst_conn.loc[
(
clst_conn["conn.N"]
> (self.segment_dist_threshold * len(network.members))
)
& (clst_conn["TP"] < self.minimum_connected_thres)
& (
clst_conn["conn"]
> sorted(clst_conn["conn"], reverse=True)[
int(self.hub_threshold * len(network.members))
]
)
]["idnum"]
# redo_g = ig.Graph.DataFrame(redopd, directed=False)
redo_walktrap_clusters = self.random_walk(redo_graph)
# redo_walktrap = ig.Graph.community_walktrap(
# redo_g, weights="cm", steps=self.random_walk_step_size
# )
# redo_walktrap_clusters = redo_walktrap.as_clustering()

# Filter to the clusters that are larger than the minimum size
allclst = self.filter_cluster_size(redo_walktrap_clusters.sizes())

self.gather_cluster_info(
redo_networks, allclst, redo_walktrap_clusters, original_id
)

redopd = redopd.loc[
(~redopd["idnum1"].isin(rmID)) & (~redopd["idnum2"].isin(rmID))
]
redo_graph = self.generate_graph(
redopd,
else:
logger.debug(
f"A graph was not able to be generated when we attempted to recluster the network: {original_id}. This error probably indicates that none of the {len(network.haplotypes)} individuals in that specific network shared IBD segments with one another."
)
# redo_g = ig.Graph.DataFrame(redopd, directed=False)
redo_walktrap_clusters = self.random_walk(redo_graph)
# redo_walktrap = ig.Graph.community_walktrap(
# redo_g, weights="cm", steps=self.random_walk_step_size
# )
# redo_walktrap_clusters = redo_walktrap.as_clustering()

# Filter to the clusters that are larger than the minimum size
allclst = self.filter_cluster_size(redo_walktrap_clusters.sizes())

self.gather_cluster_info(
redo_networks, allclst, redo_walktrap_clusters, original_id
)


def cluster(
Expand Down
12 changes: 9 additions & 3 deletions drive/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from datetime import datetime
from pathlib import Path
import argparse
from typing import Optional
from rich_argparse import RichHelpFormatter


Expand Down Expand Up @@ -200,6 +199,13 @@ def main() -> None:
help="Threshold to filter the network length to remove hub individuals. (default: %(default)s)",
)

parser.add_argument(
"--phenotype-name",
default=None,
type=str,
help="If the user wishes to only run 1 phenotype from a file with multiple phenotypes they can prioritize a column that has the phenotype name. The name must match with the column.",
)

parser.add_argument(
"--hub-threshold",
default=0.01,
Expand All @@ -220,7 +226,7 @@ def main() -> None:
type=bool,
default=True,
action=argparse.BooleanOptionalAction,
help="whether or not the user wishes the program to automically recluster based on things lik hub threshold, max network size and how connected the graph is. ", # noqa: E501
help="whether or not the user wishes the program to automically recluster based on things like hub threshold, max network size and how connected the graph is. ", # noqa: E501
)

parser.add_argument(
Expand Down Expand Up @@ -279,7 +285,7 @@ def main() -> None:
# if the user has provided a phenotype file then we will determine case/control/
# exclusion counts. Otherwise we return an empty dictionary
if args.cases:
with PhenotypeFileParser(args.cases) as phenotype_file:
with PhenotypeFileParser(args.cases, args.phenotype_name) as phenotype_file:
phenotype_counts, cohort_ids = phenotype_file.parse_cases_and_controls()

logger.info(
Expand Down
9 changes: 8 additions & 1 deletion drive/filters/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,14 @@ def load_file(
if not ibd_file.is_file():
raise FileNotFoundError(f"The file, {ibd_file}, was not found")

input_file_chunks = read_csv(ibd_file, sep="\t", header=None, chunksize=100_100)
# we need to make sure that the id columns read in as strings no matter what
input_file_chunks = read_csv(
ibd_file,
sep="\t",
header=None,
chunksize=100_100,
dtype={indices.id1_indx: str, indices.id2_indx: str},
)

return cls(input_file_chunks, indices, target_gene)

Expand Down
98 changes: 71 additions & 27 deletions drive/utilities/parser/case_file_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import gzip
from logging import Logger
from pathlib import Path
from typing import Dict, List, Set, Tuple, TypeVar, Union
from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union

from drive.log import CustomLogger

Expand All @@ -18,19 +18,26 @@ class PhenotypeFileParser:
"""Parser used to read in the phenotype file. This will allow use to account for
different delimiters in files as well as catch errors."""

def __init__(self, filepath: Union[Path, str]) -> None:
def __init__(
self, filepath: Union[Path, str], phenotype_name: Optional[str] = None
) -> None:
"""Initialize the PhenotypeFileParser class.
Parameters
----------
filepath : Path | str
filepath to the phenotype file that has case control status for individuals
phenotype_name : str
Phenotype name that can be used to specify a specific column in a
phenotype matrix if the user only wants to focus on 1 phenotype.
Raises
------
FileNotFoundError
"""
self.individuals: List[str] = []
self.specific_phenotype: str = phenotype_name
# we are going to make sure the filepath variable is a
# PosixPath
filepath = Path(filepath)
Expand Down Expand Up @@ -140,28 +147,33 @@ def _determine_status(

# go through each value in the file
for indx, value in enumerate(line[1:]):
phenotype_mapping = phenotype_indx.get(indx)

if value == "1" or value == "1.0":
phenotype_dict[phenotype_mapping]["cases"].add(grid_id)
elif value == "0" or value == "0.0":
phenotype_dict[phenotype_mapping]["controls"].add(grid_id)
# we are going to excluded on values na, n/a, -1, -1.
# 0, "", " " to try to catch different values
elif value.lower() in ["na", "n/a", "-1", "-1.0", " ", ""]:
phenotype_dict[phenotype_mapping]["excluded"].add(grid_id)
else:
logger.warning(
f"The status for individual, {grid_id}, was not recognized. The status found in the file was {value} for phenotype {phenotype_mapping}. This individual will be added to the exclusion list but it is recommended that the user checks to ensure that this is not a typo in the phenotype file." # noqa: E501
)
phenotype_dict[phenotype_mapping]["excluded"].add(grid_id)
# If the user only wants to specify a specific column then there is a
# possibility of the indx number not being in the phenotype indx. We can
# use the walrus operator to make sure that we only attempt to
# determine the case control status for the phenotype if the .get function
# returns a value and not None
if phenotype_mapping := phenotype_indx.get(indx):
if value == "1" or value == "1.0":
phenotype_dict[phenotype_mapping]["cases"].add(grid_id)
elif value == "0" or value == "0.0":
phenotype_dict[phenotype_mapping]["controls"].add(grid_id)
# we are going to exclude on the values na, n/a, -1, -1.
# 0, "", " " to try to catch different values
elif value.lower() in ["na", "n/a", "-1", "-1.0", " ", ""]:
phenotype_dict[phenotype_mapping]["excluded"].add(grid_id)
else:
logger.warning(
f"The status for individual, {grid_id}, was not recognized. The status found in the file was {value} for phenotype {phenotype_mapping}. This individual will be added to the exclusion list but it is recommended that the user checks to ensure that this is not a typo in the phenotype file." # noqa: E501
)
phenotype_dict[phenotype_mapping]["excluded"].add(grid_id)

logger.debug(f"Phenotype dictionary after adding individuals: {phenotype_dict}")

def _create_phenotype_dictionary(
self,
header_line: str,
) -> Tuple[Dict[str, Dict[str, Set[str]]], Dict[int, str], str]:
"""Function that will generate a dictionary where the keys are
phenotypes and the values list of cases/exclusions/controls
phenotypes and the values are lists of the cases/exclusions/controls
Parameters
----------
Expand All @@ -178,6 +190,14 @@ def _create_phenotype_dictionary(
a dictionary that maps the index of the phenotype in the
header line to the phenotype name. The third element is the
separator string
Raises
------
ValueError
This function will raise a Value error if the value 'grid' or 'grids'
(uppercase or lowercase) is not found in the header line. The function
can also raise a value error if the user tries to specify a phenotype
name that is not in the header line.
"""

# determining what the appropriate separator should be
Expand All @@ -203,17 +223,41 @@ def _create_phenotype_dictionary(
# controls
phenotype_dict = {}

# build each dictionary
for indx, phenotype in enumerate(split_line_phenotypes):
phenotype_indx[indx] = phenotype
phenotype_dict[phenotype] = {
"cases": set(),
"controls": set(),
"excluded": set(),
}
# build a dictionary for each phenotype of the cases and the controls.
# If the user has specified a phenotype, we will only add that phenotype
# and we will add its corresponding index to the phenotype_indx dictionary
# This block will catch a value error if the user provides a phenotype name
# that is not in the file
if self.specific_phenotype:
try:
indx = split_line_phenotypes.index(self.specific_phenotype)
except ValueError:
logger.critical(
f"The value {self.specific_phenotype} was not found in the phenotype file's columns. Please make sure the phenotype name is spelled exactly as it appears in the phenotype file."
)
raise ValueError(
f"The value {self.specific_phenotype} was not found in the phenotype file."
)
else:
phenotype_indx[indx] = self.specific_phenotype
phenotype_dict[self.specific_phenotype] = {
"cases": set(),
"controls": set(),
"excluded": set(),
}
else:
for indx, phenotype in enumerate(split_line_phenotypes):
phenotype_indx[indx] = phenotype
phenotype_dict[phenotype] = {
"cases": set(),
"controls": set(),
"excluded": set(),
}

logger.debug(f"Phenotype index dictionary:\n {phenotype_indx}")
logger.debug(f"Phenotype counts dictionary: \n {phenotype_dict}")
logger.debug(
f"Phenotype counts dictionary has {len(phenotype_dict.keys())} PheCodes as keys after creation."
)
return phenotype_dict, phenotype_indx, separator

def parse_cases_and_controls(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
name = "drive-ibd"

version = "2.2.0"
version = "2.3.3"
description = "cli tool to identify networks of individuals who share IBD segments overlapping loci of interest"
authors = ["James Baker <[email protected]>", "Hung-Hsin Chen <[email protected]>", "Jennifer E. Below <[email protected]>"]
maintainers = ["James Baker <[email protected]>", "Hung-Hsin Chen <[email protected]>"]
Expand Down

0 comments on commit 750acb9

Please sign in to comment.