Skip to content

Commit

Permalink
added sig. figs to metrics, removed CKD data generator, versioned to …
Browse files Browse the repository at this point in the history
…0.1.8a4
  • Loading branch information
lshpaner committed Jul 27, 2024
1 parent 07d74e5 commit d0a097c
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 171 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="kfre",
version="0.1.8a3",
version="0.1.8a4",
author="Leonid Shpaner",
author_email="[email protected]",
description="A Python library for estimating kidney failure risk using the KFRE model developed by Tangri et al.",
Expand Down
2 changes: 1 addition & 1 deletion src/kfre/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.1.8a3"
__version__ = "0.1.8a4"


from .perform_eval import *
Expand Down
193 changes: 24 additions & 169 deletions src/kfre/perform_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def calc_esrd_outcome(
if create_years_col:
# Create a 'years' column based on the duration_col
years_col = "ESRD_duration_years"
df[years_col] = round(df[duration_col] / 365.25)
df[years_col] = df[duration_col] / 365.25

else:
# Use the provided duration_col directly
Expand Down Expand Up @@ -151,6 +151,7 @@ def plot_kfre_metrics(
show_years=[2, 5],
plot_combinations=False,
show_grids=False,
decimal_places=2,
):
"""
Generate the true labels and predicted probabilities for 2-year and 5-year
Expand Down Expand Up @@ -203,6 +204,10 @@ def plot_kfre_metrics(
show_grids : bool, optional
Whether to show grid plots of all combinations. Default is False.
decimal_places : int, optional
Number of decimal places for AUC and AP scores in the plot legends.
Default is 2.
Returns:
-------
tuple (optional)
Expand Down Expand Up @@ -321,7 +326,7 @@ def plot_kfre_metrics(
plt.plot(
fpr,
tpr,
label=f"{n}-variable {outcome} outcome (AUC = {auc_score:.02f})",
label=f"{n}-variable {outcome} outcome (AUC = {auc_score:.{decimal_places}f})",
) # Plot ROC curve
plt.plot(
[0, 1], [0, 1], linestyle="--", color="red"
Expand Down Expand Up @@ -370,7 +375,7 @@ def plot_kfre_metrics(
plt.plot(
recall,
precision,
label=f"{n}-variable {outcome} outcome (AP = {ap_score:.02f})",
label=f"{n}-variable {outcome} outcome (AP = {ap_score:.{decimal_places}f})",
) # Plot PR curve
plt.xlabel("Recall")
plt.ylabel("Precision")
Expand Down Expand Up @@ -418,7 +423,7 @@ def plot_kfre_metrics(
plt.plot(
fpr,
tpr,
label=f"{n}-variable {outcome} outcome (AUC = {auc_score:.02f})",
label=f"{n}-variable {outcome} outcome (AUC = {auc_score:.{decimal_places}f})",
) # Plot ROC curve
plt.plot(
[0, 1], [0, 1], linestyle="--", color="red"
Expand Down Expand Up @@ -465,7 +470,7 @@ def plot_kfre_metrics(
plt.plot(
recall,
precision,
label=f"{n}-variable {outcome} outcome (AP = {ap_score:.02f})",
label=f"{n}-variable {outcome} outcome (AP = {ap_score:.{decimal_places}f})",
) # Plot PR curve
plt.xlabel("Recall")
plt.ylabel("Precision")
Expand Down Expand Up @@ -562,7 +567,12 @@ def plot_kfre_metrics(
################################################################################


def eval_kfre_metrics(df, n_var_list, outcome_years=[2, 5]):
def eval_kfre_metrics(
df,
n_var_list,
outcome_years=[2, 5],
decimal_places=6,
):
"""
Calculate metrics for multiple outcomes and store the results in a DataFrame.
Expand All @@ -581,6 +591,8 @@ def eval_kfre_metrics(df, n_var_list, outcome_years=[2, 5]):
List of variable numbers to consider, e.g., [4, 6, 8].
outcome_years : list of int, optional
List of outcome years to consider, default is [2, 5].
decimal_places : int, optional
Number of decimal places for the calculated metrics. Default is 6.
Returns:
-------
Expand Down Expand Up @@ -662,12 +674,12 @@ def eval_kfre_metrics(df, n_var_list, outcome_years=[2, 5]):

# Create a dictionary to store the calculated metrics
metrics = {
"Precision/PPV": precision,
"Average Precision": average_precision,
"Sensitivity": sensitivity,
"Specificity": specificity,
"AUC ROC": auc_roc,
"Brier Score": brier,
"Precision/PPV": round(precision, decimal_places),
"Average Precision": round(average_precision, decimal_places),
"Sensitivity": round(sensitivity, decimal_places),
"Specificity": round(specificity, decimal_places),
"AUC ROC": round(auc_roc, decimal_places),
"Brier Score": round(brier, decimal_places),
"Outcome": f"{outcome}_{n_var}_var_kfre",
}

Expand All @@ -686,160 +698,3 @@ def eval_kfre_metrics(df, n_var_list, outcome_years=[2, 5]):

# Return the resulting DataFrame containing the performance metrics
return metrics_df_n_var


################################################################################
######################### CKD Random Data Generator ############################
################################################################################


class CKDDataGenerator:
    """
    Generate a DataFrame of random CKD (chronic kidney disease) patient data.

    Calling ``CKDDataGenerator(...)`` does NOT return a generator object:
    ``__new__`` builds the instance, runs ``__init__`` by hand and returns the
    generated ``pd.DataFrame`` (``instance.df``) directly.

    Note that data generation reseeds NumPy's global random state via
    ``np.random.seed``; this is a side effect visible to the caller.

    Parameters:
    ----------
    n_samples : int, optional
        Number of samples generated when ``use_bootstrap`` is True; one of
        them is picked at random and returned. Default is 100.
    n : int, optional
        Number of rows in each generated DataFrame. Default is 1000.
    random_state : int, optional
        Random seed for reproducibility. Default is 42.
    use_bootstrap : bool, optional
        Whether to generate ``n_samples`` DataFrames and return one of them
        (True) or to generate a single DataFrame (False). Default is True.
    ranges : dict, optional
        Mapping of column name to ``(min, max)``. Any column that is omitted
        falls back to the value in ``default_ranges()``.

    Returns:
    -------
    pd.DataFrame
        The generated data, including ESRD status and duration columns.
    """

    def __new__(cls, *args, **kwargs):
        instance = super().__new__(cls)
        # __init__ is called explicitly: Python skips it automatically when
        # __new__ returns something that is not an instance of cls.
        instance.__init__(*args, **kwargs)
        return instance.df

    def __init__(
        self,
        n_samples=100,
        n=1000,
        random_state=42,
        use_bootstrap=True,
        ranges=None,
    ):
        self.n_samples = n_samples
        self.n = n
        self.random_state = random_state
        self.use_bootstrap = use_bootstrap
        self.samples = []
        # Merge caller overrides onto the defaults so that a partial
        # ``ranges`` dict does not raise KeyError for the omitted columns.
        self.ranges = {**self.default_ranges(), **(ranges or {})}
        # Automatically generate data upon initialization
        self.df = self._generate_data()

    def default_ranges(self):
        """
        Return the default ``(min, max)`` range for each ranged column.

        Returns:
        -------
        dict
            Mapping of column name to a ``(min, max)`` tuple.
        """
        return {
            "Age": (18, 100),
            "eGFR-EPI": (2, 120),
            "uACR": (0.1, 2000),
            "Albumin_g_dl": (3.0, 5.5),
            "Phosphorous_mg_dl": (2.5, 5.0),
            "Bicarbonate (mmol/L)": (16, 32),
            "Calcium_mg_dl": (8.5, 10.5),
        }

    def generate_ckd_data(self, n, random_state):
        """
        Generate a DataFrame with random CKD data.

        Parameters:
        ----------
        n : int
            Number of random rows to generate.
        random_state : int
            Random seed for reproducibility.

        Returns:
        -------
        pd.DataFrame
            A DataFrame with random CKD data, including ESRD status and duration.
        """
        # Set random seed for reproducibility (reseeds NumPy's global RNG)
        np.random.seed(random_state)

        def uniform(column):
            # Draw n uniform values from the configured range of `column`.
            low, high = self.ranges[column][0], self.ranges[column][1]
            return np.random.uniform(low, high, size=n)

        # Generate random values that are clinically relevant for CKD patients.
        # The order of the entries fixes the order of the random draws, so it
        # must not change if seeded output is to stay reproducible.
        data = {
            # Age in years; np.random.randint excludes the upper bound
            "Age": np.random.randint(
                self.ranges["Age"][0], self.ranges["Age"][1], size=n
            ),
            "SEX": np.random.choice(["male", "female"], size=n),
            # eGFR in mL/min/1.73 m^2, covering all CKD stages
            "eGFR-EPI": uniform("eGFR-EPI"),
            # uACR in mg/g, covering all CKD stages
            "uACR": uniform("uACR"),
            # Diabetes and hypertension flags are drawn with equal probability
            "Diabetes (1=yes; 0=no)": np.random.choice([0, 1], size=n),
            "Hypertension (1=yes; 0=no)": np.random.choice([0, 1], size=n),
            # Serum albumin in g/dL
            "Albumin_g_dl": uniform("Albumin_g_dl"),
            # Serum phosphorus in mg/dL
            "Phosphorous_mg_dl": uniform("Phosphorous_mg_dl"),
            # Serum bicarbonate
            "Bicarbonate (mmol/L)": uniform("Bicarbonate (mmol/L)"),
            # Serum calcium in mg/dL
            "Calcium_mg_dl": uniform("Calcium_mg_dl"),
        }

        df = pd.DataFrame(data)

        # Ensure no values fall below their specified ranges
        for col in df.columns:
            if col in self.ranges:
                df[col] = df[col].clip(lower=self.ranges[col][0])

        # Define ESRD based on eGFR value: 1 when eGFR is below 15, else 0
        df["ESRD (1=yes; 0=no)"] = (df["eGFR-EPI"] < 15).astype("int64")

        # Create a column with random ESRD duration in years between 0 and 10
        df["ESRD_duration_years"] = np.random.uniform(0, 10, size=n)

        return df

    def bootstrap_ckd_data(self):
        """
        Generate ``n_samples`` CKD DataFrames and store them in ``self.samples``.

        Sample ``i`` is generated with seed ``random_state + i``.

        Parameters:
        ----------
        None

        Returns:
        -------
        None
        """
        np.random.seed(self.random_state)
        self.samples = [
            self.generate_ckd_data(self.n, random_state=self.random_state + i)
            for i in tqdm(range(self.n_samples))
        ]

    def _generate_data(self):
        """
        Generate CKD data, optionally using bootstrapping.

        With ``use_bootstrap`` set, all samples are generated and one of them
        is chosen at random; otherwise a single DataFrame is generated.

        Parameters:
        ----------
        None

        Returns:
        -------
        pd.DataFrame
        """
        if not self.use_bootstrap:
            return self.generate_ckd_data(self.n, self.random_state)

        self.bootstrap_ckd_data()
        return self.samples[np.random.randint(0, self.n_samples)]

0 comments on commit d0a097c

Please sign in to comment.