Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

generating results for a sequential modeling pipeline (to compare to our current pipeline) #132

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions yeastdnnexplorer/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ def run_lasso_bootstrap(args: argparse.Namespace) -> None:
bootstrap_alphas.to_csv(bootstrap_alphas_path, index=False)


# 12/1: new attempt


def find_interactors_workflow(args: argparse.Namespace) -> None:
"""
Run the find_interactors_workflow with the specified arguments.
Expand Down Expand Up @@ -245,6 +248,10 @@ def find_interactors_workflow(args: argparse.Namespace) -> None:
}

# Save the results from bootstrapping for further analysis
# also perform the sequential processing where we take the
# results from "all" and call get_significant_predictors
# using only the TFs from these results on the top x% of data
# Sequential analysis
if args.method == "bootstrap_lassocv":
# Iterate through "all" and "top"
for suffix, lasso_results in lasso_res.items():
Expand All @@ -260,6 +267,60 @@ def find_interactors_workflow(args: argparse.Namespace) -> None:
output_dirpath, f"bootstrap_coef_df_{suffix}.csv"
)
bootstrap_coef_df.to_csv(bootstrap_coef_df_path, index=False)
# Get significant predictors from "all" results
assert isinstance(lasso_res["all"]["sig_coefs"], dict)
significant_predictors_all = list(lasso_res["all"]["sig_coefs"].keys())

# Build the formula using these predictors
response_variable = f"{args.response_tf}_LRR"
predictor_variable = re.sub(r"_rep\\d+", "", args.response_tf)

# Ensure the main effect of the perturbed TF is included
formula_terms = significant_predictors_all.copy()
if predictor_variable not in formula_terms:
formula_terms.insert(0, predictor_variable)

# Build the formula string
formula = f"{response_variable} ~ {' + '.join(formula_terms)}"

# Add '-1' to the formula if drop_intercept is True
formula += " - 1"

# Run get_significant_predictors with the custom formula
sequential_lasso_res = get_significant_predictors(
args.method,
args.response_tf,
response_df,
predictors_df,
ci_percentile=args.top_ci_percentile,
n_bootstraps=args.n_bootstraps,
add_max_lrb=True,
quantile_threshold=args.data_quantile,
formula=formula, # Pass the formula via kwargs
)

# Rest of the code remains the same...
# Store the results under a new directory
sequential_output_dir = os.path.join(
output_dirpath, "sequential_top_genes_results"
Copy link
Member

@cmatKhan cmatKhan Dec 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in your results_20241202, I see this pattern:

results_20241202/ACE2/ACE2/

Where there is a directory with the same name nested under the other. As far as I know, this wasn't happening before. It isn't immediately clear to me that it is in this line that this occurs, but nowhere else stands out.

On a related note: somewhere in the docs, in the cmd line --help, or even as a README that gets deposited in every results otuput directory, there needs to be a definition of what "sequential_top_genes_results" are.

I agree with you that keeping this in the main workflow for now (meaning, not creating another cmd line option to turn this feature on and off) is a good idea. It just needs to be documented so that it is clear what these results are.

Copy link
Member

@cmatKhan cmatKhan Dec 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other than this, it looks good to me in terms of format. I haven't pulled and run this, and __main__ is where my testing is not strict. My strategy is to implement the logic elsewhere, and do only very simple things that I am quite certain of in __main__ (not a strong strategy). Please do make sure that you go over your added code carefully. Play with it in a console/notebook if you need to be sure of something.

)
os.makedirs(sequential_output_dir, exist_ok=True)

# Save ci_dict and bootstrap_coef_df from sequential_lasso_res
ci_dict_seq = sequential_lasso_res["bootstrap_lasso_output"][0]
ci_dict_seq_path = os.path.join(
sequential_output_dir, "ci_dict_sequential.json"
)
with open(ci_dict_seq_path, "w") as f:
json.dump(ci_dict_seq, f, indent=4)

bootstrap_coef_df_seq = pd.DataFrame(
sequential_lasso_res["bootstrap_lasso_output"][1]
)
bootstrap_coef_df_seq_path = os.path.join(
sequential_output_dir, "bootstrap_coef_df_sequential.csv"
)
bootstrap_coef_df_seq.to_csv(bootstrap_coef_df_seq_path, index=False)

# Ensure lasso_res["all"]["sig_coefs"] and
# lasso_res["top"]["sig_coefs"] are dictionaries
Expand Down Expand Up @@ -400,6 +461,99 @@ def find_interactors_workflow(args: argparse.Namespace) -> None:
with open(output_path, "w") as f:
json.dump(output_dict, f, indent=4)

# If bootstrap_lassocv is the chosen method, need to repeat steps 3 and 4 for the
# sequential results
if args.method == "bootstrap_lassocv":
# Step 3 and 4 for the sequential method

# Get the significant coefficients from the sequential results
sequential_sig_coefs = sequential_lasso_res["sig_coefs"]
assert isinstance(sequential_sig_coefs, dict)
sequential_final_features = set(sequential_sig_coefs.keys())

# Save the sequential significant coefficients as a dictionary
intersection_seq_path = os.path.join(
sequential_output_dir, "intersection_sequential.json"
)
with open(intersection_seq_path, "w") as f:
json.dump(list(sequential_final_features), f, indent=4)

# Get main effects
main_effects_seq = []
for term in sequential_final_features:
if ":" in term:
main_effects_seq.append(term.split(":")[1])
else:
main_effects_seq.append(term)

# Combine main effects with final features
interactor_terms_and_main_effects_seq = (
list(sequential_final_features) + main_effects_seq
)

# Generate model matrix
_, full_X_seq = generate_modeling_data(
args.response_tf,
response_df,
predictors_df,
formula=f"~ {' + '.join(interactor_terms_and_main_effects_seq)}",
drop_intercept=False,
)

# Add max_lrb column
full_X_seq["max_lrb"] = max_lrb

# Use the response and classes from the "all" data (NOT sequential data)
response_seq = lasso_res["all"]["response"]
classes_seq = lasso_res["all"]["classes"]

# Step 3: Get interactor importance using "all" data
(
full_avg_rsquared_seq,
interactor_results_seq,
) = get_interactor_importance(
response_seq,
full_X_seq,
classes_seq,
sequential_final_features,
)

# Update final features based on interactor results (no change needed here)
for interactor_variant in interactor_results_seq:
k = interactor_variant["interactor"]
v = interactor_variant["variant"]
sequential_final_features.remove(k)
sequential_final_features.add(v)

# Step 4: Compare results of final model vs. a univariate model on "all" data
assert isinstance(lasso_res["all"]["predictors"], pd.DataFrame)
avg_r2_univariate_seq = stratified_cv_r2(
response_seq,
lasso_res["all"]["predictors"][[model_tf]],
classes_seq,
)

final_model_avg_r_squared_seq = stratified_cv_r2(
response_seq,
full_X_seq[list(sequential_final_features)],
classes_seq,
)

# Prepare output dictionary
output_dict_seq = {
"response_tf": args.response_tf,
"final_features": list(sequential_final_features),
"avg_r2_univariate": avg_r2_univariate_seq,
"final_model_avg_r_squared": final_model_avg_r_squared_seq,
}

# Save the output
output_path_seq = os.path.join(
sequential_output_dir, "final_output_sequential.json"
)
with open(output_path_seq, "w") as f:
json.dump(output_dict_seq, f, indent=4)


class CustomHelpFormatter(argparse.HelpFormatter):
"""Custom help formatter to format the subcommands and general options sections."""
Expand Down
1 change: 1 addition & 0 deletions yeastdnnexplorer/ml_models/lasso_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,7 @@ def get_significant_predictors(
predictors_df,
quantile_threshold=kwargs.get("quantile_threshold", None),
drop_intercept=True,
formula=kwargs.get("formula", None), # Pass formula from kwargs
)

# NOTE: fit_intercept is set to `true`
Expand Down
Loading