Skip to content

Latest commit

 

History

History
547 lines (444 loc) · 20.2 KB

231228_downstream_processing.md

File metadata and controls

547 lines (444 loc) · 20.2 KB

Make metadata

#meta file generator
import gzip
from Bio import SeqIO
import sys
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import pandas as pd

idx="/volumes/seq/projects/wgd/ref/echo_idx.tsv"
meta="/volumes/seq/projects/wgd/231228_RM_WGD_ACT/meta.tsv"

idx=pd.read_table(idx)
met=pd.read_table(meta)

apriori_meta=list()
for index, row in met.iterrows():
        sample=row["sample"]
        i7=row["sample"][:8]
        i5=row["sample"][8:16]
        i7_well=idx.loc[(idx['idx'] == 'i7') & (idx['idx_seq'] == i7)]['well'].values[0]
        i5_well=idx.loc[(idx['idx'] == 'i5') & (idx['idx_seq'] == i5)]['well'].values[0]
        i7_set=idx.loc[(idx['idx'] == 'i7') & (idx['idx_seq'] == i7)]['set'].values[0]
        i5_set=idx.loc[(idx['idx'] == 'i5') & (idx['idx_seq'] == i5)]['set'].values[0]
        if i7_set==1:
                if i5_set==1:
                        plate=1
                else:
                        plate=2
        else:
                if i5_set==1:
                        plate=3
                else:
                        plate=4
        tmp=[sample,plate,i7_well,i5_well]
        apriori_meta.append(tmp)

df = pd.DataFrame(apriori_meta,columns=['sample','plate','row','column'])
df['ploidy_gate'] = ["diploid" if int(row['column']) <= 12 else "aneuploid" for index,row in df.iterrows()]
df['genotype'] = ["WT" if int(row['plate']) <= 2 else "p53ko" for index,row in df.iterrows()]

treatment=list()
for index, row in df.iterrows():
        if row['column'] in ['1','2','13','14']:
                tmp="mon_mpi"
        elif row['column'] in ['3','4','15','16']:
                tmp="dcb"
        elif row['column'] in ['5','6','17','18']:
                tmp="dcb"
        elif row['column'] in ['7','8','19','20']:
                tmp="sp"
        elif row['column'] in ['9','10','21','22']:
                tmp="ro"
        elif row['column'] in ['11','12','23','24']:
                tmp="veh"
        treatment.append(tmp)

df['treatment']=treatment
df.to_csv("/volumes/seq/projects/wgd/231228_RM_WGD_ACT/meta_apriori.csv",index=False)

Plot Heatmap

library(copykit)
library(ggplot2)
library(reshape2)
library(HMMcopy)
setwd("/volumes/seq/projects/wgd/231228_RM_WGD_ACT")
meta_apriori<-read.csv("meta_apriori.csv")
meta<-read.csv("meta.tsv",sep="\t")
dat<-readRDS("scCNA.rds")

row.names(meta_apriori)<-meta_apriori$sample
dat$plate<-meta_apriori[row.names(dat@colData),]$plate
dat$plate_row<-meta_apriori[row.names(dat@colData),]$row
dat$plate_column<-meta_apriori[row.names(dat@colData),]$column
dat$ploidy_gate<-meta_apriori[row.names(dat@colData),]$ploidy_gate
dat$treatment<-meta_apriori[row.names(dat@colData),]$treatment
dat$treatment_gate<-paste(dat$treatment,dat$ploidy_gate,sep="_")


dat$genotype<-"wt"
colData(dat)[colData(dat)$plate %in% c("3","4"),]$genotype<-"p53ko"

# Plot a copy number heatmap with clustering annotation
pdf("subclone.heatmap.expanded.pdf")
plotHeatmap(dat, label = c('treatment_gate','subclones',"plate","ploidy_gate","treatment"),group = 'treatment_gate',order="hclust",row_split="treatment")
dev.off()


dat <- calcInteger(dat, method = 'scquantum', assay = 'smoothed_bincounts')

pdf("/volumes/seq/projects/wgd/231228_RM_WGD_ACT/subclone.ploidy.pdf")
plotMetrics(dat, metric = 'ploidy', label = 'ploidy_score')
dev.off()

saveRDS(dat,"scCNA.rds")

Output for PERT

library(copykit)
library(ggplot2)
library(reshape2)
library(HMMcopy)
setwd("/volumes/seq/projects/wgd/231228_RM_WGD_ACT")
dat<-readRDS("scCNA.rds")


#cell metadata
out_meta<-as.data.frame(dat@colData[c("sample","plate","plate_row","plate_column","ploidy_gate","treatment","genotype","reads_assigned_bins")])
colnames(out_meta)<-c("cell_id","plate","plate_row","plate_column","ploidy_gate","treatment","genotype","reads_assigned_bins")

#per bin info
range_dat<-data.frame(chr=dat@rowRanges@seqnames,
        start=dat@rowRanges@ranges@start,
        end=dat@rowRanges@ranges@start+dat@rowRanges@ranges@width,
        gc=dat@rowRanges$gc_content)
range_dat$bin_id<-1:nrow(range_dat)

out_dat<-cbind(range_dat,dat@assays@data$bincounts)
out_dat_counts<-melt(dat@assays@data$bincounts)
out_dat_counts$bin_id<-1:nrow(range_dat)
out_dat<-merge(range_dat,out_dat_counts,by="bin_id")

out_dat_state<-melt(dat@assays@data$integer)
out_dat_state$bin_id<-1:nrow(range_dat)
out_dat<-cbind(out_dat,out_dat_state$value)

colnames(out_dat)<-c("bin_id","chr","start","end","gc","cell_id","reads","state")
out_dat<-merge(out_dat,out_meta,by="cell_id")

cn_s_p53ko=out_dat[(out_dat$genotype=="p53ko") & (out_dat$ploidy_gate=="aneuploid"),]
cn_g1_p53ko=out_dat[(out_dat$genotype=="p53ko") & (out_dat$ploidy_gate=="diploid"),]

cn_s_wt=out_dat[(out_dat$genotype=="wt") & (out_dat$ploidy_gate=="aneuploid"),]
cn_g1_wt=out_dat[(out_dat$genotype=="wt") & (out_dat$ploidy_gate=="diploid"),]

write.table(cn_s_p53ko,file="~/s_phase_cells_p53ko_hmmcopy_trimmed.csv",sep=",")
system("gzip -f ~/s_phase_cells_p53ko_hmmcopy_trimmed.csv")

write.table(cn_g1_p53ko,file="~/g1_phase_cells_p53ko_hmmcopy_trimmed.csv",sep=",")
system("gzip -f ~/g1_phase_cells_p53ko_hmmcopy_trimmed.csv")


write.table(cn_s_wt,file="~/s_phase_cells_wt_hmmcopy_trimmed.csv",sep=",")
system("gzip -f ~/s_phase_cells_wt_hmmcopy_trimmed.csv")

write.table(cn_g1_wt,file="~/g1_phase_cells_wt_hmmcopy_trimmed.csv",sep=",")
system("gzip -f ~/g1_phase_cells_wt_hmmcopy_trimmed.csv")

More heatmap plots

library(copykit)
library(ggplot2)
library(reshape2)
library(HMMcopy)
setwd("/volumes/seq/projects/wgd/231228_RM_WGD_ACT")
meta_apriori<-read.csv("meta_apriori.csv")
meta<-read.csv("meta.tsv",sep="\t")
dat<-readRDS("scCNA.rds")


dat <- calcInteger(dat, method = 'scquantum', assay = 'smoothed_bincounts')
dat$genotype<-"wt"
colData(dat)[colData(dat)$plate %in% c("3","4"),]$genotype<-"p53ko"

met<-as.data.frame(colData(dat)) 
met$treatment<-factor(met$treatment,levels=c("veh","ro","sp","mon_mpi","dcb"))

plt<-ggplot(met,aes(y=ploidy,x=treatment,color=ploidy_score))+geom_jitter()+theme_minimal()+facet_wrap(.~genotype,ncol=1)
ggsave(plt,file="test.pdf")

pdf("/volumes/seq/projects/wgd/231228_RM_WGD_ACT/subclone.ploidy.pdf")
plotMetrics(dat, metric = 'ploidy', label = 'ploidy_score')
dev.off()

dat$genotype<-"wt"
colData(dat)[colData(dat)$plate %in% c("3","4"),]$genotype<-"p53ko"
dat_sub<-dat[,colData(dat)$treatment %in% c("veh","ro")]

# Plot a copy number heatmap with clustering annotation
pdf("subclone.heatmap.expanded.pdf")
plotHeatmap(dat_sub, label = c("ploidy_gate","treatment","genotype"),group = 'ploidy_gate',order="hclust",row_split="genotype")
dev.off()

# Plot a copy number heatmap with clustering annotation
pdf("subclone.heatmap.expanded.pdf")
plotHeatmap(dat, label = c("ploidy_gate","treatment","genotype"),group = 'ploidy_gate',order="hclust",row_split="genotype")
dev.off()


met<-as.data.frame(colData(dat))
plt<-ggplot(met,aes(x=percentage_duplicates,y=log10(reads_assigned_bins),color=ploidy_gate,shape=treatment))+geom_point()+theme_minimal()
ggsave(plt,file="test.pdf")

Set up PERT https://github.com/shahcompbio/scdna_replication_tools

cd ~/singularity
singularity pull docker://adamcweiner/scdna_replication_tools:main
singularity shell ~/singularity/scdna_replication_tools_main.sif
#also clone repo to scdna_replication_tools-main.zip

https://github.com/shahcompbio/scdna_replication_tools/blob/main/notebooks/inference_tutorial_pt2.ipynb

singularity shell ~/singularity/scdna_replication_tools_main.sif

import sys
sys.path.append('/volumes/USR2/Ryan/tools/scdna_replication_tools') #add this path as a module
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scdna_replication_tools.infer_scRT import scRT
from scdna_replication_tools.plot_pert_output import plot_model_results
from scdna_replication_tools.compute_ccc_features import compute_ccc_features
from scdna_replication_tools.predict_cycle_phase import predict_cycle_phase

from scgenome import refgenome



# load the simulated data from the paper
# this corresponds to the data seen in simulated dataset P8.2 which has subclonal and cell-specific CNAs
#cn_s = pd.read_csv('/volumes/USR2/Ryan/tools/scdna_replication_tools/data/P8.2/s_phase_cells_hmmcopy_trimmed.csv.gz', dtype={'chr': str})
#cn_g1 = pd.read_csv('/volumes/USR2/Ryan/tools/scdna_replication_tools/data/P8.2/g1_phase_cells_hmmcopy_trimmed.csv.gz', dtype={'chr': str})

#load data
cn_s = pd.read_csv('~/s_phase_cells_p53ko_hmmcopy_trimmed.csv.gz', dtype={'chr': str})
cn_g1 = pd.read_csv('~/g1_phase_cells_p53ko_hmmcopy_trimmed.csv.gz', dtype={'chr': str})

# add the replication columns for the G1-phase cells
cn_g1['true_rep'] = 0.0
cn_g1['true_p_rep'] = 0.0
cn_g1['true_t'] = 1.0
cn_g1['true_G1_state']=1.0

cn_s['true_rep'] = 0.0
cn_s['true_p_rep'] = 0.0
cn_s['true_t'] = 1.0
cn_s['true_G1_state']=0.0

cn_g1['true_reads_norm']=(cn_g1['reads']/cn_g1['reads_assigned_bins'])*1000000
cn_g1['clone_id']=cn_g1['treatment']
cn_g1['chr']=[i[3:] for i in cn_g1['chr']]
cn_s['true_reads_norm']=(cn_s['reads']/cn_s['reads_assigned_bins'])*1000000
cn_s['clone_id']=cn_s['treatment']
cn_s['chr']=[i[3:] for i in cn_s['chr']]

# plot the true simulated data
plot_model_results(
    cn_s, cn_g1, 
    clone_col='clone_id', second_sort_col='true_t', 
    input_cn_col='state', output_cn_col='true_G1_state', 
    output_rep_col='true_rep', rpm_col='true_reads_norm',
    rpm_title='Read depth', input_cn_title='HMMcopy states',
    output_cn_title='True somatic CN states', rep_title='True replication states'
)
plt.show()
plt.savefig("pert_g2m_s.png")

# note the true cell cycle phase of each cell and merge all the cells into one dataframe
cn_s['true_phase'] = 'S'
cn_g1['true_phase'] = 'G1/2'

cn = pd.concat([cn_s, cn_g1], ignore_index=True)
cn.head()
# compute per-cell features for all cells
cn, cell_features = compute_ccc_features(
    cn, 
    rpm_col='true_reads_norm', 
    cn_col='state',
    clone_col='treatment', 
    madn_col='madn', 
    lrs_col='lrs',
    num_reads_col='reads_assigned_bins' 
)
cell_features.head()
cn.head()

# merge the true_phase information back into the cell_features dataframe
cell_features = cell_features.merge(cn[['cell_id', 'true_phase']].drop_duplicates(), on='cell_id')
cell_features.head()

# split the data so cells with the 400 highest corrected breakpoints are in the G1/2-phase dataset
# sort the cells by corrected breakpoints
cell_features = cell_features.sort_values('corrected_breakpoints', ascending=False)
# get the cell_ids for the top 400 cells and the threshold for the 401st cell
top_400_cell_ids = list(cell_features['cell_id'].iloc[:405].values)
thresh = cell_features['corrected_breakpoints'].iloc[405]

# look at distribution of corrected breakpoints to set a threshold for identifying the high-confidence G1/2-phase cells
sns.histplot(cell_features['corrected_breakpoints'], kde=True, bins=100)
# plot a vertical line at the threshold
plt.axvline(thresh, color='red', linestyle='--')
plt.show()
plt.savefig("pert_g2m_s2.png")

# # split the data into G1/2-phase and S-phase
cn_s_input = cn[cn['true_G1_state'] == 0.0].reset_index(drop=True)
cn_g_input = cn[cn['true_G1_state'] == 1.0].reset_index(drop=True)
cn_g_input.cell_id.unique().shape

cn_s_input['library_id']=cn_s_input['treatment']+"_"+cn_s_input['ploidy_gate']
cn_g_input['library_id']=cn_g_input['treatment']+"_"+cn_g_input['ploidy_gate']

cn_s_input['copy']=cn_s_input['state']
cn_g_input['copy']=cn_g_input['state']

# temporarily remove columns that don't get used by PERT
temp_cn_s = cn_s_input[['cell_id', 'chr', 'start', 'end', 'gc', 'state','copy','library_id', 'true_reads_norm']]
temp_cn_g1 = cn_g_input[['cell_id', 'chr', 'start', 'end', 'gc', 'state', 'copy','library_id', 'true_reads_norm']]

print('number of cells in S-phase: ', len(temp_cn_s['cell_id'].unique()))
print('number of cells in G1-phase: ', len(temp_cn_s['cell_id'].unique()))
print('number of loci in S-phase: ', len(temp_cn_g1[['chr', 'start']].drop_duplicates()))
print('number of loci in G1-phase: ', len(temp_cn_g1[['chr', 'start']].drop_duplicates()))


# create scRT object with input columns denoted
scrt = scRT(temp_cn_s, temp_cn_g1, 
        input_col='true_reads_norm', 
        clone_col='library_id', 
        assign_col='copy', 
        rt_prior_col=None,
        cn_state_col='state', 
        gc_col='gc', 
        cn_prior_method='g1_composite', 
        max_iter=300)

# run inference using PERT
cn_s_with_scrt, supp_s_output, cn_g_with_scrt, supp_g_output = scrt.infer(level='pyro')
cn_s_with_scrt.head()

cn_g_with_scrt.head()

# plot the loss curve for 'loss_s' and 'loss_g' param values in the supp_s_output and supp_g_output dataframes
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
sns.lineplot(data=supp_s_output.query("param=='loss_g'"), x='level', y='value', ax=ax[0])
sns.lineplot(data=supp_s_output.query("param=='loss_s'"), x='level', y='value', ax=ax[1])
sns.lineplot(data=supp_g_output.query("param=='loss_s'"), x='level', y='value', ax=ax[2])

for i in range(3):
    ax[i].set_xlabel('Iteration')
    ax[i].set_ylabel('Loss')
    ax[i].set_title('Loss curve PERT Step {}'.format(i+1))

plt.show()
plt.savefig("tutorial2_true_sim_losscurve.png")

cn_s_with_scrt['clone_id']=cn_s['treatment']
cn_g_with_scrt['clone_id']=cn_g1['treatment']

# plot the results of the inference
plot_model_results(
    cn_s_with_scrt, cn_g_with_scrt, clone_col='library_id', second_sort_col='model_tau',
    input_cn_col='state', output_cn_col='model_cn_state', output_rep_col='model_rep_state',
    rpm_col='true_reads_norm', rpm_title='Read depth', input_cn_title='HMMcopy states',
    output_cn_title='PERT CN states', rep_title='PERT replication states',
    top_title_prefix='Input set: unknown', bottom_title_prefix='Input set: high-confidence G1/2-phase'
)
plt.show()
plt.savefig("tutorial2_true_sim_losscurve.png")

# concatenate the two dataframes containing PERT output
cn_out = pd.concat([cn_s_with_scrt, cn_g_with_scrt], ignore_index=True)

# predict the cycle phase for each clone based on the PERT output with default parameters
cn_s_out, cn_g_out, cn_lq_out = predict_cycle_phase(
    cn_out, frac_rt_col='cell_frac_rep', rep_state_col='model_rep_state', 
    cn_state_col='model_cn_state', rpm_col='true_reads_norm'
)

cn_s_out.cell_id.unique().shape, cn_g_out.cell_id.unique().shape, cn_lq_out.cell_id.unique().shape

# plot the results of the inference, now sorted by the PERT predicted phase
plot_model_results(
    cn_s_out, cn_g_out, clone_col='library_id', second_sort_col='model_tau',
    input_cn_col='state', output_cn_col='model_cn_state', output_rep_col='model_rep_state',
    rpm_col='true_reads_norm', rpm_title='Read depth', input_cn_title='HMMcopy states',
    output_cn_title='PERT xCN states', rep_title='PERT replication states',
    top_title_prefix='PERT: S-phase', bottom_title_prefix='PERT: G1/2-phase'
)
plt.show()
plt.savefig("tutorial2_true_sim_inferreddat.png")

# show what the PERT predicted low-quality cells look like in contrast to the predicted S-phase cells
plot_model_results(
    cn_s_out, cn_lq_out, clone_col='library_id', second_sort_col='model_tau',
    input_cn_col='state', output_cn_col='model_cn_state', output_rep_col='model_rep_state',
    rpm_col='true_reads_norm', rpm_title='Read depth', input_cn_title='HMMcopy states',
    output_cn_title='PERT CN states', rep_title='PERT replication states',
    top_title_prefix='PERT: S-phase', bottom_title_prefix='PERT: LQ'
)
plt.show()
plt.savefig("tutorial2_true_sim_inferreddat_lowquality.png")


# merge the true time in S-phase with the PERT cell_frac_rep output for all cells
output_cell_times = pd.concat([
    cn_s_out[['cell_id', 'cell_frac_rep']].drop_duplicates(), 
    cn_g_out[['cell_id', 'cell_frac_rep']].drop_duplicates(), 
    cn_lq_out[['cell_id', 'cell_frac_rep']].drop_duplicates()
    ], ignore_index=True
)

true_cell_times = pd.concat([
    cn_s[['cell_id', 'true_t', 'true_phase']].drop_duplicates(), 
    cn_g1[['cell_id', 'true_t', 'true_phase']].drop_duplicates(), 
    ], ignore_index=True
)

cell_times = pd.merge(true_cell_times, output_cell_times, on='cell_id')

print(cell_times.shape)

cell_times.head()

sns.jointplot(data=cell_times, x='true_t', y='cell_frac_rep', hue='true_phase', alpha=0.1)
plt.xlabel('True time in S-phase')
plt.ylabel('PERT predicted time in S-phase')
plt.show()
plt.savefig("tutorial2_true_sim_inferreddat_dotplot.png")