Added geneset_plot_multi in ov.bulk to visualize the multi result…

…s of enrichment. #103
Starlitnightly · Jul 16, 2024 · 6f3f4cb · 6f3f4cb
1 parent f8d8006
commit 6f3f4cb
Show file tree

Hide file tree

Showing 7 changed files with 649 additions and 270 deletions.
diff --git a/omicverse/bulk/_Enrichment.py b/omicverse/bulk/_Enrichment.py
@@ -271,6 +271,89 @@ def geneset_enrichment_GSEA(gene_rnk:pd.DataFrame,pathways_dict:dict,
     enrich_res['P-value']=enrich_res['fdr']
     return enrich_res
 
+def geneset_plot_multi(enr_dict,colors_dict,num:int=5,fontsize=10,
+                        fig_title:str='',fig_xlabel:str='Fractions of genes',
+                        figsize:tuple=(2,4),cmap:str='YlGnBu',
+                        text_knock:int=5,text_maxsize:int=20,ax=None,
+                        ):
+    """
+    Plot the top enriched terms of several enrichment results as one dot heatmap.
+
+    The first `num` rows of every result are stacked, split by result name and
+    drawn with PyComplexHeatmap's `DotClustermapPlotter` using the columns
+    `fraction` (x), `Term1` (y), `logp` (value and `c`) and `num` (`s`).
+
+    Arguments:
+        enr_dict: A dictionary mapping a result name to its enrichment table. Each
+            table must provide the columns `Term`, `fraction`, `logp` and `num`.
+            The tables are not modified.
+        colors_dict: A dictionary mapping each key of `enr_dict` to a color.
+        num: The number of leading rows taken from each table. Default is 5.
+        fontsize: The fontsize of the plot. Default is 10.
+        fig_title: Accepted for interface compatibility; currently not drawn.
+        fig_xlabel: The label of the x-axis. Default is 'Fractions of genes'.
+        figsize: The size of the figure created when `ax` is None. Default is (2,4).
+        cmap: The colormap to use for the plot. Default is 'YlGnBu'.
+        text_knock: Forwarded to `ov.utils.plot_text_set` for each term name. Default is 5.
+        text_maxsize: Forwarded to `ov.utils.plot_text_set` for each term name. Default is 20.
+        ax: A matplotlib.axes.Axes object to draw on. A new figure is created if None.
+
+    Returns:
+        ax: The axes that was passed in, or the newly created one.
+
+    """
+    from PyComplexHeatmap import HeatmapAnnotation,DotClustermapPlotter,anno_label,anno_simple,AnnotationBase
+    # Tag copies of the top rows so the caller's tables keep their original columns.
+    enr_all=pd.concat([enr_dict[key].iloc[:num].assign(Type=key) for key in enr_dict],axis=0)
+    # Keep only the text before the first '(' and format it with plot_text_set.
+    enr_all['Term']=[ov.utils.plot_text_set(i.split('(')[0],text_knock=text_knock,text_maxsize=text_maxsize) for i in enr_all.Term.tolist()]
+    enr_all.index=enr_all.Term
+    enr_all['Term1']=enr_all.index.tolist()
+    del enr_all['Term']
+
+    colors=colors_dict
+
+    left_ha = HeatmapAnnotation(
+                          label=anno_label(enr_all.Type, merge=True,rotation=0,colors=colors,relpos=(1,0.8)),
+                          Category=anno_simple(enr_all.Type,cmap='Set1',
+                                           add_text=False,legend=False,colors=colors),
+                           axis=0,verbose=0,label_kws={'rotation':45,'horizontalalignment':'left','visible':False})
+    right_ha = HeatmapAnnotation(
+                              label=anno_label(enr_all.Term1, merge=True,rotation=0,relpos=(0,0.5),arrowprops=dict(visible=True),
+                                               colors=enr_all.assign(color=enr_all.Type.map(colors)).set_index('Term1').color.to_dict(),
+                                              fontsize=fontsize,luminance=0.8,height=2),
+                               axis=0,verbose=0,
+                                orientation='right')
+    if ax is None:
+        fig, ax = plt.subplots(figsize=figsize)
+    # Make `ax` (and its figure) current, because the plotter is not handed `ax` explicitly.
+    # NOTE(review): assumes the plotter draws on the current axes - confirm against PyComplexHeatmap.
+    plt.sca(ax)
+    cm = DotClustermapPlotter(data=enr_all, x='fraction',y='Term1',value='logp',c='logp',s='num',
+                              cmap=cmap,
+                              row_cluster=True,
+                              vmin=-1*np.log10(0.1),vmax=-1*np.log10(1e-10),
+                              show_rownames=True,show_colnames=False,row_dendrogram=False,
+                              col_names_side='top',row_names_side='right',
+                              xticklabels_kws={'labelrotation': 30, 'labelcolor': 'blue','labelsize':fontsize},
+                              left_annotation=left_ha,right_annotation=right_ha,
+                              spines=False,
+                              row_split=enr_all.Type,
+                              verbose=1,legend_gap=10,
+                              xlabel=fig_xlabel,xlabel_side="bottom",
+                              xlabel_kws=dict(labelpad=8,fontweight='normal',fontsize=fontsize+2),
+                             )
+    # Restyle the axes of the figure; they are recognised by their axis labels.
+    for sub_ax in ax.figure.axes:
+        if sub_ax.get_xlabel() == fig_xlabel:
+            # Axes carrying the x-axis label: hide its grid.
+            sub_ax.grid(False)
+        if sub_ax.get_ylabel() == 'logp':
+            # Assumed to be the colorbar, labelled with the name of the value column.
+            sub_ax.tick_params(labelsize=fontsize+2)
+            sub_ax.set_ylabel(r'$−Log_{10}(P_{adjusted})$',fontsize=fontsize+2)
+            sub_ax.grid(False)
+    return ax
 
 def geneset_plot(enrich_res,num:int=10,node_size:list=[5,10,15],
                         cax_loc:list=[2, 0.55, 0.5, 0.02],cax_fontsize:int=12,

diff --git a/omicverse/bulk/__init__.py b/omicverse/bulk/__init__.py
@@ -6,7 +6,7 @@
 #from Pyomic.bulk.Gene_module import pywgcna
 
 from ._Gene_module import pyWGCNA
-from ._Enrichment import pyGSEA,pyGSE,geneset_enrichment,geneset_plot,geneset_enrichment_GSEA
+from ._Enrichment import pyGSEA,pyGSE,geneset_enrichment,geneset_plot,geneset_enrichment_GSEA,geneset_plot_multi
 from ._network import pyPPI,string_interaction,string_map,generate_G
 from ._chm13 import get_chm13_gene,find_chm13_gene
 from ._Deseq2 import pyDEG,deseq2_normalize,estimateSizeFactors,estimateDispersions,Matrix_ID_mapping,data_drop_duplicates_index

diff --git a/omicverse/pp/_preprocess.py b/omicverse/pp/_preprocess.py
@@ -780,11 +780,31 @@ def leiden(adata, **kwargs):
         rsc.tl.leiden(adata, **kwargs)
 
 
-def score_genes_cell_cycle(adata,s_genes=None, g2m_genes=None):
+def score_genes_cell_cycle(adata,species='human',s_genes=None, g2m_genes=None):
+    """
+    Score cell-cycle genes by calling `sc.tl.score_genes_cell_cycle`.
+
+    Arguments:
+        adata: Annotated data matrix with rows for cells and columns for genes.
+        species: 'human' or 'mouse'; selects the built-in lists used for omitted gene lists.
+        s_genes: The list of S-phase genes. If None, the built-in list for `species` is used.
+        g2m_genes: The list of G2/M-phase genes. If None, the built-in list for `species` is used.
+
+    Raises:
+        ValueError: If a gene list is omitted and `species` has no built-in list.
+    """
+    if (s_genes is None or g2m_genes is None) and species not in ('human','mouse'):
+        raise ValueError("species must be 'human' or 'mouse' when s_genes or g2m_genes is not given")
-    if s_genes==None:
-        s_genes=['MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2', 'MCM6', 'CDCA7', 'DTL', 'PRIM1', 'UHRF1', 'MLF1IP', 'HELLS', 'RFC2', 'RPA2', 'NASP', 'RAD51AP1', 'GMNN', 'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3', 'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45', 'CDC6', 'EXO1', 'TIPIN', 'DSCC1', 'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B', 'BRIP1', 'E2F8']
+    if s_genes is None:
+        if species=='human':
+            s_genes=['MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2', 'MCM6', 'CDCA7', 'DTL', 'PRIM1', 'UHRF1', 'MLF1IP', 'HELLS', 'RFC2', 'RPA2', 'NASP', 'RAD51AP1', 'GMNN', 'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3', 'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45', 'CDC6', 'EXO1', 'TIPIN', 'DSCC1', 'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B', 'BRIP1', 'E2F8']
+        else:
+            s_genes=['Cdca7', 'Mcm4', 'Mcm7', 'Rfc2', 'Ung', 'Mcm6', 'Rrm1', 'Slbp', 'Pcna', 'Atad2', 'Tipin', 'Mcm5', 'Uhrf1', 'Polr1b', 'Dtl', 'Prim1', 'Fen1', 'Hells', 'Gmnn', 'Pold3', 'Nasp', 'Chaf1b', 'Gins2', 'Pola1', 'Msh2', 'Casp8ap2', 'Cdc6', 'Ubr7', 'Ccne2', 'Wdr76', 'Tyms', 'Cdc45', 'Clspn', 'Rrm2', 'Dscc1', 'Rad51', 'Usp1', 'Exo1', 'Blm', 'Rad51ap1', 'Cenpu', 'E2f8', 'Mrpl36']
-    if g2m_genes==None:
-        g2m_genes=['HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80', 'CKS2', 'NUF2', 'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A', 'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2', 'AURKB', 'BUB1', 'KIF11', 'ANP32E', 'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDCA3', 'HN1', 'CDC20', 'TTK', 'CDC25C', 'KIF2C', 'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA2', 'CDCA8', 'ECT2', 'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE', 'CTCF', 'NEK2', 'G2E3', 'GAS2L3', 'CBX5', 'CENPA']
+    if g2m_genes is None:
+        if species=='human':
+            g2m_genes=['HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80', 'CKS2', 'NUF2', 'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A', 'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2', 'AURKB', 'BUB1', 'KIF11', 'ANP32E', 'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDCA3', 'HN1', 'CDC20', 'TTK', 'CDC25C', 'KIF2C', 'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA2', 'CDCA8', 'ECT2', 'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE', 'CTCF', 'NEK2', 'G2E3', 'GAS2L3', 'CBX5', 'CENPA']
+        else:
+            g2m_genes=['Cbx5', 'Aurkb', 'Cks1b', 'Cks2', 'Jpt1', 'Hmgb2', 'Anp32e', 'Lbr', 'Tmpo', 'Top2a', 'Tacc3', 'Tubb4b', 'Ncapd2', 'Rangap1', 'Cdk1', 'Smc4', 'Kif20b', 'Cdca8', 'Ckap2', 'Ndc80', 'Dlgap5', 'Hjurp', 'Ckap5', 'Bub1', 'Ckap2l', 'Ect2', 'Kif11', 'Birc5', 'Cdca2', 'Nuf2', 'Cdca3', 'Nusap1', 'Ttk', 'Aurka', 'Mki67', 'Pimreg', 'Ccnb2', 'Tpx2', 'Hjurp', 'Anln', 'Kif2c', 'Cenpe', 'Gtse1', 'Kif23', 'Cdc20', 'Ube2c', 'Cenpf', 'Cenpa', 'Hmmr', 'Ctcf', 'Psrc1', 'Cdc25c', 'Nek2', 'Gas2l3', 'G2e3']
     sc.tl.score_genes_cell_cycle(adata,s_genes=s_genes, g2m_genes=g2m_genes)
 
 

diff --git a/omicverse/pp/_qc.py b/omicverse/pp/_qc.py
@@ -266,13 +266,13 @@ def qc_cpu(adata:anndata.AnnData, mode='seurat',
             adata.obs['sccomposite_doublet']=0
             adata.obs['sccomposite_consistency']=0
             if batch_key is None:
-                from _sccomposite import composite_rna
+                from ._sccomposite import composite_rna
                 multiplet_classification, consistency = composite_rna(adata)
                 adata.obs['sccomposite_doublet']=multiplet_classification
                 adata.obs['sccomposite_consistency']=consistency
             else:
                 for batch in adata.obs[batch_key].unique():
-                    from _sccomposite import composite_rna
+                    from ._sccomposite import composite_rna
                     adata_batch=adata[adata.obs[batch_key]==batch]
                     multiplet_classification, consistency = composite_rna(adata_batch)
                     adata.obs.loc[adata_batch.obs.index,'sccomposite_doublet']=multiplet_classification
@@ -381,13 +381,13 @@ def qc_gpu(adata, mode='seurat',
             adata.obs['sccomposite_doublet']=0
             adata.obs['sccomposite_consistency']=0
             if batch_key is None:
-                from _sccomposite import composite_rna
+                from ._sccomposite import composite_rna
                 multiplet_classification, consistency = composite_rna(adata)
                 adata.obs['sccomposite_doublet']=multiplet_classification
                 adata.obs['sccomposite_consistency']=consistency
             else:
                 for batch in adata.obs[batch_key].unique():
-                    from _sccomposite import composite_rna
+                    from ._sccomposite import composite_rna
                     adata_batch=adata[adata.obs[batch_key]==batch]
                     multiplet_classification, consistency = composite_rna(adata_batch)
                     adata.obs.loc[adata_batch.obs.index,'sccomposite_doublet']=multiplet_classification

diff --git a/omicverse_guide/docs/Release_notes.md b/omicverse_guide/docs/Release_notes.md
@@ -404,6 +404,7 @@ Support Raw Windows platform
 - Fixed an error of `pyTCGA.survival_analysis` when the matrix is sparse. #62, #68, #95
 - Added tqdm to visualize the process of `pyTCGA.survial_analysis_all`
 - Fixed an error of `data_drop_duplicates_index` with remove duplicate indexes to retain only the highest expressed genes #45
+- Added `geneset_plot_multi` in `ov.bulk` to visualize multiple enrichment results in one plot. #103
 
 ### Single Module
 
@@ -412,3 +413,4 @@ Support Raw Windows platform
 ### PP Module
 - Fixed an error of `ov.pp.pca` when pcs smaller than 13. #102
 - Added `COMPOSITE` in `ov.pp.qc`'s method to predicted doublet cells. #103
+- Added a `species` argument to `score_genes_cell_cycle` so the cell cycle can be scored without manually supplying gene lists.