Added gptcelltype in omicverse.single to annotate celltype using large language model #82.
Starlitnightly · May 29, 2024 · bede8fe · bede8fe
1 parent c1f7dc8
commit bede8fe
Show file tree

Hide file tree

Showing 7 changed files with 782 additions and 9 deletions.
diff --git a/omicverse/single/__init__.py b/omicverse/single/__init__.py
@@ -21,4 +21,5 @@
 from ._aucell import aucell
 from ._metacell import MetaCell,plot_metacells,get_obs_value
 from ._mdic3 import pyMDIC3
-from ._cnmf import *
+from ._cnmf import *
+from ._gptcelltype import gptcelltype
diff --git a/omicverse/single/_anno.py b/omicverse/single/_anno.py
@@ -311,7 +311,9 @@ def scanpy_cellanno_from_dict(adata:anndata.AnnData,
 def get_celltype_marker(adata:anndata.AnnData,
                             clustertype:str='leiden',
                             log2fc_min:int=2,scores_type='scores',
-                            pval_cutoff:float=0.05,rank:bool=False)->dict:
+                            pval_cutoff:float=0.05,rank:bool=False,
+                            key='rank_genes_groups',method='wilcoxon',
+                            foldchange=None,topgenenumber=10)->dict:
         r"""Get marker genes for each clusters.
         
         Arguments:
@@ -329,15 +331,19 @@ def get_celltype_marker(adata:anndata.AnnData,
         celltypes = sorted(adata.obs[clustertype].unique())
         cell_marker_dict={}
         if rank==False:
-            sc.tl.rank_genes_groups(adata, clustertype, method='wilcoxon')
+            sc.tl.rank_genes_groups(adata, clustertype, method=method)
         for celltype in celltypes:
-            degs = sc.get.rank_genes_groups_df(adata, group=celltype, key='rank_genes_groups', log2fc_min=log2fc_min, 
+            degs = sc.get.rank_genes_groups_df(adata, group=celltype, key=key, log2fc_min=log2fc_min, 
                                             pval_cutoff=pval_cutoff)
             foldp=np.histogram(degs[scores_type])
-            foldchange=(foldp[1][np.where(foldp[1]>0)[0][-5]]+foldp[1][np.where(foldp[1]>0)[0][-6]])/2
-
-            cellmarker=degs.loc[degs[scores_type]>foldchange]['names'].values
+            if foldchange is None:
+                foldchange=(foldp[1][np.where(foldp[1]>0)[0][-5]]+foldp[1][np.where(foldp[1]>0)[0][-6]])/2
+            cellmarker=degs.loc[degs[scores_type]>foldchange]['names'].values[:topgenenumber]
             cell_marker_dict[celltype]=cellmarker
+
+        for key in cell_marker_dict.keys():
+            cell_marker_dict[key]=list(cell_marker_dict[key])
+
 
         return cell_marker_dict
 

diff --git a/omicverse/single/_gptcelltype.py b/omicverse/single/_gptcelltype.py
@@ -0,0 +1,77 @@
+
+
+def gptcelltype(input, tissuename=None, speciename='human',
+                provider='qwen',model='qwen-plus', topgenenumber=10,
+                base_url=None):
+    """
+    Annotation of cell types using AGI model.
+
+    Arguments:
+        input: dict, input dictionary with clusters as keys and gene markers as values.
+        tissuename: str, tissue name.
+        provider: str, provider of the model. Default: 'qwen', you can select from ['openai','kimi','qwen'] now.
+
+    """
+    from openai import OpenAI
+    import os
+    import numpy as np
+    import pandas as pd
+    if base_url is None:
+        if provider == 'openai':
+            base_url = "https://api.openai.com/v1/"
+        elif provider == 'kimi':
+            base_url = "https://api.moonshot.cn/v1"
+        elif provider == 'qwen':
+            base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+    QWEN_API_KEY = os.getenv("AGI_API_KEY")
+    if QWEN_API_KEY == "":
+        print("Note: AGI API key not found: returning the prompt itself.")
+        API_flag = False
+    else:
+        API_flag = True
+    client = OpenAI(
+        api_key=QWEN_API_KEY, # 如果您没有配置环境变量，请在此处用您的API Key进行替换
+        base_url=base_url,
+    )
+
+    if isinstance(input, dict):
+        input = {k: 'unknown' if not v else ','.join(v[:topgenenumber]) for k, v in input.items()}
+    elif isinstance(input, pd.DataFrame):
+        # Filter genes with positive log fold change and group by cluster, selecting top genes
+        input = input[input['logfoldchanges'] > 0]
+        input = input.groupby('cluster')['names'].apply(lambda x: ','.join(x.iloc[:topgenenumber]))
+    else:
+        raise ValueError("Input must be either a dictionary of lists or a pandas DataFrame.")
+
+    if not API_flag:
+        message = f'Identify cell types of {tissuename} cells in {speciename} using the following markers separately for each row. Only provide the cell type name. Do not show numbers before the name. Do not show numbers before the name. Some can be a mixture of multiple cell types.\n' + '\n'.join([f'{k}: {v}' for k, v in input.items()])
+        return message
+    else:
+        print("Note: AGI API key found: returning the cell type annotations.")
+        cutnum = int(np.ceil(len(input) / 30))
+        if cutnum > 1:
+            cid = np.digitize(range(1, len(input) + 1), bins=np.linspace(1, len(input), cutnum + 1))
+        else:
+            cid = np.ones(len(input), dtype=int)
+
+        allres = {}
+        for i in range(1, cutnum + 1):
+            id_list = [j for j, x in enumerate(cid) if x == i]
+            flag = False
+            while not flag:
+                response = client.chat.completions.create(
+                    model=model,
+                    messages=[{"role": "user", 
+                               "content": f'Identify cell types of {tissuename} cells using the following markers separately for each row. Only provide the cell type name. Do not show numbers before the name. Some can be a mixture of multiple cell types.\n' + '\n'.join([input[list(input.keys())[j]] for j in id_list if input[list(input.keys())[j]] != 'unknown'])}]
+                )
+                #return response
+                res = response.choices[0].message.content.split('\n')
+                if len(res) == len(id_list):
+                    flag = True
+            for idx, cell_type in zip(id_list, res):
+                key = list(input.keys())[idx]
+                allres[key] = 'unknown' if input[key] == 'unknown' else cell_type.strip(',')
+
+
+        print('Note: It is always recommended to check the results returned by GPT-4 in case of AI hallucination, before going to downstream analysis.')
+        return allres
diff --git a/omicverse_guide/docs/Release_notes.md b/omicverse_guide/docs/Release_notes.md
@@ -359,4 +359,5 @@ Move `CEFCON`,`GNTD`,`mofapy2`,`spaceflow`,`spatrio`,`STAligner`,`tosica` from r
 ### single Module
 
 - Added `get_results_rfc` in `omicverse.single.cNMF` to predict the precise cluster in complex scRNA-seq/stRNA-seq
-- - Added `get_results_rfc` in `omicverse.utils.LDA_topic` to predict the precise cluster in complex scRNA-seq/stRNA-seq
+- Added `get_results_rfc` in `omicverse.utils.LDA_topic` to predict the precise cluster in complex scRNA-seq/stRNA-seq
+- Added `gptcelltype` in `omicverse.single` to annotate celltype using large language model #82.
diff --git a/omicverse_guide/docs/Tutorials-single/t_gptanno.ipynb b/omicverse_guide/docs/Tutorials-single/t_gptanno.ipynb
diff --git a/omicverse_guide/mkdocs.yml b/omicverse_guide/mkdocs.yml
@@ -24,6 +24,7 @@ nav:
             - Consensus Non-negative Matrix factorization (cNMF): Tutorials-single/t_cnmf.ipynb
             - Data integration and batch correction: Tutorials-single/t_single_batch.ipynb
         - Annotation: 
+            - Automatic cell type annotation with GPT/Other: Tutorials-single/t_gptanno.ipynb
             - Celltype auto annotation with SCSA: Tutorials-single/t_cellanno.ipynb
             - Celltype auto annotation with MetaTiME: Tutorials-single/t_metatime.ipynb
             - Celltype annotation migration(mapping) with TOSICA: Tutorials-single/t_tosica.ipynb

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "omicverse"
-version = "1.5.10"
+version = "1.6.0"
 description = "OmicVerse: A single pipeline for exploring the entire transcriptome universe"
 readme = "README.md"
 requires-python = ">=3.8"