Skip to content

Commit

Permalink
Added gptcelltype in omicverse.single to annotate celltype using …
Browse files Browse the repository at this point in the history
…large language model #82.
  • Loading branch information
Starlitnightly committed May 29, 2024
1 parent c1f7dc8 commit bede8fe
Show file tree
Hide file tree
Showing 7 changed files with 782 additions and 9 deletions.
3 changes: 2 additions & 1 deletion omicverse/single/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@
from ._aucell import aucell
from ._metacell import MetaCell,plot_metacells,get_obs_value
from ._mdic3 import pyMDIC3
from ._cnmf import *
from ._cnmf import *
from ._gptcelltype import gptcelltype
18 changes: 12 additions & 6 deletions omicverse/single/_anno.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,9 @@ def scanpy_cellanno_from_dict(adata:anndata.AnnData,
def get_celltype_marker(adata:anndata.AnnData,
clustertype:str='leiden',
log2fc_min:int=2,scores_type='scores',
pval_cutoff:float=0.05,rank:bool=False)->dict:
pval_cutoff:float=0.05,rank:bool=False,
key='rank_genes_groups',method='wilcoxon',
foldchange=None,topgenenumber=10)->dict:
r"""Get marker genes for each clusters.
Arguments:
Expand All @@ -329,15 +331,19 @@ def get_celltype_marker(adata:anndata.AnnData,
celltypes = sorted(adata.obs[clustertype].unique())
cell_marker_dict={}
if rank==False:
sc.tl.rank_genes_groups(adata, clustertype, method='wilcoxon')
sc.tl.rank_genes_groups(adata, clustertype, method=method)
for celltype in celltypes:
degs = sc.get.rank_genes_groups_df(adata, group=celltype, key='rank_genes_groups', log2fc_min=log2fc_min,
degs = sc.get.rank_genes_groups_df(adata, group=celltype, key=key, log2fc_min=log2fc_min,
pval_cutoff=pval_cutoff)
foldp=np.histogram(degs[scores_type])
foldchange=(foldp[1][np.where(foldp[1]>0)[0][-5]]+foldp[1][np.where(foldp[1]>0)[0][-6]])/2

cellmarker=degs.loc[degs[scores_type]>foldchange]['names'].values
if foldchange is None:
foldchange=(foldp[1][np.where(foldp[1]>0)[0][-5]]+foldp[1][np.where(foldp[1]>0)[0][-6]])/2
cellmarker=degs.loc[degs[scores_type]>foldchange]['names'].values[:topgenenumber]
cell_marker_dict[celltype]=cellmarker

for key in cell_marker_dict.keys():
cell_marker_dict[key]=list(cell_marker_dict[key])


return cell_marker_dict

Expand Down
77 changes: 77 additions & 0 deletions omicverse/single/_gptcelltype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@


def gptcelltype(input, tissuename=None, speciename='human',
                provider='qwen', model='qwen-plus', topgenenumber=10,
                base_url=None, max_retries=5):
    """
    Annotation of cell types using AGI model.

    The API key is read from the ``AGI_API_KEY`` environment variable. When it
    is unset or empty, no request is made and the prompt text is returned so it
    can be pasted into a chat interface manually.

    Arguments:
        input: dict or pandas.DataFrame. A dictionary with clusters as keys and
            a sequence of marker genes as values, or a DataFrame with the
            columns 'cluster', 'names' and 'logfoldchanges'.
        tissuename: str, tissue name inserted into the prompt.
        speciename: str, species name inserted into the prompt.
        provider: str, provider of the model. Default: 'qwen', you can select
            from ['openai','kimi','qwen'] now. Only used to pick `base_url`
            when `base_url` is None.
        model: str, model name passed to the chat completion endpoint.
        topgenenumber: int, number of leading marker genes kept per cluster.
        base_url: str, endpoint of an OpenAI-compatible API. Overrides
            `provider` when given.
        max_retries: int, maximum number of requests per batch when the model
            does not answer with exactly one line per submitted cluster.

    Returns:
        str: the prompt, when no API key is configured.
        dict: cluster -> cell type, when an API key is configured. Clusters
            without marker genes are reported as 'unknown'.

    Raises:
        ValueError: if `input` is neither a dict nor a DataFrame.
        RuntimeError: if a batch still has a mismatching number of answer
            lines after `max_retries` requests.
    """
    import os
    import numpy as np
    import pandas as pd

    if base_url is None:
        # An unrecognised provider leaves base_url as None, in which case the
        # client falls back to its own default endpoint.
        base_url = {
            'openai': "https://api.openai.com/v1/",
            'kimi': "https://api.moonshot.cn/v1",
            'qwen': "https://dashscope.aliyuncs.com/compatible-mode/v1",
        }.get(provider)

    # os.getenv returns None (not "") for an unset variable, so test
    # truthiness to cover both "unset" and "set but empty".
    api_key = os.getenv("AGI_API_KEY")
    api_available = bool(api_key)
    if not api_available:
        print("Note: AGI API key not found: returning the prompt itself.")

    if isinstance(input, dict):
        # len() instead of truthiness so numpy arrays of genes are accepted.
        markers = {
            k: 'unknown' if v is None or len(v) == 0
            else ','.join(str(g) for g in list(v)[:topgenenumber])
            for k, v in input.items()
        }
    elif isinstance(input, pd.DataFrame):
        # Filter genes with positive log fold change and group by cluster,
        # selecting top genes
        positive = input[input['logfoldchanges'] > 0]
        markers = positive.groupby('cluster')['names'].apply(
            lambda x: ','.join(x.iloc[:topgenenumber])
        ).to_dict()
    else:
        raise ValueError("Input must be either a dictionary of lists or a pandas DataFrame.")

    if not api_available:
        message = f'Identify cell types of {tissuename} cells in {speciename} using the following markers separately for each row. Only provide the cell type name. Do not show numbers before the name. Do not show numbers before the name. Some can be a mixture of multiple cell types.\n' + '\n'.join([f'{k}: {v}' for k, v in markers.items()])
        return message

    # Imported lazily so the prompt-only path works without the openai package.
    from openai import OpenAI
    client = OpenAI(
        api_key=api_key,  # If the environment variable is not configured, replace this with your API key
        base_url=base_url,
    )
    print("Note: AGI API key found: returning the cell type annotations.")

    keys = list(markers)
    # At most 30 clusters per request; array_split balances the batches and,
    # unlike digitize on linspace edges, never drops the last cluster.
    n_batches = int(np.ceil(len(keys) / 30))
    batches = np.array_split(np.arange(len(keys)), n_batches) if n_batches else []
    attempts = max(1, int(max_retries))

    allres = {}
    for batch in batches:
        batch_keys = [keys[i] for i in batch]
        # Only clusters with markers are sent; the reply is matched against
        # exactly these so the line count can actually agree.
        known = [k for k in batch_keys if markers[k] != 'unknown']
        answers = []
        if known:
            prompt = f'Identify cell types of {tissuename} cells in {speciename} using the following markers separately for each row. Only provide the cell type name. Do not show numbers before the name. Some can be a mixture of multiple cell types.\n' + '\n'.join(markers[k] for k in known)
            for _ in range(attempts):
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                )
                content = response.choices[0].message.content or ''
                # Ignore blank lines the model may put between answers.
                answers = [ln.strip() for ln in content.split('\n') if ln.strip()]
                if len(answers) == len(known):
                    break
            else:
                raise RuntimeError(
                    f"The model returned {len(answers)} lines for {len(known)} "
                    f"clusters after {attempts} attempts."
                )
        annotated = dict(zip(known, answers))
        for k in batch_keys:
            allres[k] = annotated[k].strip(',') if k in annotated else 'unknown'

    print('Note: It is always recommended to check the results returned by GPT-4 in case of AI hallucination, before going to downstream analysis.')
    return allres
3 changes: 2 additions & 1 deletion omicverse_guide/docs/Release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -359,4 +359,5 @@ Move `CEFCON`,`GNTD`,`mofapy2`,`spaceflow`,`spatrio`,`STAligner`,`tosica` from r
### single Module

- Added `get_results_rfc` in `omicverse.single.cNMF` to predict the precise cluster in complex scRNA-seq/stRNA-seq
- - Added `get_results_rfc` in `omicverse.utils.LDA_topic` to predict the precise cluster in complex scRNA-seq/stRNA-seq
- Added `get_results_rfc` in `omicverse.utils.LDA_topic` to predict the precise cluster in complex scRNA-seq/stRNA-seq
- Added `gptcelltype` in `omicverse.single` to annotate celltype using large language model #82.
687 changes: 687 additions & 0 deletions omicverse_guide/docs/Tutorials-single/t_gptanno.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions omicverse_guide/mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ nav:
- Consensus Non-negative Matrix factorization (cNMF): Tutorials-single/t_cnmf.ipynb
- Data integration and batch correction: Tutorials-single/t_single_batch.ipynb
- Annotation:
- Automatic cell type annotation with GPT/Other: Tutorials-single/t_gptanno.ipynb
- Celltype auto annotation with SCSA: Tutorials-single/t_cellanno.ipynb
- Celltype auto annotation with MetaTiME: Tutorials-single/t_metatime.ipynb
- Celltype annotation migration(mapping) with TOSICA: Tutorials-single/t_tosica.ipynb
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "flit_core.buildapi"

[project]
name = "omicverse"
version = "1.5.10"
version = "1.6.0"
description = "OmicVerse: A single pipeline for exploring the entire transcriptome universe"
readme = "README.md"
requires-python = ">=3.8"
Expand Down

0 comments on commit bede8fe

Please sign in to comment.