-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
75 lines (61 loc) · 2.49 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Perform diapause signature scoring on scRNA-seq data
"""
import os
import sys
import yaml
import anndata as ad
import pandas as pd
import numpy as np
def signature_score(
adata: ad.AnnData, geneset_path: os.PathLike, geneset_id: str, inplace: bool = True
) -> None:
"""
Calculate gene signature scores from average expression of weighted genes.
Args:
adata (ad.AnnData): AnnData object with scRNA-seq read counts.
geneset_path (os.PathLike): Path to geneset CSV file.
geneset_id (str): Gene set identifier (e.g. diapause).
inplace (bool, optional): Whether signature scores will be amended to
the AnnData .obs layer or returned as a DataFrame. Defaults to True.
Returns:
None: By default, AnnData .obs layer is automatically updated.
"""
try:
geneset = pd.read_csv(geneset_path)
except AttributeError:
"Cannot open path to geneset file"
# Z-score scaling allows for read counts to be centered around the mean, so that
# final scores can be maximized regardless of expression directionality.
try:
raw = adata.X.todense()
except AttributeError:
"Cannot find raw layer in AnnData object"
stdev = np.std(raw, axis=0)
znorm = raw - np.mean(raw, axis=0) / stdev
reads = (
pd.DataFrame(znorm, index=adata.obs_names, columns=adata.var_names)
.fillna(0)
.replace([np.inf, -np.inf], 0)
)
# Weights correspond to upregulated/downregulated genes from the gene set.
# Downregulated genes are multiplied by -1 to ensure that the final score
# represents the magnitude of the gene signature.
gene_up = [gene for gene in geneset[geneset["weight"] >= 0]["gene"]]
gene_down = [gene for gene in geneset[geneset["weight"] < 0]["gene"]]
adata_up = reads[reads.columns.intersection(gene_up)]
adata_down = reads[reads.columns.intersection(gene_down)] * -1
adata_all = pd.concat([adata_up, adata_down], axis=1)
gene_signature_score = np.round(adata_all.mean(axis=1), 3)
if inplace == True:
adata.obs.loc[:, f"{geneset_id}_score"] = gene_signature_score
else:
return gene_signature_score
def main():
with open("./config.yaml", "r") as file:
config = yaml.load(file, Loader=yaml.FullLoader)
adata = ad.read_h5ad(config['adata_path'])
signature_score(adata, geneset_path=config['geneset_path'], geneset_id="diapause")
adata.obs.to_csv(config['output_path'])
if __name__ == "__main__":
sys.exit(main())