forked from rajewsky-lab/novosparc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
reconstruct_tissue.py
108 lines (78 loc) · 3.83 KB
/
reconstruct_tissue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
###########
# imports #
###########
import novosparc
import time
import numpy as np
from scipy.spatial.distance import cdist
from scipy.stats import pearsonr
if __name__ == '__main__':
###################################
# 1. Import and subset the data ###
###################################
start_time = time.time()
print ('Loading data ... ', end='', flush=True)
# Read the gene names from the DGE. Assumes that the DGE is formatted
# with genes are rows and cells as columns.
gene_names = np.genfromtxt('/pathto/dge.txt',
usecols=range(1), dtype='str', skip_header=True)
# Read the DGE itself. Read only a maximum number of cells.
max_cell_number = 1000
dge = np.loadtxt('/pathto/dge.txt',
usecols=range(1, max_cell_number), skiprows=1)
# Optional: downsample number of cells.
cells_selected, dge = novosparc.pp.subsample_dge(dge, 500, 800)
num_cells = dge.shape[0]
# Optional: Subset to the highly variable genes or other set of genes
high_var_genes = np.genfromtxt('/pathto/hvg.txt', dtype='str')
dge = dge[np.nonzero(np.in1d(gene_names, high_var_genes))[0], :]
dge = dge.T
print ('done (', round(time.time()-start_time, 2), 'seconds )')
################################
# 2. Set the target space grid #
################################
print ('Constructing the target grid ... ', end='', flush=True)
# Construct a square target grid or read it if available
locations = novosparc.rc.construct_target_grid(num_cells)
print ('done')
######################################
# 3. Setup for the OT reconstruction #
######################################
cost_expression, cost_locations = novosparc.rc.setup_for_OT_reconstruction(dge, locations, 5)
#############################
# 4. Spatial reconstruction #
#############################
start_time = time.time()
print ('Reconstructing spatial information for',
num_cells, 'cells and', locations.shape[0], 'locations ... ')
# Distributions at target and source spaces. If unkown uniform distributions
# should be used.
p_locations, p_expression = novosparc.rc.create_space_distributions(len(locations), num_cells)
# alpha parameter controls the reconstruction. Set 0 for de novo, between
# 0 and 1 in case markers are available.
alpha_linear = 0
# Compute the mapping probabilities for each cell.
gw = gwa.gromov_wasserstein_adjusted_norm(1, cost_expression, cost_locations,
alpha_linear, p_expression, p_locations,
'square_loss', epsilon=5e-4, verbose=True)
# Compute the sDGE.
sdge = np.dot(dge_full.T, gw)
print (' ... done (', round(time.time()-start_time, 2), 'seconds )')
#########################################
# 5. Write data to disk for further use #
#########################################
start_time = time.time()
print ('Writing data to disk ...', end='', flush=True)
np.savetxt('output/genes.txt', gene_names, fmt='%s')
np.savetxt('output/high_var_genes.txt', high_var_genes, fmt='%s')
np.savetxt('output/grid_' + shape + '_' + str(num_cells) + '_cells_'
+ str(locations.shape[0]) + '_locations.txt', locations, fmt='%.4e')
np.savetxt('output/sdge_' + str(num_cells) + '_cells_'
+ str(locations.shape[0]) + '_locations.txt', sdge, fmt='%.4e')
print ('done (', round(time.time()-start_time, 2), 'seconds )')
###########################
# 6. Plot gene expression #
###########################
gene_list_to_plot = ['gene1', 'gene2']
novosparc.pl.plot_gene_patterns(locations, sdge, gene_list_to_plot,
folder='output/', gene_names=gene_names)