-
Notifications
You must be signed in to change notification settings - Fork 1
/
Run_SparseCC.sh
60 lines (52 loc) · 3.46 KB
/
Run_SparseCC.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
###############################################
#1. Activate the conda environment with SparCC#
###############################################
#Name of the conda environment that gets activated below.
ENV=metaSNV
#Stop right away if the environment cannot be activated: every later step calls
#SparCC.py, MakeBootstraps.py and PseudoPvals.py by bare name, so continuing
#without the environment would either fail or pick up whatever is on PATH.
source activate "$ENV" || { echo "ERROR: could not activate conda environment '$ENV'" >&2; exit 1; }
#################
#2. Run sparseCC#
#################
#Usage: Compute the correlation between components (e.g. OTUs). By default uses the SparCC algorithm to account for compositional effects. Correlation and covariance (when applicable) matrices are written out as txt files.
#The counts file needs to be a tab-delimited text file where columns are samples and rows are components (e.g. OTUs).
#Usage: SparCC.py counts_file [options]
#Example: SparCC.py example/fake_data.txt -i 20 --cor_file=example/basis_corr/cor_mat_sparcc.out
#Options:
# -h, --help show this help message and exit
# -c COR_FILE, --cor_file=COR_FILE
# File to which correlation matrix will be written.
# -v COV_FILE, --cov_file=COV_FILE
# File to which covariance matrix will be written.
# -a ALGO, --algo=ALGO Name of algorithm used to compute correlations (SparCC
# (default) | pearson | spearman | kendall)
# -i ITER, --iter=ITER Number of inference iterations to average over (20
# default).
# -x XITER, --xiter=XITER
# Number of exclusion iterations to remove strongly
# correlated pairs (10 default).
# -t TH, --thershold=TH
# Correlation strength exclusion threshold (0.1
# default).
#Before running this script the data must be preprocessed; see the main R script for that.
#Preprocessing consists of: 1. Removal of low-prevalence taxa (0s are considered to be due to undersampling). 2. Formatting of the data (transposition; first column = taxa, first row = samples). 3. Splitting the table into separate Cases and Controls files.
#Note: I changed core_methods.py so that the normalize function always uses np.apply_along_axis, regardless of the class of the input object. Otherwise the output was a Series of shape (x,), which failed later on because it had no column dimension.
#Run SparCC on the observed data, producing one correlation matrix per group.
#-i 20 (inference iterations) and -t 0.1 (exclusion threshold) are passed
#explicitly to both runs so cases and controls are processed identically.
#The output folders are created up front in case SparCC.py does not create them
#itself, and the script stops at the first failed run so that later steps never
#work on a missing or stale matrix.
mkdir -p Results/SparseCC/Cases Results/SparseCC/Controls || { echo "ERROR: could not create SparCC output folders" >&2; exit 1; }
echo "Running SparCC in cases"
#Run in cases
SparCC.py Data/SparseCC_input/Cases.tsv -i 20 -t 0.1 --cor_file=Results/SparseCC/Cases/Corr_matrix_cases.txt || { echo "ERROR: SparCC failed for cases" >&2; exit 1; }
echo "Running SparCC in controls"
#Run in controls
SparCC.py Data/SparseCC_input/Controls.tsv -i 20 -t 0.1 --cor_file=Results/SparseCC/Controls/Corr_matrix_controls.txt || { echo "ERROR: SparCC failed for controls" >&2; exit 1; }
#Run permutations
#Create the permuted datasets that the null distribution is built from.
#-p is the output folder and -t the file-name template ('#' stands for the
#permutation index), so the files are expected as <folder>/permuted_0,
#permuted_1, ...
#-n 100 is given explicitly because the following steps read permuted_0 to
#permuted_99 and pass 100 to PseudoPvals.py; the three numbers must agree.
#The target folders are created first in case MakeBootstraps.py does not create
#them itself, and a failed run aborts the script.
mkdir -p Data/SparseCC_input/Perm_cases Data/SparseCC_input/Perm_controls || { echo "ERROR: could not create permutation folders" >&2; exit 1; }
echo "Permuting cases"
MakeBootstraps.py Data/SparseCC_input/Cases.tsv -n 100 -p Data/SparseCC_input/Perm_cases/ -t 'permuted_#' || { echo "ERROR: MakeBootstraps failed for cases" >&2; exit 1; }
echo "Permuting controls"
MakeBootstraps.py Data/SparseCC_input/Controls.tsv -n 100 -p Data/SparseCC_input/Perm_controls/ -t 'permuted_#' || { echo "ERROR: MakeBootstraps failed for controls" >&2; exit 1; }
#Get null distribution
#Run SparCC on each of the 100 permuted datasets (indices 0-99) of both groups,
#with the same -i/-t settings used for the observed data, writing one
#correlation matrix per permutation.
#A plain counter loop replaces the backtick/seq construct, paths are quoted, and
#the first failing run aborts the script so the p-value step never sees an
#incomplete set of matrices.
mkdir -p Results/SparseCC/Cases/Perm Results/SparseCC/Controls/Perm || { echo "ERROR: could not create Perm output folders" >&2; exit 1; }
echo "Generating NULL cases"
i=0
while [ "$i" -le 99 ]; do
    SparCC.py "Data/SparseCC_input/Perm_cases/permuted_$i" -c "Results/SparseCC/Cases/Perm/simulated_sparcc_$i.txt" -i 20 -t 0.1 || { echo "ERROR: SparCC failed on permuted cases dataset $i" >&2; exit 1; }
    i=$((i + 1))
done
echo "Generating NULL controls"
i=0
while [ "$i" -le 99 ]; do
    SparCC.py "Data/SparseCC_input/Perm_controls/permuted_$i" -c "Results/SparseCC/Controls/Perm/simulated_sparcc_$i.txt" -i 20 -t 0.1 || { echo "ERROR: SparCC failed on permuted controls dataset $i" >&2; exit 1; }
    i=$((i + 1))
done
#Compute Pvalues from Null
#Each observed correlation matrix is compared with its 100 permuted matrices
#('#' in the quoted template stands for the permutation index; the quotes keep
#the shell from touching it).
#The same test type is used for both groups. Previously cases were computed
#with one_sided and controls with two_sided, which makes the two P matrices
#non-comparable. Set PVAL_TYPE=one_sided if one-sided p-values are wanted for
#both groups instead.
PVAL_TYPE=two_sided
echo "Computing Ps in cases"
PseudoPvals.py Results/SparseCC/Cases/Corr_matrix_cases.txt 'Results/SparseCC/Cases/Perm/simulated_sparcc_#.txt' 100 -o Results/SparseCC/Cases/P_matrix.txt -t "$PVAL_TYPE" || { echo "ERROR: PseudoPvals failed for cases" >&2; exit 1; }
echo "Computing Ps in controls"
PseudoPvals.py Results/SparseCC/Controls/Corr_matrix_controls.txt 'Results/SparseCC/Controls/Perm/simulated_sparcc_#.txt' 100 -o Results/SparseCC/Controls/P_matrix.txt -t "$PVAL_TYPE" || { echo "ERROR: PseudoPvals failed for controls" >&2; exit 1; }