-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimulate_RNAseq.Rmd
executable file
·110 lines (86 loc) · 2.31 KB
/
simulate_RNAseq.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
```{r warning=FALSE}
library(dplyr)
library(polyester)
library(Biostrings)
```
```{r}
# Read the TSV file
gene_sets <- read.table('./output/gene_sets_with_indices.tsv', header=TRUE, sep='\t')
head(gene_sets)
```
```{r}
# Define fold change values for each gene set
fc_values <- list(
M47362 = 0.5,
M47501 = 0.5,
M47365 = 1,
M47478 = 1,
M47412 = 3,
M47419 = 5,
M47422 = 5,
M47425 = 10,
M47450 = 10,
M47452 = 20
)
```
# Add LogFC value to geneset
```{r}
# Create fold change table for each gene
set.seed(123)
STN <- 0.1
fc_table <- gene_sets %>%
group_by(gene_set) %>%
mutate(fc = runif(n(), fc_values[[unique(gene_set)]] - STN, fc_values[[unique(gene_set)]] + STN))
write.table(fc_table, file = './output/fc_table.tsv', sep = '\t', row.names = FALSE, quote = FALSE)
head(fc_table, 100)
```
```{r}
# Load the FASTA sequences
fasta_file <- './data/genome/gencode.v38.transcripts.fa'
fasta_sequences <- readDNAStringSet(fasta_file)
# Create a fold changes matrix
num_genes <- length(fasta_sequences)
num_groups <- 2 # Case and control
fold_changes <- matrix(1, nrow=num_genes, ncol=num_groups)
# Update fold changes for the genes in the fc_table
for (i in 1:nrow(fc_table)) {
gene_index <- fc_table$index[i]
fc <- fc_table$fc[i]
fold_changes[gene_index, 1] <- fc # Case group
fold_changes[gene_index, 2] <- 1 # Control group (no change)
}
print(head(fold_changes))
```
```{r}
# Check the fold_changes matrix for updated values
updated_indices <- which(fold_changes[, 1] != 1)
print(fold_changes[updated_indices, ])
```
```{r}
# Verify fold_changes matrix dimensions
dim(fold_changes)
```
```{r}
reads_per_transcript <- rep(300, num_genes) # Default 300
# Ensure the reads_per_transcript length is correct
if (length(reads_per_transcript) != num_genes) {
stop("reads_per_transcript length does not match the number of genes")
}
```
# Simulation
```{r}
# Simulate RNA-seq experiment
num_reps <- 5
simulate_experiment(
fasta = fasta_file,
num_reps = c(num_reps, num_reps),
fold_changes = fold_changes,
reads_per_transcript = reads_per_transcript,
# meanmodel=TRUE,
outdir = './output/simulated_rnaseq_data',
paired = FALSE
)
# Verify the simulation
simulated_files <- list.files('./output/simulated_rnaseq_data', full.names = TRUE)
print(simulated_files)
```