KIAT_F1_data_analysis.Rmd

---
title: "KIAT_F1_data_analysis"
author: "Ruijuan Li"
date: "August 12, 2016"
output: html_document
---
######
```{r}
# reformat.vcf.F1(vcf) is the function I wrote to pre-process vcf files from F1 
```
######

```{r}
# 1) download file from ftp using ftp_file_transfer.sh
# 1.1) unzip files using gunzip *  
# 2) get first 1000 sequences for each file extract_1000_reads.sh, actually worked with all data instead of the 1st 1000 seqs
# 3) QC file using fastQC: fastqc *.fq -o /Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/2016_summer/fastqc_result
# 4) download Brassica_napus.annotation_v5.cds.fa to whitney 
# 5) collapse fastqc summary result into one file & get overrepresented sequences using ./make_summary_report.sh 

# 6) reformat summary_all.txt (the collapsed fastQC summaries) from long to wide
# NOTE: setwd() is kept on purpose: the relative paths below (input file and
# the CSV that is written) resolve against this analysis directory.
setwd("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/analysis/")
# header = FALSE is spelled out; `F` is an ordinary variable that can be reassigned
fastQC_summary <- read.delim("./fastQC_result/summary_all.txt", header = FALSE)
dim(fastQC_summary)
head(fastQC_summary)
tail(fastQC_summary)

# one row per V3 value and one "V1.<V2 value>" column per V2 value
# (presumably V1 = PASS/WARN/FAIL, V2 = fastQC module, V3 = file name; confirm against the input file)
fastQC_summary_wide <- reshape(fastQC_summary, timevar = "V2", idvar = "V3", direction = "wide")
# drop the leading "V1" plus the separator character that reshape() prepends to the new columns
colnames(fastQC_summary_wide) <- gsub("(V1)(.)([[:print:]]+)", "\\3", colnames(fastQC_summary_wide))
dim(fastQC_summary_wide)

# written into the working directory set above
write.csv(fastQC_summary_wide, file = "fastQC_summary_wide.csv")

# 7) checked the source of overrepresented sequences using the web nucleotide BLAST tool; found that most of them are from 
# chloroplast, mitochondria, rRNA, transposons, etc. Some are Illumina or TruSeq adapters 

# BLASTing the sequences against the Trimmomatic adapters suggests that TruSeq3-PE-2.fa should be used to trim the data. (Q: how to deal with the others? will they affect the final result? search... )

# Qs on plastid sequence contamination: 1) why are these sequences there? check the library-making step 2) will they affect downstream analysis if kept? CDS mapping VS genome mapping 3) If yes, how to remove them? (I need to figure these Qs out.)

# 8) using kallisto to build the reference cds index 
# kallisto index -k 19 -i Brassica_napus.annotation_v5.cds.19.kai /Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/Brassica_napus.annotation_v5.cds.fa

# 9) map to reference cds index and get read count file using bunchrun_kallisto.sh (using est_counts from Kallisto)
```

# data summary statistics: read count number & expressed genes 
```{r}
# Load the normalized read-count table; its first column is used as the row names.
F1.read.count.normalized <- read.table("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/read.count.normalized.tsv", header = TRUE, check.names = FALSE)
rownames(F1.read.count.normalized) <- F1.read.count.normalized[, 1]
F1.read.count.normalized <- F1.read.count.normalized[, -1]

# format data
head(F1.read.count.normalized)
dim(F1.read.count.normalized) # 101040     24
colnames(F1.read.count.normalized)
colnames(F1.read.count.normalized) <- sub("_2.fq", "", colnames(F1.read.count.normalized), fixed = TRUE) # remove _2.fq

# sample description
# NOTE: new_sample_ID.F1 is not created in this chunk; it is built from the
# sample description in the "Expression analysis 1) formatting data" chunk,
# which therefore has to be run first. Fail early if it is missing or does not
# have one ID per library (a too-short vector would silently give NA names).
stopifnot(
  exists("new_sample_ID.F1"),
  length(new_sample_ID.F1) == ncol(F1.read.count.normalized)
)
new_sample_ID.F1

# replace sample ID
# NOTE(review): IDs are assigned by position; this assumes the count columns are
# in the same order as the sample description sorted by Sample.ID. Confirm.
colnames(F1.read.count.normalized) <- new_sample_ID.F1
head(F1.read.count.normalized)
save(F1.read.count.normalized, file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/F1.read.count.normalized.Rdata")

# Number of genes with a normalized count > 3 in each library (summary table).
# NOTE(review): the previous comment here said "> 10" but the code compares
# against 3; the cutoff is left unchanged. Confirm which one is intended.
# vapply() pins the result to one integer per library.
expressed_gene_number.F1 <- vapply(
  colnames(F1.read.count.normalized),
  function(RNAlib) sum(F1.read.count.normalized[, RNAlib] > 3),
  integer(1)
)
expressed_gene_number.F1 <- as.data.frame(expressed_gene_number.F1)
expressed_gene_number.F1

# Top 10 most highly expressed genes in each library:
# a 10 x <number of libraries> character matrix of gene IDs, one column per library.
most_highly_expressed_genes.F1 <- vapply(
  colnames(F1.read.count.normalized),
  function(RNAlib) {
    # rank the genes of this library only, instead of reordering the whole table
    ranked <- order(F1.read.count.normalized[, RNAlib], decreasing = TRUE)
    rownames(F1.read.count.normalized)[head(ranked, 10)]
  },
  character(10)
)
# write.csv(t(most_highly_expressed_genes.F1), file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/most_highly_expressed_genes.F1.csv")

# Combine per-sample metadata with the expressed-gene counts.
# sample_des.F1 is not created in this chunk (it is read in the
# "Expression analysis 1) formatting data" chunk).
temp <- sample_des.F1[, c(5, 6, 7, 10)]
# row names = columns 5, 6 and 7 of the sample description joined by "_"
# (these are the first three columns of temp), so they can be matched against
# the row names of expressed_gene_number.F1
rownames(temp) <- paste(temp[[1]], temp[[2]], temp[[3]], sep = "_")
sample_statistics.F1 <- merge(temp, expressed_gene_number.F1, by = "row.names")
# NOTE(review): "total_reads" labels column 10 of the sample description; confirm that is what the column holds
names(sample_statistics.F1) <- c(
  "sample_ID", "Cultivar", "Tissue", "rep",
  "total_reads", "expressed_gene_number"
)
head(sample_statistics.F1)
colnames(sample_statistics.F1)
# write.csv(sample_statistics.F1, file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/sample_statisitcs.F1.csv")
```

# Expression analysis 1) formatting data   
```{r}
# Read the raw read-count table.
F1.read.count <- read.table(
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/read.count.tsv",
  header = TRUE,
  check.names = FALSE
)

# promote the first column to row names, then drop it from the table
row.names(F1.read.count) <- F1.read.count[[1]]
F1.read.count <- F1.read.count[, -1]

# inspect, then strip the literal "_2.fq" from the library (column) names
head(F1.read.count)
dim(F1.read.count) # 101040     24
colnames(F1.read.count)
names(F1.read.count) <- sub("_2.fq", "", names(F1.read.count), fixed = TRUE)

# Sample description table, sorted by Sample.ID
sample_des.F1 <- read.csv(file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/F1_summary.csv")
dim(sample_des.F1)
sorted_sample_des.F1 <- sample_des.F1[order(sample_des.F1$Sample.ID), ]

sorted_sample_des.F1[, 5:8]
# new library IDs: <cultivar>_<Stage>_<rep>, in Sample.ID order
new_sample_ID.F1 <- paste(
  sorted_sample_des.F1$cultivar,
  sorted_sample_des.F1$Stage,
  sorted_sample_des.F1$rep,
  sep = "_"
)
new_sample_ID.F1

# library sizes: strip the "Gb" text from TotalBases.Gb. and report mean / min / max
mean(as.numeric(sub("Gb", "", sample_des.F1$TotalBases.Gb., fixed = TRUE)))
min(as.numeric(sub("Gb", "", sample_des.F1$TotalBases.Gb., fixed = TRUE)))
max(as.numeric(sub("Gb", "", sample_des.F1$TotalBases.Gb., fixed = TRUE)))

# swap the library names for the new sample IDs (assigned by position), then save
colnames(F1.read.count)
names(F1.read.count) <- new_sample_ID.F1
head(F1.read.count)
save(F1.read.count, file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/F1.read.count.Rdata")
```

# expression analysis 2) set up sample description 
```{r}
# load necessary libs & functions 
library(edgeR)
library(ggplot2)

# Assign sample groups and filter genes on raw read counts.
load("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/F1.read.count.Rdata")

# keep only libraries with more than one million counts in total
F1.read.count <- F1.read.count[, colSums(F1.read.count) > 1e6]
dim(F1.read.count) # 101040     24
colnames(F1.read.count)

# Sample sheet derived from the library names via one shared regex:
#   capture 1 = genotype, capture 3 = tissue, capture 5 = replicate (used as batch);
#   group = genotype and tissue pasted together.
F1.read.count.sample <- local({
  lib_names <- colnames(F1.read.count)
  name_pattern <- "(414F1|415F1)(_)(flowering|early-silique|late-silique|Young)(_)(1|2|3)"
  field <- function(backref) factor(gsub(name_pattern, backref, lib_names))
  data.frame(
    file = lib_names,
    batch = field("\\5"),
    genotype = field("\\1"),
    tissue = field("\\3"),
    group = field("\\1\\3")
  )
})

F1.read.count.sample
ftable(F1.read.count.sample, row.vars = "tissue", col.vars = c("batch", "genotype"))

# keep genes with more than 10 reads in at least 3 libraries
F1.read.count.small <- F1.read.count[rowSums(F1.read.count > 10) >= 3, ]
dim(F1.read.count.small) # 60176  24
save(F1.read.count.small, file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/F1.read.count.small.Rdata")
```

# expression analysis 3) voom transformation # think more about the design in voom transformation... 
```{r}
# Variance-stabilizing transformation of the filtered counts.
# (This uses DESeq2's varianceStabilizingTransformation(), not limma's voom.)
# source("https://bioconductor.org/biocLite.R")
# biocLite("DESeq2")
load("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/F1.read.count.small.Rdata")
library("DESeq2")

# counts are rounded because they are kallisto estimated counts;
# F1.read.count.sample comes from the sample set-up chunk
dds.F1 <- DESeqDataSetFromMatrix(
  countData = round(F1.read.count.small),
  colData = F1.read.count.sample,
  design = ~ batch + genotype * tissue
)

# NOTE(review): called with its default arguments; check the DESeq2 docs for
# whether the design above is used at all by the transformation (`blind` argument).
vsd.F1 <- varianceStabilizingTransformation(dds.F1)
vstMat.F1 <- assay(vsd.F1)
colnames(vstMat.F1) <- names(F1.read.count.small)
save(vstMat.F1, file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/vstMat.F1.Rdata")
```

# MDS, clustering 
```{r}
# Reload the transformed matrix and the filtered counts saved by earlier chunks.
load("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/vstMat.F1.Rdata")
load("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/F1.read.count.small.Rdata")

# normalize (TMM); groups come from the sample sheet F1.read.count.sample
dge.data.F1 <- DGEList(counts = F1.read.count.small, group = F1.read.count.sample$group)
dge.data.F1 <- calcNormFactors(dge.data.F1, method = "TMM")
# full component name `samples` instead of relying on `$` partial matching of "sample"
dge.data.F1$samples
hist(dge.data.F1$samples$norm.factors)

# MDS to check sample separation (a dendrogram is drawn further below).
# plotMDS(dge.data.F1, method = "bcv",labels = rownames(dge.data.F1$samples))
mds.F1 <- plotMDS(dge.data.F1, method = "bcv", labels = dge.data.F1$samples$group)

# join the two MDS coordinates on their row names
x.F1 <- as.data.frame(mds.F1$x)
y.F1 <- as.data.frame(mds.F1$y)
distance_matrix.F1 <- merge(x.F1, y.F1, by = "row.names")

# derive group / genotype / tissue from the merged Row.names column with the
# same regex used for the sample sheet
distance_matrix.F1[c("group", "gt", "tissue")] <- local({
  lib_names <- distance_matrix.F1$Row.names
  name_pattern <- "(414F1|415F1)(_)(flowering|early-silique|late-silique|Young)(_)(1|2|3)"
  lapply(
    c("\\1\\3", "\\1", "\\3"),
    function(backref) gsub(name_pattern, backref, lib_names)
  )
})

colnames(distance_matrix.F1) <- c("lib", "x", "y", "group", "gt", "tissue")
head(distance_matrix.F1)

# colour MDS figure: colour = genotype, shape = tissue
p <- ggplot(data = distance_matrix.F1) +
  geom_point(aes(x, y, color = factor(gt), shape = factor(tissue)), size = 3) +
  labs(y = "BCV distance 2", x = "BCV distance 1")
# p <- p + facet_grid(~gt)
p

ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/MDS_2.png", width = 6, height = 4)

# Hierarchical clustering to check sample separation.
# install.packages("ggdendro")
library(ggdendro)
library(pvclust)

# cluster the libraries on the first 1000 rows of the transformed matrix
hc.F1 <- hclust(dist(t(vstMat.F1[seq_len(1000), ])))
# test <- ggdendrogram(hc.F1, theme_dendro = F, rotate = T)

ggdata <- dendro_data(as.dendrogram(hc.F1))
# annotate the leaf labels with genotype and tissue parsed from the library name
ggdata$labels[c("gt", "tissue")] <- local({
  leaf_names <- ggdata$labels$label
  name_pattern <- "(414F1|415F1)(_)(flowering|early-silique|late-silique|Young)(_)(1|2|3)"
  lapply(c("\\1", "\\3"), function(backref) gsub(name_pattern, backref, leaf_names))
})
ggdata$labels$group <- paste(ggdata$labels$gt, ggdata$labels$tissue, sep = "_")
ggdata$labels

# dendrogram: segments plus leaf labels coloured by group, drawn sideways
p2 <- ggplot(data = segment(ggdata)) +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
  theme_dendro() +
  geom_text(
    data = label(ggdata),
    aes(x = x, y = y, label = label, hjust = 0, color = group)
  ) +
  coord_flip() +
  scale_y_reverse(expand = c(0.2, 0))
p2
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/clustering.png", width = 13, height = 8)
```

# expression analysis 
```{r}
# Pairwise comparisons with a GLM: one coefficient per genotype-by-tissue group
# (no intercept, no batch term).
design.F1 <- model.matrix(~ 0 + group, data = F1.read.count.sample)

# dispersion estimates (w/o batch effect): common, then trended, then tagwise
dge.data.F1 <- estimateGLMCommonDisp(dge.data.F1, design.F1, verbose = TRUE) # Disp = 0.08264 , BCV = 0.2875
dge.data.F1 <- estimateGLMTagwiseDisp(
  estimateGLMTrendedDisp(dge.data.F1, design.F1),
  design.F1
)
plotBCV(dge.data.F1)

## model fit (w/o batch effect); the printed column names are the coefficients
## that the contrasts below refer to
fit.F1 <- glmFit(dge.data.F1, design.F1)
colnames(fit.F1$design)

# between genotypes, 414 as reference level (each test is 415F1 minus 414F1 within one tissue)

# Build a contrast over the columns of fit.F1$design by column NAME.
# The positional vectors used before (e.g. c(0,0,0,-1,0,0,0,1)) silently depend
# on the order of the factor levels of `group`, and that order is
# locale-dependent ("Young" sorts before the lower-case tissues in the C locale).
# Returns a plain numeric vector: +1 at `plus`, -1 at `minus`, 0 elsewhere.
named_contrast.F1 <- function(plus, minus, design_cols = colnames(fit.F1$design)) {
  stopifnot(plus %in% design_cols, minus %in% design_cols)
  contrast <- setNames(numeric(length(design_cols)), design_cols)
  contrast[plus] <- 1
  contrast[minus] <- -1
  unname(contrast)
}

lrt.young.F1 <- glmLRT(fit.F1, contrast = named_contrast.F1("group415F1Young", "group414F1Young"))
topTags(lrt.young.F1)
summary(de.young.F1 <- decideTestsDGE(lrt.young.F1, p.value = 0.05))
# full table computed once, then filtered at FDR < 0.05
DEgene.young.F1 <- topTags(lrt.young.F1, n = Inf)$table
DEgene.young.F1 <- DEgene.young.F1[DEgene.young.F1$FDR < 0.05, ]
# -1  1640
# 0  56152
# 1   2384

lrt.flowering.F1 <- glmLRT(fit.F1, contrast = named_contrast.F1("group415F1flowering", "group414F1flowering"))
topTags(lrt.flowering.F1)
summary(de.flowering.F1 <- decideTestsDGE(lrt.flowering.F1, p.value = 0.05))
DEgene.flowering.F1 <- topTags(lrt.flowering.F1, n = Inf)$table
DEgene.flowering.F1 <- DEgene.flowering.F1[DEgene.flowering.F1$FDR < 0.05, ]
# -1   961
# 0  58526
# 1    689

lrt.early.silique.F1 <- glmLRT(fit.F1, contrast = named_contrast.F1("group415F1early-silique", "group414F1early-silique"))
topTags(lrt.early.silique.F1)
summary(de.early.silique.F1 <- decideTestsDGE(lrt.early.silique.F1, p.value = 0.05))
DEgene.early.silique.F1 <- topTags(lrt.early.silique.F1, n = Inf)$table
DEgene.early.silique.F1 <- DEgene.early.silique.F1[DEgene.early.silique.F1$FDR < 0.05, ]
# -1    69
# 0  60061
# 1     46

lrt.late.silique.F1 <- glmLRT(fit.F1, contrast = named_contrast.F1("group415F1late-silique", "group414F1late-silique"))
topTags(lrt.late.silique.F1)
summary(de.late.silique.F1 <- decideTestsDGE(lrt.late.silique.F1, p.value = 0.05))
DEgene.late.silique.F1 <- topTags(lrt.late.silique.F1, n = Inf)$table
DEgene.late.silique.F1 <- DEgene.late.silique.F1[DEgene.late.silique.F1$FDR < 0.05, ]
# -1   170
# 0  59840
# 1    166

### reformat for ggplot barplot
# Classify every gene as down (-1) / not significant (0) / up (1) per tissue.
# `p.value` is spelled out instead of relying on partial matching of `p`, and
# the assignments are no longer hidden inside the data.frame() arguments.
de.young.F1 <- decideTestsDGE(lrt.young.F1, p.value = 0.05)
de.flowering.F1 <- decideTestsDGE(lrt.flowering.F1, p.value = 0.05)
de.early.silique.F1 <- decideTestsDGE(lrt.early.silique.F1, p.value = 0.05)
de.late.silique.F1 <- decideTestsDGE(lrt.late.silique.F1, p.value = 0.05)

# one column per tissue, holding the three class counts
DEGs_number_between_gt <- data.frame(
  young = as.data.frame(summary(de.young.F1))$Freq,
  flowering = as.data.frame(summary(de.flowering.F1))$Freq,
  early.silique = as.data.frame(summary(de.early.silique.F1))$Freq,
  late.silique = as.data.frame(summary(de.late.silique.F1))$Freq
)

# rows are labelled by position (down / no change / up); keep only down and up
rownames(DEGs_number_between_gt) <- c("down", "no", "up")
DEGs_number_between_gt <- DEGs_number_between_gt[c("down", "up"), ]
DEGs_number_between_gt

library(reshape2)
# long format: melt() stacks the columns in order, so down/up alternate per tissue
DEGs_number_between_gt.melt <- melt(DEGs_number_between_gt)
DEGs_number_between_gt.melt$DE <- rep(c("down", "up"), 4)
colnames(DEGs_number_between_gt.melt) <- c("tissue", "number", "DE")
DEGs_number_between_gt.melt

# reorder: up 1st down 2nd
DEGs_number_between_gt.melt$DE <- factor(DEGs_number_between_gt.melt$DE, levels = c("up", "down"))

DEGs_number_between_gt.melt <- DEGs_number_between_gt.melt[order(DEGs_number_between_gt.melt$DE), ]
DEGs_number_between_gt.melt

### making ggplot for DEGs: one facet per tissue, bars for up / down
p.DEGs_number_between_gt <- ggplot(data = DEGs_number_between_gt.melt)
p.DEGs_number_between_gt <- p.DEGs_number_between_gt + geom_bar(mapping = aes(fill = DE, x = factor(DE), y = number), stat = "identity")
p.DEGs_number_between_gt <- p.DEGs_number_between_gt + facet_grid(~tissue)
p.DEGs_number_between_gt <- p.DEGs_number_between_gt + labs(y = "number of differentially expressed genes", x = "")

p.DEGs_number_between_gt
ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/DEGs_number_between_gt.png", width = 6, height = 4)

colnames(fit.F1$design)
# between developmental stages (each test is the later stage minus the earlier stage)

# Build a contrast over the columns of fit.F1$design by column NAME.
# The positional vectors used before silently depend on the order of the factor
# levels of `group`, and that order is locale-dependent ("Young" sorts before
# the lower-case tissues in the C locale).
# Returns a plain numeric vector: +1 at `plus`, -1 at `minus`, 0 elsewhere.
named_contrast.F1 <- function(plus, minus, design_cols = colnames(fit.F1$design)) {
  stopifnot(plus %in% design_cols, minus %in% design_cols)
  contrast <- setNames(numeric(length(design_cols)), design_cols)
  contrast[plus] <- 1
  contrast[minus] <- -1
  unname(contrast)
}

### 414
lrt.young.vs.flowering.414 <- glmLRT(fit.F1, contrast = named_contrast.F1("group414F1flowering", "group414F1Young"))
topTags(lrt.young.vs.flowering.414)
summary(de.young.vs.flowering.414 <- decideTestsDGE(lrt.young.vs.flowering.414, p.value = 0.05))
# full table computed once, then filtered at FDR < 0.05
DEgene.young.vs.flowering.414 <- topTags(lrt.young.vs.flowering.414, n = Inf)$table
DEgene.young.vs.flowering.414 <- DEgene.young.vs.flowering.414[DEgene.young.vs.flowering.414$FDR < 0.05, ]
# -1 12706
# 0  30576
# 1  16894

lrt.flowering.vs.early.silique.414 <- glmLRT(fit.F1, contrast = named_contrast.F1("group414F1early-silique", "group414F1flowering"))
topTags(lrt.flowering.vs.early.silique.414)
summary(de.flowering.vs.early.silique.414 <- decideTestsDGE(lrt.flowering.vs.early.silique.414, p.value = 0.05))
DEgene.flowering.vs.early.silique.414 <- topTags(lrt.flowering.vs.early.silique.414, n = Inf)$table
DEgene.flowering.vs.early.silique.414 <- DEgene.flowering.vs.early.silique.414[DEgene.flowering.vs.early.silique.414$FDR < 0.05, ]
# -1  8920
# 0  44337
# 1   6919

lrt.early.vs.late.silique.414 <- glmLRT(fit.F1, contrast = named_contrast.F1("group414F1late-silique", "group414F1early-silique"))
topTags(lrt.early.vs.late.silique.414)
summary(de.early.vs.late.silique.414 <- decideTestsDGE(lrt.early.vs.late.silique.414, p.value = 0.05))
DEgene.early.vs.late.silique.414 <- topTags(lrt.early.vs.late.silique.414, n = Inf)$table
DEgene.early.vs.late.silique.414 <- DEgene.early.vs.late.silique.414[DEgene.early.vs.late.silique.414$FDR < 0.05, ]
# -1  3049
# 0  53803
# 1   3324

### ggplot
### reformat for ggplot barplot
# Classify every gene as down (-1) / not significant (0) / up (1) for each
# stage transition in 414. `p.value` is spelled out instead of relying on
# partial matching of `p`.
de.young.vs.flowering.414 <- decideTestsDGE(lrt.young.vs.flowering.414, p.value = 0.05)
de.flowering.vs.early.silique.414 <- decideTestsDGE(lrt.flowering.vs.early.silique.414, p.value = 0.05)
de.early.vs.late.silique.414 <- decideTestsDGE(lrt.early.vs.late.silique.414, p.value = 0.05)

# One column per transition. data.frame() turns the hyphens in these names into
# dots, and the names end up as facet labels, so the "early.slique" typo is
# corrected to "early.silique" here.
DEGs_number_between_tissue.414 <- data.frame(
  "flowering-vs-young" = as.data.frame(summary(de.young.vs.flowering.414))$Freq,
  "early.silique-vs-flowering" = as.data.frame(summary(de.flowering.vs.early.silique.414))$Freq,
  "late.silique-vs-early.silique" = as.data.frame(summary(de.early.vs.late.silique.414))$Freq
)

# rows are labelled by position (down / no change / up); keep only down and up
rownames(DEGs_number_between_tissue.414) <- c("down", "no", "up")
DEGs_number_between_tissue.414 <- DEGs_number_between_tissue.414[c("down", "up"), ]
DEGs_number_between_tissue.414

# long format: melt() stacks the columns in order, so down/up alternate per transition
DEGs_number_between_tissue.414.melt <- melt(DEGs_number_between_tissue.414)
DEGs_number_between_tissue.414.melt$DE <- rep(c("down", "up"), 3)
colnames(DEGs_number_between_tissue.414.melt) <- c("tissue", "number", "DE")
DEGs_number_between_tissue.414.melt

# reorder: up 1st down 2nd
DEGs_number_between_tissue.414.melt$DE <- factor(DEGs_number_between_tissue.414.melt$DE, levels = c("up", "down"))

DEGs_number_between_tissue.414.melt <- DEGs_number_between_tissue.414.melt[order(DEGs_number_between_tissue.414.melt$DE), ]
DEGs_number_between_tissue.414.melt

### making ggplot for DEGs: one facet per transition, bars for up / down
p.DEGs_number_between_tissue.414 <- ggplot(data = DEGs_number_between_tissue.414.melt)
p.DEGs_number_between_tissue.414 <- p.DEGs_number_between_tissue.414 + geom_bar(mapping = aes(fill = DE, x = factor(DE), y = number), stat = "identity")
p.DEGs_number_between_tissue.414 <- p.DEGs_number_between_tissue.414 + facet_grid(~tissue)
p.DEGs_number_between_tissue.414 <- p.DEGs_number_between_tissue.414 + labs(y = "number of differentially expressed genes", x = "")

p.DEGs_number_between_tissue.414
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/DEGs_number_between_tissue.414.png", width = 11, height = 8)  

### 415
# between developmental stages (each test is the later stage minus the earlier stage)

# Build a contrast over the columns of fit.F1$design by column NAME.
# The positional vectors used before silently depend on the order of the factor
# levels of `group`, and that order is locale-dependent ("Young" sorts before
# the lower-case tissues in the C locale).
# Returns a plain numeric vector: +1 at `plus`, -1 at `minus`, 0 elsewhere.
named_contrast.F1 <- function(plus, minus, design_cols = colnames(fit.F1$design)) {
  stopifnot(plus %in% design_cols, minus %in% design_cols)
  contrast <- setNames(numeric(length(design_cols)), design_cols)
  contrast[plus] <- 1
  contrast[minus] <- -1
  unname(contrast)
}

lrt.young.vs.flowering.415 <- glmLRT(fit.F1, contrast = named_contrast.F1("group415F1flowering", "group415F1Young"))
topTags(lrt.young.vs.flowering.415)
summary(de.young.vs.flowering.415 <- decideTestsDGE(lrt.young.vs.flowering.415, p.value = 0.05))
# full table computed once, then filtered at FDR < 0.05
DEgene.young.vs.flowering.415 <- topTags(lrt.young.vs.flowering.415, n = Inf)$table
DEgene.young.vs.flowering.415 <- DEgene.young.vs.flowering.415[DEgene.young.vs.flowering.415$FDR < 0.05, ]
# -1 12450
# 0  32614
# 1  15112

lrt.flowering.vs.early.silique.415 <- glmLRT(fit.F1, contrast = named_contrast.F1("group415F1early-silique", "group415F1flowering"))
topTags(lrt.flowering.vs.early.silique.415)
summary(de.flowering.vs.early.silique.415 <- decideTestsDGE(lrt.flowering.vs.early.silique.415, p.value = 0.05))
DEgene.flowering.vs.early.silique.415 <- topTags(lrt.flowering.vs.early.silique.415, n = Inf)$table
DEgene.flowering.vs.early.silique.415 <- DEgene.flowering.vs.early.silique.415[DEgene.flowering.vs.early.silique.415$FDR < 0.05, ]
# -1  7666
# 0  47946
# 1   4564

lrt.early.vs.late.silique.415 <- glmLRT(fit.F1, contrast = named_contrast.F1("group415F1late-silique", "group415F1early-silique"))
topTags(lrt.early.vs.late.silique.415)
summary(de.early.vs.late.silique.415 <- decideTestsDGE(lrt.early.vs.late.silique.415, p.value = 0.05))
DEgene.early.vs.late.silique.415 <- topTags(lrt.early.vs.late.silique.415, n = Inf)$table
DEgene.early.vs.late.silique.415 <- DEgene.early.vs.late.silique.415[DEgene.early.vs.late.silique.415$FDR < 0.05, ]
# -1  3074
# 0  53197
# 1   3905

### ggplot
### reformat for ggplot barplot
# Classify every gene as down (-1) / not significant (0) / up (1) for each
# stage transition in 415. `p.value` is spelled out instead of relying on
# partial matching of `p`.
de.young.vs.flowering.415 <- decideTestsDGE(lrt.young.vs.flowering.415, p.value = 0.05)
de.flowering.vs.early.silique.415 <- decideTestsDGE(lrt.flowering.vs.early.silique.415, p.value = 0.05)
de.early.vs.late.silique.415 <- decideTestsDGE(lrt.early.vs.late.silique.415, p.value = 0.05)

# One column per transition. data.frame() turns the hyphens in these names into
# dots, and the names end up as facet labels, so the "early.slique" typo is
# corrected to "early.silique" here.
DEGs_number_between_tissue.415 <- data.frame(
  "flowering-vs-young" = as.data.frame(summary(de.young.vs.flowering.415))$Freq,
  "early.silique-vs-flowering" = as.data.frame(summary(de.flowering.vs.early.silique.415))$Freq,
  "late.silique-vs-early.silique" = as.data.frame(summary(de.early.vs.late.silique.415))$Freq
)

# rows are labelled by position (down / no change / up); keep only down and up
rownames(DEGs_number_between_tissue.415) <- c("down", "no", "up")
DEGs_number_between_tissue.415 <- DEGs_number_between_tissue.415[c("down", "up"), ]
DEGs_number_between_tissue.415

# long format: melt() stacks the columns in order, so down/up alternate per transition
DEGs_number_between_tissue.415.melt <- melt(DEGs_number_between_tissue.415)
DEGs_number_between_tissue.415.melt$DE <- rep(c("down", "up"), 3)
colnames(DEGs_number_between_tissue.415.melt) <- c("tissue", "number", "DE")
DEGs_number_between_tissue.415.melt

# reorder: up 1st down 2nd
DEGs_number_between_tissue.415.melt$DE <- factor(DEGs_number_between_tissue.415.melt$DE, levels = c("up", "down"))

DEGs_number_between_tissue.415.melt <- DEGs_number_between_tissue.415.melt[order(DEGs_number_between_tissue.415.melt$DE), ]
DEGs_number_between_tissue.415.melt

### making ggplot for DEGs: one facet per transition, bars for up / down
p.DEGs_number_between_tissue.415 <- ggplot(data = DEGs_number_between_tissue.415.melt)
p.DEGs_number_between_tissue.415 <- p.DEGs_number_between_tissue.415 + geom_bar(mapping = aes(fill = DE, x = factor(DE), y = number), stat = "identity")
p.DEGs_number_between_tissue.415 <- p.DEGs_number_between_tissue.415 + facet_grid(~tissue)
p.DEGs_number_between_tissue.415 <- p.DEGs_number_between_tissue.415 + labs(y = "number of differentially expressed genes", x = "")

p.DEGs_number_between_tissue.415
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/DEGs_number_between_tissue.415.png", width = 11, height = 8)  
### 

```

# venn diagram between different stages of 414 & 415  
```{r}
# calculate overlaps & unique genes (the numbers in the comments were recorded from an earlier run)
dim(DEgene.young.vs.flowering.414) # 29600     5 
dim(DEgene.young.vs.flowering.415) # 27562     5
summary(rownames(DEgene.young.vs.flowering.414) %in% rownames(DEgene.young.vs.flowering.415)) 
#    Mode   FALSE    TRUE    NA's 
# logical    8561   21039       0

# library() instead of require(): fail immediately if the package is missing
library(VennDiagram)

# young vs flowering
# The device is opened first so that grid.newpage() clears the PNG page rather
# than whatever device happened to be active before.
png(filename = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/young.vs.bolting.png")
grid.newpage()
# Areas are computed from the DE tables instead of being copied in by hand, so
# the figure cannot drift out of sync with the data.
venn.plot <- draw.pairwise.venn(
  area1 = nrow(DEgene.young.vs.flowering.414), # group A
  area2 = nrow(DEgene.young.vs.flowering.415), # group B
  cross.area = sum(rownames(DEgene.young.vs.flowering.414) %in% rownames(DEgene.young.vs.flowering.415)), # overlap
  category = c("414", "415"),
  fill = c("lightblue", "pink"),
  lty = "blank",
  cex = 2,
  cat.cex = 1.5,
  cat.pos = 9
)
grid.draw(venn.plot)
dev.off()

# flowering vs early silique (the numbers in the comments were recorded from an earlier run)
dim(DEgene.flowering.vs.early.silique.414) # 15839     5 
dim(DEgene.flowering.vs.early.silique.415) # 12230     5
summary(rownames(DEgene.flowering.vs.early.silique.414) %in% rownames(DEgene.flowering.vs.early.silique.415)) 
#    Mode   FALSE    TRUE    NA's 
# logical    6584    9255       0 

# Venn diagram: flowering vs early silique, 414 against 415
# The device is opened first so that grid.newpage() clears the PNG page rather
# than whatever device happened to be active before.
png(filename = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/flowering.vs.early.silique.png")
grid.newpage()
# Areas are computed from the DE tables instead of being copied in by hand.
venn.plot <- draw.pairwise.venn(
  area1 = nrow(DEgene.flowering.vs.early.silique.414), # group A
  area2 = nrow(DEgene.flowering.vs.early.silique.415), # group B
  cross.area = sum(rownames(DEgene.flowering.vs.early.silique.414) %in% rownames(DEgene.flowering.vs.early.silique.415)), # overlap
  category = c("414", "415"),
  fill = c("lightblue", "pink"),
  lty = "blank",
  cex = 2,
  cat.cex = 1.5,
  cat.pos = 9
)
grid.draw(venn.plot)
dev.off()

# early silique vs late silique (the numbers in the comments were recorded from an earlier run)
dim(DEgene.early.vs.late.silique.414) # 6373    5 
dim(DEgene.early.vs.late.silique.415) # 6979    5
summary(rownames(DEgene.early.vs.late.silique.414) %in% rownames(DEgene.early.vs.late.silique.415)) 
#    Mode   FALSE    TRUE    NA's 
# logical    2652    3721       0

# Venn diagram: early vs late silique, 414 against 415
# The device is opened first so that grid.newpage() clears the PNG page rather
# than whatever device happened to be active before.
png(filename = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/early.vs.late.silique.png")
grid.newpage()
# Areas are computed from the DE tables instead of being copied in by hand.
venn.plot <- draw.pairwise.venn(
  area1 = nrow(DEgene.early.vs.late.silique.414), # group A
  area2 = nrow(DEgene.early.vs.late.silique.415), # group B
  cross.area = sum(rownames(DEgene.early.vs.late.silique.414) %in% rownames(DEgene.early.vs.late.silique.415)), # overlap
  category = c("414", "415"),
  fill = c("lightblue", "pink"),
  lty = "blank",
  cex = 2,
  cat.cex = 1.5,
  cat.pos = 9
)
grid.draw(venn.plot)
dev.off()
```

# GO enrichment analysis 
```{r}
# GO enrichment of the genes differentially expressed between genotypes.
# The helper functions (including GOseq.Bn.ORA) come from the sourced project file.
source("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/analysis/function_BnRNAseq.R")

# For each tissue: run the enrichment on the DE gene IDs, show the result, write it to CSV.

# young
DEgene.young.GO.ORA.F1 <- GOseq.Bn.ORA(rownames(DEgene.young.F1))
DEgene.young.GO.ORA.F1
write.csv(
  DEgene.young.GO.ORA.F1,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.young.GO.ORA.F1.csv"
)

# flowering
DEgene.flowering.GO.ORA.F1 <- GOseq.Bn.ORA(rownames(DEgene.flowering.F1))
DEgene.flowering.GO.ORA.F1
write.csv(
  DEgene.flowering.GO.ORA.F1,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.flowering.GO.ORA.F1.csv"
)

# early silique
DEgene.early.silique.GO.ORA.F1 <- GOseq.Bn.ORA(rownames(DEgene.early.silique.F1))
DEgene.early.silique.GO.ORA.F1
write.csv(
  DEgene.early.silique.GO.ORA.F1,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.early.silique.GO.ORA.F1.csv"
)

# late silique
DEgene.late.silique.GO.ORA.F1 <- GOseq.Bn.ORA(rownames(DEgene.late.silique.F1))
DEgene.late.silique.GO.ORA.F1
write.csv(
  DEgene.late.silique.GO.ORA.F1,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.late.silique.GO.ORA.F1.csv"
)

### between stages
### 414
# For each stage transition: run the enrichment on the DE gene IDs, show the
# result and its row count, then write it to CSV.

# young vs flowering
DEgene.young.vs.flowering.414.GO.ORA <- GOseq.Bn.ORA(rownames(DEgene.young.vs.flowering.414))
DEgene.young.vs.flowering.414.GO.ORA
nrow(DEgene.young.vs.flowering.414.GO.ORA)
write.csv(
  DEgene.young.vs.flowering.414.GO.ORA,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.young.vs.flowering.414.GO.ORA.csv"
)

# flowering vs early silique
# NOTE(review): unlike its siblings, this output file name carries no ".414"
# tag; left unchanged in case something downstream reads it under this name.
DEgene.flowering.vs.early.silique.414.GO.ORA <- GOseq.Bn.ORA(rownames(DEgene.flowering.vs.early.silique.414))
DEgene.flowering.vs.early.silique.414.GO.ORA
nrow(DEgene.flowering.vs.early.silique.414.GO.ORA)
write.csv(
  DEgene.flowering.vs.early.silique.414.GO.ORA,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.flowering.vs.early.silique.GO.ORA.csv"
)

# early vs late silique
DEgene.silique.414.GO.ORA <- GOseq.Bn.ORA(rownames(DEgene.early.vs.late.silique.414))
DEgene.silique.414.GO.ORA
nrow(DEgene.silique.414.GO.ORA)
write.csv(
  DEgene.silique.414.GO.ORA,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.silique.414.GO.ORA.csv"
)

### 415
# For each stage transition: run the enrichment on the DE gene IDs, show the
# result, then write it to CSV.

# young vs flowering
DEgene.young.vs.flowering.415.GO.ORA <- GOseq.Bn.ORA(rownames(DEgene.young.vs.flowering.415))
DEgene.young.vs.flowering.415.GO.ORA
write.csv(
  DEgene.young.vs.flowering.415.GO.ORA,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.young.vs.flowering.415.GO.ORA.csv"
)

# flowering vs early silique
DEgene.flowering.vs.early.silique.415.GO.ORA <- GOseq.Bn.ORA(rownames(DEgene.flowering.vs.early.silique.415))
DEgene.flowering.vs.early.silique.415.GO.ORA
write.csv(
  DEgene.flowering.vs.early.silique.415.GO.ORA,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.flowering.vs.early.silique.415.GO.ORA.csv"
)

# early vs late silique
DEgene.silique.415.GO.ORA <- GOseq.Bn.ORA(rownames(DEgene.early.vs.late.silique.415))
DEgene.silique.415.GO.ORA
write.csv(
  DEgene.silique.415.GO.ORA,
  file = "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/F1/DEgene.silique.415.GO.ORA.csv"
)

```

# draw heatmap for GO term between 414 & 415 
```{r}
# Heatmap of GO-term enrichment (adjusted p-values) across the four tissues.
# Keep only the term name and the adjusted over-representation p-value per tissue.
GO.young.F1 <- DEgene.young.GO.ORA.F1[, c("Term", "over_represented_padjust")]
GO.flowering.F1 <- DEgene.flowering.GO.ORA.F1[, c("Term", "over_represented_padjust")]
GO.early.silique.F1 <- DEgene.early.silique.GO.ORA.F1[, c("Term", "over_represented_padjust")]
GO.late.silique.F1 <- DEgene.late.silique.GO.ORA.F1[, c("Term", "over_represented_padjust")]
GO.late.silique.F1

# full outer joins on Term: a term missing from a tissue gets NA there
GO.F1.gt.1 <- merge(GO.young.F1, GO.flowering.F1, by = "Term", all = TRUE)
GO.F1.gt.1
GO.F1.gt.2 <- merge(GO.F1.gt.1, GO.early.silique.F1, by = "Term", all = TRUE)
GO.F1.gt.2
# rename the three p-value columns before the last merge so the names stay unambiguous
names(GO.F1.gt.2)[c(2:4)] <- c("young", "flowering", "early_silique")
GO.F1.gt <- merge(GO.F1.gt.2, GO.late.silique.F1, by = "Term", all = TRUE)
GO.F1.gt
names(GO.F1.gt)[5] <- "late_silique"
# long format; the id column is named explicitly instead of being guessed by melt()
GO.F1.gt.melt <- melt(GO.F1.gt, id.vars = "Term")
GO.F1.gt.melt
GO.F1.gt.melt$logPvalue <- -log10(GO.F1.gt.melt$value)

# plot
# muted() lives in the scales package, which this file never attaches; it is
# called with its namespace so the chunk does not depend on some other script
# having loaded scales.
pl.heatmap <- ggplot(data = GO.F1.gt.melt)
pl.heatmap <- pl.heatmap + geom_tile(color = "black", aes(x = factor(variable), y = Term, fill = logPvalue)) + scale_fill_gradient2(low = scales::muted("green"), high = scales::muted("magenta"))
pl.heatmap <- pl.heatmap + labs(y = "GO term", x = "", title = " ")
pl.heatmap
ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/GO.F1.gt.png", width = 8, height = 15)

```


# draw figure for fatty acid genes expression over time for 414 & 415 
```{r}
# Expression of fatty-acid genes over the tissues/stages for 414 and 415.
# The plotting helper expression.pattern.Bn.F1.with.annot comes from the sourced file.
source("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/analysis/function_BnRNAseq.R")

# annotation table without a header row; its first column (V1) becomes the row names
fatty.acid.genes.annotation <- read.csv(
  "/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/fatty_acid_gene_name.csv",
  header = FALSE
)
row.names(fatty.acid.genes.annotation) <- fatty.acid.genes.annotation[["V1"]]
fatty.acid.genes.interaction.F1.ID <- row.names(fatty.acid.genes.annotation)

# all annotated genes
expression.pattern.Bn.F1.with.annot(fatty.acid.genes.interaction.F1.ID, fatty.acid.genes.annotation)

ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/fatty.acid.interaction.F1.png", width = 8, height = 20)

# three selected genes only
three.fatty.acid.genes <- c("BnaA05g06830D", "BnaA10g09300D", "BnaC09g50630D")
expression.pattern.Bn.F1.with.annot(three.fatty.acid.genes, fatty.acid.genes.annotation)
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/fatty.acid.three.F1.png", width = 11, height = 8) 
```

### other important genes 
# Acetyl-CoA-ACP-transacetylase: GO:0004313 none 
# B-Ketoacyl-ACP-synthase: IPR014030 --> A 
# Malonyl-CoA-ACP-transacetylase: IPR016036 --> B  
# B-ketoacyl-ACP-reductase: GO:0004316 --> C 
# Enoyl-ACP-hydrase: IPR018376 --> D
# Enoyl-ACP-reductase: GO:0016631 & GO:0004319 none 
# Palmitoyl-thioesterase: IPR002472 --> E 
```{r}
# B-Ketoacyl-ACP-synthase
# gene ID list without a header row, plotted with the project helper expression.pattern.Bn.F1
A.ID <- read.delim("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/B-Ketoacyl-ACP-synthase", header = FALSE)
p.A <- expression.pattern.Bn.F1(A.ID)
# Add the labels to p.A itself. The previous code used `p` (the MDS plot from an
# earlier chunk), which replaced the expression plot with a relabelled MDS plot.
p.A <- p.A + labs(y = "mean expression value", x = "tissue", title = "B-Ketoacyl-ACP-synthase")
p.A 
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/5f.png", width = 8, height = 20)

# Each section below reads a gene ID list (no header row), plots it with the
# project helper expression.pattern.Bn.F1, and adds axis labels and a title.

# Malonyl-CoA-ACP-transacetylase
B.ID <- read.delim("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/Malonyl-CoA-ACP-transacetylase", header = FALSE)
dim(B.ID)
p.B <- expression.pattern.Bn.F1(B.ID) +
  labs(y = "mean expression value", x = "tissue", title = "Malonyl-CoA-ACP-transacetylase")
p.B
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/5g.png", width = 8, height = 10)

# B-ketoacyl-ACP-reductase: GO:0004316 --> C
C.ID <- read.delim("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/B-ketoacyl-ACP-reductase", header = FALSE)
dim(C.ID)
p.C <- expression.pattern.Bn.F1(C.ID) +
  labs(y = "mean expression value", x = "tissue", title = "B-ketoacyl-ACP-reductase")
p.C
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/5h.png", width = 8, height = 8)

# Enoyl-ACP-hydrase: IPR018376 --> D
D.ID <- read.delim("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/Enoyl-ACP-hydrase", header = FALSE)
dim(D.ID)
p.D <- expression.pattern.Bn.F1(D.ID) +
  labs(y = "mean expression value", x = "tissue", title = "Enoyl-ACP-hydrase")
p.D
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/5i.png", width = 8, height = 20)

# Palmitoyl-thioesterase: IPR002472 --> E
E.ID <- read.delim("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/data/Palmitoyl-thioesterase", header = FALSE)
dim(E.ID)
p.E <- expression.pattern.Bn.F1(E.ID) +
  labs(y = "mean expression value", x = "tissue", title = "Palmitoyl-thioesterase")
p.E
# ggsave("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/figure/F1/5j.png", width = 8, height = 20) 
``` 

# FLC gene expression 
```{r}
# FLC gene IDs (file without a header row); the column is named V1
FLC <- read.delim("~/Desktop/Brassica_project/KIAT_RNA_seq/FLC_gene/FLC_napus", header = FALSE)
colnames(FLC) <- "V1"
FLC
# look up one FLC gene in the filtered count table
F1.read.count.small["BnaC09g46500D", ]
# 1 BnaA02g00370D 
# 2 BnaA03g02820D  
# 3 BnaA10g22080D existed 
# 4 BnaC02g00490D
# 5 BnaC03g04170D
# 6 BnaC09g46500D existed 

# expression pattern plot for the FLC genes, saved to the figure directory
FLC.expression.F1 <- expression.pattern.Bn.F1(FLC)
FLC.expression.F1
ggsave(filename = "~/Desktop/Brassica_project/KIAT_RNA_seq/figure/FLC_414_415.png", plot = FLC.expression.F1, width = 4, height = 4)

# pairwise expression pattern: is BnaA10g22080D among the between-genotype DE genes of each tissue?
DEgene.young.F1["BnaA10g22080D", ]
DEgene.flowering.F1["BnaA10g22080D", ]
DEgene.early.silique.F1["BnaA10g22080D", ]
DEgene.late.silique.F1["BnaA10g22080D", ]


# NOTE(review): FLC.like.act.through.DE.gt, FLC.like.act.through and DEgene.young
# are not defined anywhere in this file; presumably they come from another
# script or session. Confirm before running these two lines.
expression.pattern.Bn.F1(FLC.like.act.through.DE.gt)

DEgene.young[rownames(DEgene.young) %in% FLC.like.act.through$V1, ]
```

# new design for gt effect 
```{r}
# new design to see how many fatty acid genes' expression are differentially expressed between genotypes and during developmental stages
source("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/analysis/function_BnRNAseq.R")
# assign group
ftable(F1.read.count.sample, row.vars = c("tissue"), col.vars = c("batch", "genotype"))

# normalize
dge.new.F1 <- DGEList(counts = F1.read.count.small, group = F1.read.count.sample$group)
dim(dge.new.F1) # [1] 360176    24
dge.new.F1 <- calcNormFactors(dge.new.F1, method = "TMM")
# Spell the component name out in full instead of relying on `$` partial matching.
dge.new.F1$samples

# design matrix: 414F1 and Young are the reference levels, so every
# coefficient is expressed relative to young 414F1 tissue
F1.read.count.sample$genotype <- as.factor(F1.read.count.sample$genotype)
F1.read.count.sample$tissue <- as.factor(F1.read.count.sample$tissue)
F1.read.count.sample$genotype <- relevel(F1.read.count.sample$genotype, ref = "414F1")
F1.read.count.sample$tissue <- relevel(F1.read.count.sample$tissue, ref = "Young")

design.new.F1 <- model.matrix(~tissue*genotype, data = F1.read.count.sample)
colnames(design.new.F1)

# calculate dispersion
dge.new.F1 <- estimateGLMCommonDisp(dge.new.F1, design.new.F1, verbose = TRUE) # Disp = 0.25646 , BCV = 0.5064
dge.new.F1 <- estimateGLMTrendedDisp(dge.new.F1, design.new.F1)
dge.new.F1 <- estimateGLMTagwiseDisp(dge.new.F1, design.new.F1)
# plotBCV(dge.new)

fit.new.F1 <- glmFit(dge.new.F1, design.new.F1)

# genes for gt: test the genotype main effect together with every
# tissue:genotype interaction term
lrt.new.gt.F1 <- glmLRT(
  fit.new.F1,
  coef = c(
    "genotype415F1",
    "tissueearly-silique:genotype415F1",
    "tissueflowering:genotype415F1",
    "tissuelate-silique:genotype415F1"
  )
)
topTags(lrt.new.gt.F1)
# Build the full result table once and reuse it for the FDR filter.
gt.table.F1 <- topTags(lrt.new.gt.F1, n = Inf)$table
DEgene.new.gt.F1 <- gt.table.F1[gt.table.F1$FDR < 0.05, ]
nrow(DEgene.new.gt.F1) # number of genes for gt effect # 15098

# IDs of the genotype-effect genes that are also listed in FLC.like.act.through$V1.
# NOTE(review): FLC.like.act.through is not created in this chunk -- verify it is loaded.
gt.gene.ids <- rownames(DEgene.new.gt.F1)
FLC.like.act.through.DE.gt.F1 <- as.data.frame(gt.gene.ids[gt.gene.ids %in% FLC.like.act.through$V1])
names(FLC.like.act.through.DE.gt.F1) <- "V1"
FLC.like.act.through.DE.gt.F1

FLC.2 <- expression.pattern.Bn.F1(FLC.like.act.through.DE.gt.F1)
ggsave(FLC.2, filename = "~/Desktop/FLC_F1.png", width = 6, height = 7)
```

# trimming & mapping 
```{r}
# using trimming_mapping_01_23_2017.sh
library(ggplot2)

# STAR mapping statistics, one row per sample.
# NOTE(review): sample_des.F1 and melt() are not defined/loaded in this chunk --
# presumably they come from earlier chunks (melt from reshape2?); verify.
mapping.result <- read.table("~/Desktop/Brassica_project/KIAT_RNA_seq/F1/data/Star_Stats_1.tab", header = TRUE)
colnames(mapping.result)
sample_des.F1.2 <- sample_des.F1[, c("Sample.ID", "TotalReads", "cultivar", "Stage")]

colnames(sample_des.F1.2)[1] <- "Sample"
sample_des.F1.2
# Attach the sample description to the mapping statistics, then inspect the result
# (the merged table has to be created before it can be printed).
mapping.result.3 <- merge(mapping.result, sample_des.F1.2, by = "Sample")
mapping.result.3

# Histogram of the percentage of uniquely mapped reads, one panel per cultivar.
# Labels are passed to labs() as named arguments; wrapping them in list() is
# not picked up by current ggplot2 versions.
p.uniq.mapping <- ggplot(data = mapping.result.3) +
  geom_histogram(aes(x = Percent_Unique_Mapped, fill = cultivar), binwidth = 2.5) +
  facet_wrap(~cultivar, nrow = 2) +
  labs(title = "", x = "Percent of uniquely mapped reads", y = "number of samples")
p.uniq.mapping
# NOTE(review): this F1 figure is written under the F2 directory with an F2 file name -- confirm the intended path.
ggsave(p.uniq.mapping, filename = "~/Desktop/Brassica_project/KIAT_RNA_seq/F2/F2.uniq.mapping.1.png", height = 8, width = 11)

colnames(mapping.result.3)
# TotalReads is halved to get the per-sample read number used below
# (presumably because both mates of a pair are counted -- TODO confirm).
mapping.result.3$Reads <- mapping.result.3$TotalReads / 2

mapping.result.4 <- mapping.result.3[, c("Sample", "Number_Input_Reads", "Number_Unique_Mapped", "cultivar", "Stage", "Reads")]
# Fraction of raw reads that reached the mapper.
mapping.result.3$Percent_HQ_reads <- mapping.result.3$Number_Input_Reads / mapping.result.3$Reads
mean(mapping.result.3$Percent_Unique_Mapped)
mean(mapping.result.3$Percent_HQ_reads)

# Reads that did not reach the mapper, and mapper input reads that were not uniquely mapped.
mapping.result.4$trimmed.off <- mapping.result.4$Reads - mapping.result.4$Number_Input_Reads
mapping.result.4$multiple.mapped <- mapping.result.4$Number_Input_Reads - mapping.result.4$Number_Unique_Mapped
mapping.result.5 <- mapping.result.4[, c("Sample", "trimmed.off", "multiple.mapped", "Number_Unique_Mapped", "cultivar", "Stage")]

mapping.result.5.melt <- melt(mapping.result.5)
mapping.result.5.melt

# Long table of raw / HQ / uniquely mapped read numbers for the bar plot;
# the new column names become the facet labels.
mapping.result.4.1 <- mapping.result.3[, c("Sample", "Number_Input_Reads", "Number_Unique_Mapped", "cultivar", "Stage", "Reads")]
colnames(mapping.result.4.1)[c(2, 3, 6)] <- c("HQ_reads", "uniquely_mapped_reads", "raw_reads")
mapping.result.4.1.melt <- melt(mapping.result.4.1)

pl.mapping.rate <- ggplot(data = mapping.result.4.1.melt) +
  geom_bar(aes(x = Sample, y = value, fill = cultivar), stat = "identity") +
  labs(title = "", x = " ", y = "reads number") +
  theme(axis.text.x = element_blank()) +
  facet_wrap(~variable, ncol = 3)
pl.mapping.rate

ggsave(pl.mapping.rate, filename = "~/Desktop/Brassica_project/KIAT_RNA_seq/F1/figure/mapping.rate.png", height = 6, width = 11)
```

# call SNPs between 414 & 415 F1 population 
```{r}
# using SNP_calling_F1_young.sh 
# 1) trim & map fastq file using script trimming_mapping_01_23_2017.sh 

# 2) merge bam files from biological replicates 

# 3) extract uniquely mapped file, sort, and convert to bam file 

# 4) call SNPs for each tissue type separately, including Da-Ae & Da-Ol 
```

# examine F1 dataset to study gene imprinting effect 
```{r}
# ./process_vcf.sh input.vcf input_modified.vcf # to make the right format of vcf file
vcf.F1 <- read.table(
  "~/2017_winter/F1_SNP/F1_young_modified.vcf",
  as.is = TRUE,
  na.strings = "."
)

dim(vcf.F1) # 2543959      13

# Pull the "#C..." header line out of the VCF and turn it into column names.
vcf.header.F1 <- system("grep '#C' ~/2017_winter/F1_SNP/F1_young_modified.vcf", intern = TRUE)
vcf.header.F1
vcf.header.F1 <- sub("#", "", vcf.header.F1) # get rid of the pound sign
vcf.header.F1

vcf.header.F1 <- unlist(strsplit(vcf.header.F1, split = "\t"))
vcf.header.F1
colnames(vcf.F1) <- vcf.header.F1
head(vcf.F1) # why no print out???

system("grep '##INFO' ~/2017_winter/F1_SNP/F1_young_modified.vcf")
system("grep '##FORMAT' ~/2017_winter/F1_SNP/F1_young_modified.vcf")

# Names given to the seven ":"-separated sub-fields of every sample column.
gt.fields <- c("gt", "tot.depth", "ref.depth", "ref.qual", "alt.depth", "alt.qual", "gen.lik")

# Before splitting add NAs to blank cells, so every record splits into seven pieces
vcf.F1$Ae

# Da-Ae
vcf.F1$Ae[is.na(vcf.F1$Ae)] <- "NA:NA:NA:NA:NA:NA:NA"
Ae.tmp.unique <- matrix(
  unlist(strsplit(vcf.F1$Ae, split = ":")),
  byrow = TRUE,
  nrow = nrow(vcf.F1)
)
head(Ae.tmp.unique)
colnames(Ae.tmp.unique) <- paste("Ae", gt.fields, sep = "_")

# Da-Ol
vcf.F1$Ol[is.na(vcf.F1$Ol)] <- "NA:NA:NA:NA:NA:NA:NA"
Ol.tmp.unique <- matrix(
  unlist(strsplit(vcf.F1$Ol, split = ":")),
  byrow = TRUE,
  nrow = nrow(vcf.F1)
)
head(Ol.tmp.unique)
colnames(Ol.tmp.unique) <- paste("Ol", gt.fields, sep = "_")

# 414
vcf.F1$`414F1_young`[is.na(vcf.F1$`414F1_young`)] <- "NA:NA:NA:NA:NA:NA:NA"
F1_young_414.tmp.unique <- matrix(
  unlist(strsplit(vcf.F1$`414F1_young`, split = ":")),
  byrow = TRUE,
  nrow = nrow(vcf.F1)
)
head(F1_young_414.tmp.unique)
colnames(F1_young_414.tmp.unique) <- paste("414F1_young", gt.fields, sep = "_")

# 415
vcf.F1$`415F1_young`[is.na(vcf.F1$`415F1_young`)] <- "NA:NA:NA:NA:NA:NA:NA"
F1_young_415.tmp.unique <- matrix(
  unlist(strsplit(vcf.F1$`415F1_young`, split = ":")),
  byrow = TRUE,
  nrow = nrow(vcf.F1)
)
head(F1_young_415.tmp.unique)
colnames(F1_young_415.tmp.unique) <- paste("415F1_young", gt.fields, sep = "_")

######
# Append the split sub-fields to the VCF table as character columns.
vcf.F1 <- cbind(
  vcf.F1,
  Ae.tmp.unique,
  Ol.tmp.unique,
  F1_young_414.tmp.unique,
  F1_young_415.tmp.unique,
  stringsAsFactors = FALSE
)
summary(vcf.F1)
head(vcf.F1)

# Depth and quality sub-fields (everything except gt and gen.lik) become numeric.
depth.qual.cols <- paste(
  rep(c("Ae", "Ol", "414F1_young", "415F1_young"), each = 5),
  gt.fields[2:6],
  sep = "_"
)
vcf.F1[, depth.qual.cols] <- apply(vcf.F1[, depth.qual.cols], 2, as.numeric)
summary(vcf.F1)
head(vcf.F1)
dim(vcf.F1) # 2543959      41
# save(vcf.F1, file = "~/Desktop/Brassica_project/KIAT_RNA_seq/F1/output/vcf.F1.young.Rdata")

######### 

### 1) filter based on QUAL score
# load("~/Desktop/Brassica_project/KIAT_RNA_seq/F1/output/vcf.F1.young.Rdata")
# make a histogram of QUAL scores (all sites)
hist(vcf.F1$QUAL)

length(vcf.F1$QUAL) # a total of 2543959 SNPs 
sum(vcf.F1$QUAL>20) / length(vcf.F1$QUAL) # fraction of sites with QUAL > 20; 73.1% recorded. QUAL 20 means less than 0.01 probability that the site isn't polymorphic 

# zoom in on the low-quality tail: QUAL below 20, then QUAL below 1
hist(vcf.F1[vcf.F1$QUAL<20,]$QUAL)
hist(vcf.F1[vcf.F1$QUAL<1,]$QUAL)

# subset the data to keep positions where the quality score is above 40 (strictly greater than 40) 
vcf.F1.HQ <- vcf.F1[vcf.F1$QUAL>40,]
dim(vcf.F1.HQ) # 1628615      41  
sum(vcf.F1$QUAL>40) / length(vcf.F1$QUAL) # fraction retained with QUAL > 40; 1628615 / 2543959 (counts recorded above) is about 64%

# count the number of calls per genotype class in the two parents
table(vcf.F1.HQ$Ae_gt)
table(vcf.F1.HQ$Ol_gt)

# Per-stage genotype counts for the two F1s. Each table gets a "type" column
# (414 / 415) and a "class" column (tissue stage) so the tables can be stacked
# and facetted.
# NOTE(review): vcf.F1.HQ.filtered.1, vcf.F1.HQ.filtered.1.flowering and
# vcf.F1.HQ.filtered.1.early.silique are created in later chunks, so this
# section only runs after those chunks have been executed.
young.414 <- as.data.frame(table(vcf.F1.HQ.filtered.1$`414F1_young_gt`))
young.414$type <- rep("414", nrow(young.414))
young.414$class <- rep("young", nrow(young.414))
young.415 <- as.data.frame(table(vcf.F1.HQ.filtered.1$`415F1_young_gt`))
young.415$type <- rep("415", nrow(young.415))
young.415$class <- rep("young", nrow(young.415))
young.414

early.silique.414 <- as.data.frame(table(vcf.F1.HQ.filtered.1.early.silique$`414F1_gt`))
# nrow(), not length(): length() of a data frame is its number of columns.
early.silique.414$type <- rep("414", nrow(early.silique.414))
early.silique.414$class <- rep("early.silique", nrow(early.silique.414))
early.silique.414
early.silique.415 <- as.data.frame(table(vcf.F1.HQ.filtered.1.early.silique$`415F1_gt`))
early.silique.415$type <- rep("415", nrow(early.silique.415))
early.silique.415$class <- rep("early.silique", nrow(early.silique.415))

flowering.414 <- as.data.frame(table(vcf.F1.HQ.filtered.1.flowering$`414F1_gt`))
flowering.414$type <- rep("414", nrow(flowering.414))
flowering.414$class <- rep("flowering", nrow(flowering.414))

flowering.415 <- as.data.frame(table(vcf.F1.HQ.filtered.1.flowering$`415F1_gt`))
flowering.415$type <- rep("415", nrow(flowering.415))
flowering.415$class <- rep("flowering", nrow(flowering.415))

F1.gt <- rbind(young.414, young.415, flowering.414, flowering.415, early.silique.414, early.silique.415)
# Keep only the three diploid genotype classes; the column is addressed by its
# full name (Var1) rather than through `$` partial matching.
F1.gt.2 <- F1.gt[F1.gt$Var1 %in% c("0/0", "0/1", "1/1"), ]

# plot: number of loci per genotype, one panel per stage x F1
plot.F1.gt <- ggplot(data = F1.gt.2) +
  geom_bar(mapping = aes(fill = Var1, x = factor(Var1), y = Freq), stat = "identity") +
  facet_grid(class~type) +
  labs(y = "number of loci", x = "genotype")
plot.F1.gt
# Pass the plot explicitly so the saved figure does not depend on last_plot().
ggsave(
  "/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/2017_winter/F1_SNP/figure/F1.gt.png",
  plot = plot.F1.gt,
  width = 6,
  height = 4
)

head(vcf.F1)
sum(is.na(vcf.F1$CHROM))
# save(vcf.F1.HQ, file = "~/Desktop/Brassica_project/KIAT_RNA_seq/F1/output/vcf.F1.young.HQ.Rdata")
```

# young tissue 
# filter based on read depth, extract the useful SNPs based on genotype, do binomial test...  
```{r}
# 1) filter based on read depth
# total depth must be > 3 and < 501 in each of the four samples
# load("~/Desktop/Brassica_project/KIAT_RNA_seq/F1/output/vcf.F1.young.HQ.Rdata")

vcf.F1.HQ.filtered <- with(
  vcf.F1.HQ,
  vcf.F1.HQ[which(
    Ae_tot.depth > 3 & Ae_tot.depth < 501 &
      Ol_tot.depth > 3 & Ol_tot.depth < 501 &
      `414F1_young_tot.depth` > 3 & `414F1_young_tot.depth` < 501 &
      `415F1_young_tot.depth` > 3 & `415F1_young_tot.depth` < 501
  ), ]
)

max(vcf.F1.HQ.filtered$`415F1_young_tot.depth`)

dim(vcf.F1.HQ) # 1628615      41
dim(vcf.F1.HQ.filtered) # 459820     41
head(vcf.F1.HQ.filtered)
vcf.F1.HQ.filtered$CHROM

nrow(vcf.F1.HQ.filtered) / nrow(vcf.F1.HQ) # 28%

# 2) extract the most useful SNPs based on genotype
# 1) SNPs between the two parents: one parent 1/1 and the other 0/0
vcf.F1.HQ.filtered.1 <- with(
  vcf.F1.HQ.filtered,
  vcf.F1.HQ.filtered[
    (Ae_gt == "1/1" & Ol_gt == "0/0") | (Ae_gt == "0/0" & Ol_gt == "1/1"),
  ]
)
dim(vcf.F1.HQ.filtered.1) # 127019 41
nrow(vcf.F1.HQ.filtered.1) / nrow(vcf.F1.HQ.filtered) # 127019 / 459820 (counts recorded above) is about 28%
ftable(vcf.F1.HQ.filtered.1[, c("414F1_young_gt", "415F1_young_gt", "Ae_gt", "Ol_gt")]) # need stats for this...

# 3) now take a look at the SNPs for F1s, firstly check the ones that were
# determined as homozygous in at least one of the two F1s
vcf.F1.HQ.filtered.1.2 <- with(
  vcf.F1.HQ.filtered.1,
  vcf.F1.HQ.filtered.1[
    `414F1_young_gt` == "1/1" | `414F1_young_gt` == "0/0" |
      `415F1_young_gt` == "1/1" | `415F1_young_gt` == "0/0",
  ]
)

head(vcf.F1.HQ.filtered.1.2)
vcf.F1.HQ.filtered.1.2[, c(
  "Ae_gt", "Ol_gt",
  "414F1_young_gt", "414F1_young_ref.depth", "414F1_young_alt.depth",
  "415F1_young_gt", "415F1_young_ref.depth", "415F1_young_alt.depth"
)]

# calculate how many SNPs are already counted as homozygous in F1
table(vcf.F1.HQ.filtered.1.2$`414F1_young_gt`)
table(vcf.F1.HQ.filtered.1.2$`415F1_young_gt`)
ftable(vcf.F1.HQ.filtered.1.2[, c("414F1_young_gt", "415F1_young_gt", "Ae_gt", "Ol_gt")])

# count SNPs between 414F1 & 415F1: drop the sites where both F1s carry the
# same homozygous call
dim(vcf.F1.HQ.filtered.1.2) # 34394
vcf.F1.HQ.filtered.1.2.3 <- with(
  vcf.F1.HQ.filtered.1.2,
  vcf.F1.HQ.filtered.1.2[
    !((`414F1_young_gt` == "0/0" & `415F1_young_gt` == "0/0") |
      (`414F1_young_gt` == "1/1" & `415F1_young_gt` == "1/1")),
  ]
)
dim(vcf.F1.HQ.filtered.1.2.3) # 23589
head(vcf.F1.HQ.filtered.1.2.3)
head(vcf.F1.HQ.filtered.1.2.3[, c("414F1_young_gt", "415F1_young_gt")], 100)

ftable(vcf.F1.HQ.filtered.1.2.3[, c("414F1_young_gt", "415F1_young_gt")])


```

# header for flowering, early silique and late silique stage 
```{r}
# Grab the "#C..." column-header line of the flowering VCF as a character
# string; it is handed to reformat.vcf.F1() in the reformat chunks below.
vcf.header.F1 <- system("grep '#C' ~//2017_winter/F1_SNP/F1_flowering_modified_2.vcf",intern = TRUE)
vcf.header.F1
```

# flowering tissue: reformat vcf file  
```{r}
# Load the helper functions (reformat.vcf.F1 is used below).
source("~/KIAT/function_BnRNAseq.R")
# change 414F1_stage to 414F1; 415F1_stage to 415F1 in vim, keep the original vcf file in case sth is wrong...
# ./process_vcf.sh input.vcf input_modified.vcf # to make the right format of vcf file
vcf.F1.flowering <- read.table(
  "~/2017_winter/F1_SNP/F1_flowering_modified_2.vcf",
  as.is = TRUE,
  na.strings = "."
)

# Reformat the raw table using the header line grabbed earlier.
vcf.F1.flowering <- reformat.vcf.F1(vcf.F1.flowering, vcf.header.F1)
head(vcf.F1.flowering)
```

# flowering tissue analysis 
```{r}
#########
### 1) filter based on QUAL score

# make a histogram of QUAL scores
hist(vcf.F1.flowering$QUAL)

length(vcf.F1.flowering$QUAL) # a total of 2559294 SNPs
sum(vcf.F1.flowering$QUAL>20) / length(vcf.F1.flowering$QUAL) # 72% are above QUAL score of 20, which means less than 0.01 probability that it isn't polymorphic

# zoom in on the low-quality tail
hist(vcf.F1.flowering[vcf.F1.flowering$QUAL<20,]$QUAL)
hist(vcf.F1.flowering[vcf.F1.flowering$QUAL<1,]$QUAL)

# subset the data to keep positions where the quality score is above 40
vcf.F1.HQ.flowering <- vcf.F1.flowering[vcf.F1.flowering$QUAL>40,]
dim(vcf.F1.HQ.flowering) # 1623025      41
sum(vcf.F1.flowering$QUAL>40) / length(vcf.F1.flowering$QUAL) # 63% of SNPs were retained with QUAL > 40

# count the number of calls per genotype class
table(vcf.F1.HQ.flowering$Ae_gt)
table(vcf.F1.HQ.flowering$Ol_gt)
table(vcf.F1.flowering$`414F1_gt`)
table(vcf.F1.flowering$`415F1_gt`)

head(vcf.F1.flowering)
sum(is.na(vcf.F1.flowering$CHROM))

# 1) filter based on read depth: worked till here...
# total depth must be > 3 and < 501 in each of the four samples
vcf.F1.HQ.filtered.flowering <- vcf.F1.HQ.flowering[which(
  vcf.F1.HQ.flowering$Ae_tot.depth > 3 & vcf.F1.HQ.flowering$Ae_tot.depth < 501 &
    vcf.F1.HQ.flowering$Ol_tot.depth > 3 & vcf.F1.HQ.flowering$Ol_tot.depth < 501 &
    vcf.F1.HQ.flowering$`414F1_tot.depth` > 3 & vcf.F1.HQ.flowering$`414F1_tot.depth` < 501 &
    vcf.F1.HQ.flowering$`415F1_tot.depth` > 3 & vcf.F1.HQ.flowering$`415F1_tot.depth` < 501
), ]

max(vcf.F1.HQ.filtered.flowering$`415F1_tot.depth`)

# (merge conflict resolved: both sides carried the same code)
dim(vcf.F1.HQ.flowering) # 1623025      41
dim(vcf.F1.HQ.filtered.flowering) # 442740     41
head(vcf.F1.HQ.filtered.flowering)
vcf.F1.HQ.filtered.flowering$CHROM

nrow(vcf.F1.HQ.filtered.flowering)/nrow(vcf.F1.HQ.flowering) # 27%

# 2) extract the most useful SNPs based on genotype
# 1) SNPs between the two parents: one parent 1/1 and the other 0/0
vcf.F1.HQ.filtered.1.flowering <- vcf.F1.HQ.filtered.flowering[(
  (vcf.F1.HQ.filtered.flowering$Ae_gt=="1/1" & vcf.F1.HQ.filtered.flowering$Ol_gt=="0/0") |
    (vcf.F1.HQ.filtered.flowering$Ae_gt=="0/0" & vcf.F1.HQ.filtered.flowering$Ol_gt=="1/1")
), ]
dim(vcf.F1.HQ.filtered.1.flowering) # 124314     41
nrow(vcf.F1.HQ.filtered.1.flowering)/nrow(vcf.F1.HQ.filtered.flowering) # 28%
ftable(vcf.F1.HQ.filtered.1.flowering[,c("414F1_gt","415F1_gt", "Ae_gt", "Ol_gt")]) # need stats for this...

# 3) now take a look at the SNPs for F1s, firstly check the ones that were
# determined as homozygous in at least one of the two F1s
vcf.F1.HQ.filtered.1.2.flowering <- vcf.F1.HQ.filtered.1.flowering[(
  vcf.F1.HQ.filtered.1.flowering$`414F1_gt`=="1/1" | vcf.F1.HQ.filtered.1.flowering$`414F1_gt`=="0/0" |
    vcf.F1.HQ.filtered.1.flowering$`415F1_gt`=="1/1" | vcf.F1.HQ.filtered.1.flowering$`415F1_gt`=="0/0"
), ]

head(vcf.F1.HQ.filtered.1.2.flowering)
vcf.F1.HQ.filtered.1.2.flowering[,c("Ae_gt", "Ol_gt", "414F1_gt", "414F1_ref.depth", "414F1_alt.depth", "415F1_gt", "415F1_ref.depth", "415F1_alt.depth")]

# calculate how many SNPs are already counted as homozygous in F1
table(vcf.F1.HQ.filtered.1.2.flowering$`414F1_gt`)
table(vcf.F1.HQ.filtered.1.2.flowering$`415F1_gt`)
ftable(vcf.F1.HQ.filtered.1.2.flowering[,c("414F1_gt","415F1_gt", "Ae_gt", "Ol_gt")])

# count SNPs between 414F1 & 415F1: drop the sites where both F1s carry the
# same homozygous call
dim(vcf.F1.HQ.filtered.1.2.flowering) # 34453    41
vcf.F1.HQ.filtered.1.2.3.flowering <- vcf.F1.HQ.filtered.1.2.flowering[!(
  (vcf.F1.HQ.filtered.1.2.flowering$`414F1_gt`=="0/0" & vcf.F1.HQ.filtered.1.2.flowering$`415F1_gt`=="0/0") |
    (vcf.F1.HQ.filtered.1.2.flowering$`414F1_gt`=="1/1" & vcf.F1.HQ.filtered.1.2.flowering$`415F1_gt`=="1/1")
), ]
dim(vcf.F1.HQ.filtered.1.2.3.flowering) # 24214    41

ftable(vcf.F1.HQ.filtered.1.2.3.flowering[,c("414F1_gt", "415F1_gt")])
```

# early silique tissue 
# early silique tissue: reformat vcf file  
```{r}
# Merge conflict resolved in favour of the side whose input path matches the
# flowering chunk (~/2017_winter/F1_SNP); reformat.vcf.F1 comes from the
# function file sourced in the flowering reformat chunk.
# change 414F1_stage to 414F1; 415F1_stage to 415F1 in vim, keep the original vcf file in case sth is wrong...  
# ./process_vcf.sh input.vcf input_modified.vcf # to make the right format of vcf file 
vcf.F1.early.silique <- read.table("~/2017_winter/F1_SNP/F1_early_silique_modified.vcf", as.is = TRUE, na.strings = ".")

vcf.F1.early.silique <- reformat.vcf.F1(vcf.F1.early.silique, vcf.header.F1) 
head(vcf.F1.early.silique)  
```

# early silique tissue analysis 
```{r}
#########
### 1) filter based on QUAL score
# (merge conflicts in this chunk are resolved in favour of the side that
# consistently refers to the early-silique objects and their recorded counts)

# make a histogram of QUAL scores
hist(vcf.F1.early.silique$QUAL)

length(vcf.F1.early.silique$QUAL) # a total of 2608310 SNPs
sum(vcf.F1.early.silique$QUAL>20) / length(vcf.F1.early.silique$QUAL) # 75% are above QUAL score of 20, which means less than 0.01 probability that it isn't polymorphic

# zoom in on the low-quality tail
hist(vcf.F1.early.silique[vcf.F1.early.silique$QUAL<20,]$QUAL)
hist(vcf.F1.early.silique[vcf.F1.early.silique$QUAL<1,]$QUAL)

# subset the data to keep positions where the quality score is above 40
vcf.F1.HQ.early.silique <- vcf.F1.early.silique[vcf.F1.early.silique$QUAL>40,]
dim(vcf.F1.HQ.early.silique) # 1680924      41
sum(vcf.F1.early.silique$QUAL>40) / length(vcf.F1.early.silique$QUAL) # 64% of SNPs were retained with QUAL > 40

# count the number of calls per genotype class
table(vcf.F1.HQ.early.silique$Ae_gt)
table(vcf.F1.HQ.early.silique$Ol_gt)
table(vcf.F1.early.silique$`414F1_gt`)
table(vcf.F1.early.silique$`415F1_gt`)

head(vcf.F1.early.silique)
sum(is.na(vcf.F1.early.silique$CHROM))

# 1) filter based on read depth: worked till here...
# total depth must be > 3 and < 501 in each of the four samples
vcf.F1.HQ.filtered.early.silique <- vcf.F1.HQ.early.silique[which(
  vcf.F1.HQ.early.silique$Ae_tot.depth > 3 & vcf.F1.HQ.early.silique$Ae_tot.depth < 501 &
    vcf.F1.HQ.early.silique$Ol_tot.depth > 3 & vcf.F1.HQ.early.silique$Ol_tot.depth < 501 &
    vcf.F1.HQ.early.silique$`414F1_tot.depth` > 3 & vcf.F1.HQ.early.silique$`414F1_tot.depth` < 501 &
    vcf.F1.HQ.early.silique$`415F1_tot.depth` > 3 & vcf.F1.HQ.early.silique$`415F1_tot.depth` < 501
), ]

max(vcf.F1.HQ.filtered.early.silique$`415F1_tot.depth`)

dim(vcf.F1.HQ.early.silique) # 1680924      41
dim(vcf.F1.HQ.filtered.early.silique) # 561189     41
head(vcf.F1.HQ.filtered.early.silique)
vcf.F1.HQ.filtered.early.silique$CHROM

nrow(vcf.F1.HQ.filtered.early.silique)/nrow(vcf.F1.HQ.early.silique) # 33%

# 2) extract the most useful SNPs based on genotype
# 1) SNPs between the two parents: one parent 1/1 and the other 0/0
vcf.F1.HQ.filtered.1.early.silique <- vcf.F1.HQ.filtered.early.silique[(
  (vcf.F1.HQ.filtered.early.silique$Ae_gt=="1/1" & vcf.F1.HQ.filtered.early.silique$Ol_gt=="0/0") |
    (vcf.F1.HQ.filtered.early.silique$Ae_gt=="0/0" & vcf.F1.HQ.filtered.early.silique$Ol_gt=="1/1")
), ]
dim(vcf.F1.HQ.filtered.1.early.silique) # 152315     41
nrow(vcf.F1.HQ.filtered.1.early.silique)/nrow(vcf.F1.HQ.filtered.early.silique) # 27%
ftable(vcf.F1.HQ.filtered.1.early.silique[,c("414F1_gt","415F1_gt", "Ae_gt", "Ol_gt")]) # need stats for this...


# 3) now take a look at the SNPs for F1s, firstly check the ones that were
# determined as homozygous in at least one of the two F1s
vcf.F1.HQ.filtered.1.2.early.silique <- vcf.F1.HQ.filtered.1.early.silique[(
  vcf.F1.HQ.filtered.1.early.silique$`414F1_gt`=="1/1" | vcf.F1.HQ.filtered.1.early.silique$`414F1_gt`=="0/0" |
    vcf.F1.HQ.filtered.1.early.silique$`415F1_gt`=="1/1" | vcf.F1.HQ.filtered.1.early.silique$`415F1_gt`=="0/0"
), ]

head(vcf.F1.HQ.filtered.1.2.early.silique)
vcf.F1.HQ.filtered.1.2.early.silique[,c("Ae_gt", "Ol_gt", "414F1_gt", "414F1_ref.depth", "414F1_alt.depth", "415F1_gt", "415F1_ref.depth", "415F1_alt.depth")]

# calculate how many SNPs are already counted as homozygous in F1
table(vcf.F1.HQ.filtered.1.2.early.silique$`414F1_gt`)
table(vcf.F1.HQ.filtered.1.2.early.silique$`415F1_gt`)
ftable(vcf.F1.HQ.filtered.1.2.early.silique[,c("414F1_gt","415F1_gt", "Ae_gt", "Ol_gt")])

# count SNPs between 414F1 & 415F1: drop the sites where both F1s carry the
# same homozygous call
dim(vcf.F1.HQ.filtered.1.2.early.silique) # 40913    41
vcf.F1.HQ.filtered.1.2.3.early.silique <- vcf.F1.HQ.filtered.1.2.early.silique[!(
  (vcf.F1.HQ.filtered.1.2.early.silique$`414F1_gt`=="0/0" & vcf.F1.HQ.filtered.1.2.early.silique$`415F1_gt`=="0/0") |
    (vcf.F1.HQ.filtered.1.2.early.silique$`414F1_gt`=="1/1" & vcf.F1.HQ.filtered.1.2.early.silique$`415F1_gt`=="1/1")
), ]
dim(vcf.F1.HQ.filtered.1.2.3.early.silique) # 27760     41

ftable(vcf.F1.HQ.filtered.1.2.3.early.silique[,c("414F1_gt", "415F1_gt")])

# SNPs between F1s (all depth-filtered sites, not only parental SNPs)
# early silique
# NOTE(review): this overwrites vcf.F1.HQ.filtered.early.silique in place, so
# re-running the statements above afterwards starts from the reduced table.
vcf.F1.HQ.filtered.early.silique <- vcf.F1.HQ.filtered.early.silique[!(
  (vcf.F1.HQ.filtered.early.silique$`414F1_gt`=="0/0" & vcf.F1.HQ.filtered.early.silique$`415F1_gt`=="0/0") |
    (vcf.F1.HQ.filtered.early.silique$`414F1_gt`=="1/1" & vcf.F1.HQ.filtered.early.silique$`415F1_gt`=="1/1")
), ]
dim(vcf.F1.HQ.filtered.early.silique) # 424545     41

# young
vcf.F1.HQ.filtered.young <- vcf.F1.HQ.filtered[!(
  (vcf.F1.HQ.filtered$`414F1_young_gt`=="0/0" & vcf.F1.HQ.filtered$`415F1_young_gt`=="0/0") |
    (vcf.F1.HQ.filtered$`414F1_young_gt`=="1/1" & vcf.F1.HQ.filtered$`415F1_young_gt`=="1/1")
), ]
# NOTE(review): the recorded count below is identical to the early-silique one -- looks copied; re-check.
dim(vcf.F1.HQ.filtered.young) # 424545     41

```