setup.Rmd

---
title: '494 final project'
author: "Kristy Ma, Rita Li"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
# Document-wide chunk defaults: echo the source of each chunk, and keep
# knitting when a chunk errors (the error is shown in the output instead).
knitr::opts_chunk$set(
  echo = TRUE,
  error = TRUE
)
```

# Install and Load Packages

    # One-time installation of the Bioconductor packages loaded below.
    # requireNamespace() only checks that BiocManager is installed; it does not
    # attach it to the search path.
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }
    BiocManager::install(c("snpStats", "SNPRelate", "GENESIS", "GWASTools"))

```{r load-packages, message = FALSE}
library(snpStats)
library(dplyr)
library(ggplot2)
library(broom)
library(SNPRelate)
library(GENESIS)
library(GWASTools)
```

# Data preparation

## load data

```{r read data, cache=TRUE}
# Locations of the three PLINK input files for the HapMap sample
bed <- './1_QC_GWAS/HapMap_3_r3_1.bed'
bim <- './1_QC_GWAS/HapMap_3_r3_1.bim'
fam <- './1_QC_GWAS/HapMap_3_r3_1.fam'

# Load the PLINK files and list the components of the returned object
hapmap <- read.plink(bed = bed, bim = bim, fam = fam)
names(hapmap)

# One-off conversion of the same files to GDS (kept for reference, not run):
#   snpgdsBED2GDS(bed, fam, bim, "lab2.gds")
# Opening, inspecting and closing the resulting file:
#   genofile <- snpgdsOpen("lab2.gds")
#   genofile
#   snpgdsClose(genofile)
```

## Separate two sets: get uncorr, corr names

```{r use lab2.gds to do king robust estimation}
# Create the GDS copy of the HapMap PLINK files the first time this runs.
# `bed`, `fam` and `bim` are the input paths assigned in the "read data" chunk;
# they are only consulted when "lab2.gds" does not exist yet.
if (!file.exists("lab2.gds")) {
  snpgdsBED2GDS(bed, fam, bim, "lab2.gds")
}

# KING kinship estimation. The GDS handle is closed as soon as the estimate is
# available (or the call fails), so the chunk can be re-run in the same session
# without a "file already opened" error.
genofile <- snpgdsOpen("lab2.gds")
Kingoutput <- tryCatch(
  snpgdsIBDKING(genofile),
  finally = snpgdsClose(genofile)
)

# Label the kinship matrix with the sample IDs on both dimensions
king.matrix <- Kingoutput$kinship
sample.id <- Kingoutput$sample.id
colnames(king.matrix) <- sample.id
rownames(king.matrix) <- sample.id

# Split the samples into related / unrelated sets; the same KING matrix is
# passed as both the kinship and the divergence input.
partition <- pcairPartition(kinobj = king.matrix, divobj = king.matrix)

# related set (author's note: 58 individuals)
related <- partition$rels

# unrelated set (author's note: 107 uncorrelated individuals)
unrelated <- partition$unrels
```

-   `genotypes` contains the genotype data
-   `fam` contains information on each individual
-   `map` contains information on each SNP


## Remove monomorphic from map and genotype
```{r remove-mono-from-map}
# Per-SNP summary statistics. col.summary() scans the whole genotype matrix,
# so run it once and pull both columns from the same result.
snp_summary <- col.summary(hapmap$genotypes)
maf <- snp_summary$MAF
calls <- snp_summary$Call.rate
rm(snp_summary)

# Keep SNPs that are polymorphic (MAF > 0) and fully called (call rate 1).
# `calls` is not a column of the map; it is matched to the rows by position.
hapmap$map <- hapmap$map %>%
  mutate(MAF = maf) %>%
  filter(MAF > 0 & calls == 1)
# nrow(hapmap$map) 949972

# Restrict the genotype matrix to the SNPs that survived the filter
hapmap$genotypes <- hapmap$genotypes[, hapmap$map$snp.name]
```

## Explore correlated dataset (just hapmap itself!)

Get info about correlated dataset:

```{r explore-correlated_dataset}
# The whole HapMap object, as it stands at this point, is the "correlated" set.
map <- hapmap$map             # SNP information (shared by both datasets)
cor_fam <- hapmap$fam         # family information
cor_geno <- hapmap$genotypes  # genotype information
```

## Explore uncorrelated dataset (subsetting from the hapmap)

```{r explore-uncorrelated-dataset}
# Row positions, within the genotype matrix, of the individuals listed in
# `unrelated`. Computed in one vectorised step so it does not depend on the
# sample containing exactly 165 rows.
# NOTE: the variable keeps its historical name `related.index` even though it
# indexes the *unrelated* individuals.
related.index <- which(rownames(hapmap$genotypes) %in% unrelated)

# An empty selection would make every downstream "uncorrelated" object
# meaningless, so fail here rather than later.
stopifnot(length(related.index) > 0)

# Explicit row subsetting (note the comma) of the genotype matrix
uncor_geno <- hapmap$genotypes[related.index, ]  # genotype information
uncor_fam <- hapmap$fam[rownames(hapmap$fam) %in% unrelated, ]  # family information
```

## Shrinking SNP number
```{r select first 500 snps}
# Work with the first 500 SNP columns only, in both genotype sets.
## uncorrelated
uncor_geno <- uncor_geno[, seq_len(500)]

## correlated
cor_geno <- cor_geno[, seq_len(500)]

# Trim the SNP table to the SNPs that are still present
map <- filter(map, snp.name %in% colnames(cor_geno))
```

## Format genodata for LM + PCA
```{r format geno data}
# Coerce both genotype objects to numeric matrices for lm() / PCA.
# (Dot-separated names hold the numeric versions; underscore names hold the
# original genotype objects.)

# uncorr -- author's note: nrow x ncol is 107*500
uncor.geno <- as(object = uncor_geno, Class = "numeric")

# corr -- author's note: nrow x ncol is 165*500
cor.geno <- as(object = cor_geno, Class = "numeric")
```


## Simulate trait
```{r}
set.seed(494)

# Sample sizes come from the genotype matrices instead of being typed in, so
# the noise vectors always match the genotype columns they are added to.
# The order of the rnorm() calls is unchanged, so the draws are the same as
# before whenever the sizes are 165 and 107.
n_cor <- nrow(cor.geno)
n_uncor <- nrow(uncor.geno)

# null y: corr
y_cor_null <- rnorm(n_cor, mean = 50, sd = 2)

# null y: uncorr
y_uncor_null <- rnorm(n_uncor, mean = 50, sd = 2)

# associative y: corr (based on "rs3766191" and "rs2985855")
y_cor_asso <- cor.geno[, "rs3766191"] * 2 +
  cor.geno[, "rs2985855"] * 2 +
  rnorm(n_cor, mean = 50, sd = 2)

# associative y: uncorr (same two SNPs and effect sizes)
y_uncor_asso <- uncor.geno[, "rs3766191"] * 2 +
  uncor.geno[, "rs2985855"] * 2 +
  rnorm(n_uncor, mean = 50, sd = 2)
```

### write plink for LMM (GENESIS)
```{r write plink}
# Export each of the four genotype/phenotype combinations (correlated vs.
# uncorrelated sample, null vs. associated trait) with write.plink(), then
# convert the files just written to GDS with snpgdsBED2GDS().
#
# NOTE: every section below reassigns the globals `fam`, `bim` and `bed`.
# After this chunk they hold the 'uncor_asso.*' names, not the HapMap input
# paths set in the "read data" chunk.
#
# Unused experiment (kept for reference): strip the first two characters from
# the SNP ids before export.
# snp.ids = colnames(cor_geno)
# cor_geno_int <- substr(snp.ids, start = 3, stop = nchar(snp.ids))
# cor_geno_gds <- cor_geno
# colnames(cor_geno_gds) <- cor_geno_int
## cor null
# correlated sample, null phenotype (y_cor_null)
write.plink(file.base="cor_null", 
            snps= cor_geno,
            subject.data=cor_fam, 
            phenotype = as.numeric(y_cor_null), 
            sex = as.numeric(cor_fam$sex),
            snp.major=FALSE)

# names of the files passed to the GDS conversion below
fam <- 'cor_null.fam'
bim <- 'cor_null.bim'
bed <- 'cor_null.bed'

snpgdsBED2GDS(bed, fam, bim, "cor_null.gds")

## cor asso
# correlated sample, associated phenotype (y_cor_asso)
write.plink(file.base="cor_asso", 
            snps=cor_geno,
            subject.data=cor_fam, 
            phenotype = as.numeric(y_cor_asso), 
            sex = as.numeric(cor_fam$sex),
            snp.major=FALSE)

fam <- 'cor_asso.fam'
bim <- 'cor_asso.bim'
bed <- 'cor_asso.bed'

snpgdsBED2GDS(bed, fam, bim, "cor_asso.gds")

## uncor null
# uncorrelated sample, null phenotype (y_uncor_null)
write.plink(file.base="uncor_null", 
            snps=uncor_geno,
            subject.data=uncor_fam, 
            phenotype = as.numeric(y_uncor_null), 
            sex = as.numeric(uncor_fam$sex),
            snp.major=FALSE)

fam <- 'uncor_null.fam'
bim <- 'uncor_null.bim'
bed <- 'uncor_null.bed'

snpgdsBED2GDS(bed, fam, bim, "uncor_null.gds")

## uncor asso
# uncorrelated sample, associated phenotype (y_uncor_asso)
write.plink(file.base="uncor_asso", 
            snps=uncor_geno,
            subject.data=uncor_fam, 
            phenotype = as.numeric(y_uncor_asso), 
            sex = as.numeric(uncor_fam$sex),
            snp.major=FALSE)

fam <- 'uncor_asso.fam'
bim <- 'uncor_asso.bim'
bed <- 'uncor_asso.bed'

snpgdsBED2GDS(bed, fam, bim, "uncor_asso.gds")
```


## Reformat data for analysis

The `snpStats` package uses a unique format to store data. Currently, genotypes are coded as 01, 02, and 03 (with 00 representing missing values):

```{r look-at-genotypes}
# Peek at the stored genotype codes: first five rows and first five columns
hapmap$genotypes@.Data[seq_len(5), seq_len(5)]
```

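To work with these genotypes numerically, convert them to a numeric matrix (for example, `X <- as(hapmap$genotypes, "numeric")`) and look at the first few rows and columns of `X`. If the conversion was successful, you should now see a matrix of 0's, 1's, and 2's.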

**Before you go on, check in with the others at your table. Was everyone able to get to this point successfully? Does anyone have any questions so far?**

## Simulate Trait

Let's simulate a trait that depends on the SNP known as *rs2476601*. Here's what we know about this SNP:

```{r look-at-causal-SNP}
# SNP-table entry for the SNP the simulated trait will depend on
filter(hapmap$map, snp.name == 'rs2476601')
```


Now, let's create a quantitative trait `y` that depends on the genotype at this SNP plus some random noise:

```{r simulate-trait}
# `X` is the numeric genotype matrix used from here on. No chunk above
# creates it, so build it from the genotype data unless an `X` already exists
# in the workspace.
if (!exists("X")) {
  X <- as(hapmap$genotypes, "numeric")
}

# Trait = genotype at rs2476601 plus standard-normal noise, one value per row
n <- nrow(X)
y <- X[, 'rs2476601'] + rnorm(n, 0, 1)
head(y)
```


### Getting started

To start, let's look at what happens when we run marginal regression on the first five SNPs in this dataset:

```{r fit-initial-models}
# Marginal regressions of the trait on each of the first five SNP columns.
# The column numbers are written out literally so that each fitted model
# records the exact term it was fitted on.
mod1 <- lm(y ~ X[, 1])
mod2 <- lm(y ~ X[, 2])
mod3 <- lm(y ~ X[, 3])
mod4 <- lm(y ~ X[, 4])
mod5 <- lm(y ~ X[, 5])
```


```{r remove-mono-from-map-clean}
# Drop every SNP whose minor allele frequency is zero, then count the rest
map.clean <- filter(hapmap$map, MAF > 0)
nrow(map.clean)
```


```{r remove-mono-from-genotypes}
# Positions (within `maf`) of the SNPs that have a MAF of 0
monomorphic <- which(maf == 0)
head(monomorphic)

# Build the clean genotype matrix by SNP name, so its columns are exactly the
# rows of map.clean, in the same order. This replaces X[, -monomorphic], which
# (a) returns zero columns when `monomorphic` is empty, and (b) assumes that
# positions in `maf` line up with the columns of X, which fails once the
# genotype data have been filtered after `maf` was computed.
X.clean <- X[, as.character(map.clean$snp.name), drop = FALSE]

# confirm the dimensions: ncol should equal nrow(map.clean)
ncol(X.clean)
nrow(X.clean)
```

**Confirm that the new "clean" genotype matrix has the correct number of rows and columns before you move on.**

### Analyze chromosome 1

Even after removing the monomorphic SNPs, we still have `r ncol(X.clean)` variants remaining. This might take a while to analyze in R, so let's focus on just the SNPs on the first chromosome to start.

Run the code chunk below to make a list of which SNPs live on chromosome 1:

```{r find-chr1-snps}
# Row positions in map.clean of the SNPs located on chromosome 1,
# followed by a quick look at the first few and at how many there are
chr1.snps <- which(map.clean[["chromosome"]] == 1)
head(chr1.snps)
length(chr1.snps)
```

Now, we're going to loop through each of the SNPs on chromosome 1, fitting a linear regression model at each SNP. For each model, we'll record the estimates (`betas`), standard errors (`ses`), test statistics (`tstats`) and p-values (`pvals`) for the coefficient of interest (the slope).

```{r run-gwas-chr1}
# set up empty vectors for storing results
betas <- c()
ses <- c()
tstats <- c()
pvals <- c()

# loop through chromosome 1 SNPs
for(i in chr1.snps){
  # print out occasional updates telling us what SNP we're analyzing
  if(i %% 5000 == 0) print(paste('Analyzing SNP', i)) 
  # fit model
  mod <- lm(y ~ X.clean[,i])
  # get coefficient information
  coefinfo <- tidy(mod)
  # record estimate, SE, test stat, and p-value
  betas[i] <- coefinfo$estimate[2]
  ses[i] <- coefinfo$std.error[2]
  tstats[i] <- coefinfo$statistic[2]
  pvals[i] <- coefinfo$p.value[2]
}
```

Let's add our results to our map data frame that contains information about each SNP:

```{r chr1-results}
# start with the map info for the chr 1 SNPs
chr1.results <- map.clean %>%
  filter(chromosome == 1)

# then add betas, SEs, etc.
chr1.results <- chr1.results %>%
  mutate(Estimate = betas,
         Std.Error = ses,
         Test.Statistic = tstats,
         P.Value = pvals)

# look at results
head(chr1.results)
```


```{r plot-minus-log-pvals}
# Scatter plot of -log(p-value) against position (log() is the natural log)
ggplot(chr1.results, aes(x = position, y = -log(P.Value))) +
  geom_point()
```


### Analyze all chromosomes

As time allows, try repeating the analysis above, but now looking at the SNPs on other chromosomes as well. *Hint: the main thing you'll need to change is which SNPs you're looping over in your for loop.*

```{r}
# Row positions in map.clean of the SNPs located on chromosome 3
chr3.snps <- which(map.clean$chromosome == 3)
head(chr3.snps)
length(chr3.snps)

# Result vectors, one slot per chromosome 3 SNP. They are filled by position
# within chr3.snps. Indexing them by the SNP's column number in X.clean would
# make them as long as the largest column number, which no longer matches the
# number of rows in chr3.results and makes the mutate() below fail.
n_snps <- length(chr3.snps)
betas <- rep(NA_real_, n_snps)
ses <- rep(NA_real_, n_snps)
tstats <- rep(NA_real_, n_snps)
pvals <- rep(NA_real_, n_snps)

# loop through chromosome 3 SNPs
for (k in seq_along(chr3.snps)) {
  i <- chr3.snps[k]  # column of this SNP in X.clean
  # print out occasional updates telling us what SNP we're analyzing
  if (i %% 5000 == 0) print(paste('Analyzing SNP', i))
  # fit model
  mod <- lm(y ~ X.clean[, i])
  # get coefficient information
  coefinfo <- tidy(mod)
  # record estimate, SE, test stat, and p-value of the slope (second row)
  betas[k] <- coefinfo$estimate[2]
  ses[k] <- coefinfo$std.error[2]
  tstats[k] <- coefinfo$statistic[2]
  pvals[k] <- coefinfo$p.value[2]
}

# start with the map info for the chr 3 SNPs
chr3.results <- map.clean %>%
  filter(chromosome == 3)

ncol(chr3.results)

head(betas)

# then add betas, SEs, etc.
chr3.results <- chr3.results %>%
  mutate(Estimate = betas,
         Std.Error = ses,
         Test.Statistic = tstats,
         P.Value = pvals)

# look at results
head(chr3.results)
```

```{r}
# Scatter plot of -log(p-value) against position (log() is the natural log)
ggplot(chr3.results, aes(x = position, y = -log(P.Value))) +
  geom_point()
```