Skip to content

Commit

Permalink
reading release 20231101
Browse files Browse the repository at this point in the history
  • Loading branch information
cmatKhan committed Nov 1, 2023
1 parent 916338e commit 674dcac
Show file tree
Hide file tree
Showing 74 changed files with 1,352 additions and 367 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ llfs_rnaseq_data/*
plots/
inst/CITATION.bib
CITATION.cff
tmp/
Empty file modified .Rprofile
100644 → 100755
Empty file.
Empty file modified .github/.gitignore
100644 → 100755
Empty file.
Empty file modified .github/ISSUE_TEMPLATE/feature_request.md
100644 → 100755
Empty file.
Empty file modified .github/workflows/R-CMD-check.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/pkgdown.yaml
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions .gitignore
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ llfs_rnaseq_data/*
llfs_rnaseq_data_*/
*.tar.gz
*.csv
tmp/*
data/
Empty file modified CITATION.cff
100644 → 100755
Empty file.
Empty file modified DESCRIPTION
100644 → 100755
Empty file.
44 changes: 44 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Use an official R base image
FROM rocker/r-ver:4.3.1

# Set maintainer label
LABEL maintainer="[email protected]"

# Install required system libraries
RUN apt-get update && apt-get install -y \
libcurl4-openssl-dev \
libssl-dev \
libxml2-dev \
libblas-dev \
liblapack-dev \
libgd-dev \
gfortran \
gzip \
bzip2 \
xz-utils \
p7zip-full \
libsqlite3-dev \
libhdf5-dev \
libbz2-dev \
zlib1g-dev \
libcairo2-dev \
libxt-dev

# Install renv & dependencies
RUN R -e "install.packages('renv')"

# Create a directory for your project
WORKDIR /usr/local/src/my_project

# Copy your project files to the container (assuming they are in your current directory)
# This would typically include an renv.lock and renv/ directory if you have already snapshot your environment
COPY . .

# Restore the renv environment based on your lockfile
# RUN R -e 'renv::restore()'

# Expose port for RStudio or Shiny apps, if needed
# EXPOSE 8787

# Run a command to keep the container running, or specify a default action
CMD ["R"]
Empty file modified LICENSE.md
100644 → 100755
Empty file.
Empty file modified NAMESPACE
100644 → 100755
Empty file.
Empty file modified NEWS.md
100644 → 100755
Empty file.
Empty file modified R/.gitignore
100644 → 100755
Empty file.
Empty file modified R/batch_effect_qc_setup.R
100644 → 100755
Empty file.
32 changes: 16 additions & 16 deletions R/before_after_pca_and_plate_regression_plots.R
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' @param x_pc which PC to plot on the x axis
#' @param y_pc which PC to plot on the Y axis
#' @param meta sample metadata. Rows should be in same order as counts
#' @param plate_colors vector of colors for each batch_id
#' @param plate_colors vector of colors for each batch
#' @param plot_title title of plot
#' @param pc_levels a vector of PC levels, eg 1,3,6,2,4,5 which will control
#' the order which the PCs are plotted
Expand All @@ -22,13 +22,13 @@
#'
#' @export
before_after_pca_and_plate_regression_plots <- function(counts,
pca_basis_data,
x_pc,
y_pc,
meta,
plate_colors,
plot_title,
pc_levels = NA) {
pca_basis_data,
x_pc,
y_pc,
meta,
plate_colors,
plot_title,
pc_levels = NA) {

counts_scaled_by_pca_basis <- scale(
t(counts),
Expand All @@ -42,17 +42,17 @@ before_after_pca_and_plate_regression_plots <- function(counts,
)

counts_projected_onto_pca_basis_df <-
dplyr::as_tibble(counts_projected_onto_pca_basis, rownames = "sample") %>%
plyr::mutate(sample_id = as.numeric(stringr::str_remove(sample, "sample_"))) %>%
dplyr::select(sample_id, dplyr::all_of(paste0("PC", seq(1, 10)))) %>%
dplyr::as_tibble(counts_projected_onto_pca_basis, rownames = "library") %>%
plyr::mutate(library_id = as.numeric(stringr::str_remove(library, "library_"))) %>%
dplyr::select(library_id, dplyr::all_of(paste0("PC", seq(1, 10)))) %>%
dplyr::left_join(meta)

counts_projects_onto_pca_basis_pcaplot <- counts_projected_onto_pca_basis_df %>%
dplyr::filter(!is.na(batch_id)) %>%
dplyr::mutate(batch_id = as.factor(batch_id)) %>%
ggplot(aes_string(x_pc, y_pc, color = "batch_id")) +
dplyr::filter(!is.na(batch)) %>%
dplyr::mutate(batch = as.factor(batch)) %>%
ggplot(aes_string(x_pc, y_pc, color = "batch")) +
geom_point(alpha = .5, size = 3) +
stat_ellipse(aes(linetype = batch_id)) +
stat_ellipse(aes(linetype = batch)) +
# scale_linetype_manual(values = c(0,0,0,0,1,0,0,0,0,1,0)) +
scale_color_manual(values = plate_colors) +
ylim(-70, 70) +
Expand All @@ -65,7 +65,7 @@ before_after_pca_and_plate_regression_plots <- function(counts,

plate_predicts_pc_rsquared <- data_for_lm %>%
dplyr::select(dplyr::all_of(paste0("PC", seq(1, 10)))) %>% # exclude outcome, leave only predictors
purrr::map(~ stats::lm(.x ~ data_for_lm$batch_id, data = data_for_lm)) %>%
purrr::map(~ stats::lm(.x ~ data_for_lm$batch, data = data_for_lm)) %>%
purrr::map(summary) %>%
purrr::map_dbl("r.squared") %>%
broom::tidy() %>%
Expand Down
Empty file modified R/combine_txi_objects.R
100644 → 100755
Empty file.
Empty file modified R/data.R
100644 → 100755
Empty file.
Empty file modified R/direction_change_in_percent_intergenic.R
100644 → 100755
Empty file.
Empty file modified R/filter_dds_restimate_sizeFactors.R
100644 → 100755
Empty file.
75 changes: 75 additions & 0 deletions R/import_quantification_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
library(tximport)
library(vroom)

process_tximport <- function(quant_df_path, TX_QUANTS = TRUE, tx2gene = NULL) {

if(!TX_QUANTS){
if(is.null(tx2gene)){
stop('tx2gene cannot be null if TX_QUANTS is TRUE')
} else if(!file.exists(tx2gene)){
stop('tx2gene file does not exist')
}
}

if(!file.exists(quant_df_path)){
stop('quant_df_path does not exist')
}

quant_df <- vroom::vroom(quant_df_path)

missing_cols <- setdiff(c('isoform_file', 'library_id'),
colnames(quant_df))
if(length(missing_cols) > 0){
stop(paste("`", paste(missing_cols, collapse="`, `"),
"` must be columns in `quant_df`"))
}

isoform_list <- setNames(quant_df$isoform_file,
paste0('library_', quant_df$library_id))

if(TX_QUANTS){
txi_isoform <- tximport(files = isoform_list,
type = 'rsem',
txIn = TRUE,
txOut = TRUE,
importer = vroom)
# Save the result
saveRDS(txi_isoform, file.path("txi_isoform.rds"))
} else {
tx2gene <- vroom::vroom(tx2gene)
txi_gene <- tximport(files = isoform_list,
type = 'rsem',
txIn = TRUE,
txOut = FALSE,
tx2gene = tx2gene,
importer = vroom)
# Save the result
saveRDS(txi_gene, file.path("txi_gene.rds"))
}

return(invisible(NULL)) # Return nothing, to avoid cluttering output
}

# # Define your parameters
# params_list <- list(quant_df_path = "isoforms_df_20231009.csv",
# TX_QUANTS = FALSE,
# tx2gene="gencode38_tx2gene_20210919.csv")
#
# # Specify required packages
# required_pkgs <- c("tximport", "vroom")
#
# # Set your SLURM options
# # --mem-per-cpu=10G -N 1 -n 5
# slurm_opts <- list(time = '00:60:00',
# nodes = 1,
# mem='80G')
#
#
# rslurm::slurm_call(
# process_tximport,
# params = params_list,
# jobname='tximport',
# pkgs=required_pkgs,
# slurm_options = slurm_opts,
# submit = FALSE
# )
54 changes: 54 additions & 0 deletions R/parse_all_by_all_compiled_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#'
#' summarize an all_by_all_compiled_metrics dataframe for upload to the
#' database
#'
#' @param all_by_all_df a wgs all by all compiled metrics dataframe with the
#' columns 'total_variants', 'library_id', 'rna_subject', 'dna_subject',
#' 'total_variants', 'matching_variants', 'match_ratio'
#'
#' @import dplyr
#'
#' @return a dataframe suitable for upload to the full_wgs_compare table
#'
#' @export
parse_all_by_all_compiled_results = function(all_by_all_df){

stopifnot(all(c('total_variants', 'library_id',
'rna_subject', 'dna_subject', 'total_variants',
'matching_variants', 'match_ratio') %in%
colnames(all_by_all_df)))

all_by_all_df %>%
filter(total_variants>=100) %>%
group_by(library_id) %>%
arrange(desc(match_ratio)) %>%
summarize(labelled_dna_subject=first(rna_subject),
labelled_total_variants=
total_variants[rna_subject==dna_subject][1],
labelled_match_variants=
matching_variants[rna_subject==dna_subject][1],
labelled_match_ratio=
match_ratio[rna_subject == dna_subject][1],
emperical_best_subject=first(dna_subject),
emperical_best_total_variants=first(total_variants),
emperical_best_match_variants=first(matching_variants),
emperical_best_match_ratio=first(match_ratio),
emperical_next_subject=nth(dna_subject,2),
emperical_next_total_variants=nth(total_variants,2),
emperical_next_match_variants=nth(matching_variants,2),
emperical_next_match_ratio=nth(match_ratio,2),
chisq_labelled=chisq.test(matrix(
c(total_variants[rna_subject==dna_subject]-
matching_variants[rna_subject==dna_subject],
first(total_variants)-first(matching_variants),
matching_variants[rna_subject==dna_subject],
first(matching_variants)), ncol = 2))$p.value,
chisq_emperical=chisq.test(matrix(
c(first(total_variants)-first(matching_variants),
nth(total_variants,2)-nth(matching_variants,2),
first(matching_variants),
nth(matching_variants,2)),
ncol = 2))$p.value)
}


58 changes: 58 additions & 0 deletions R/parse_wgs_compare.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
library(tidyverse)
library(here)

con = RSQLite::dbConnect(RSQLite::SQLite(),
here('llfs_rnaseq_data/llfs_rnaseq_database.sqlite'))

sample_df = tbl(con,'sample') %>%
collect() %>%
dplyr::rename(library_id = pk) %>%
select(library_id, fastq_id, visit, batch_id)

batch_df = tbl(con,'batch') %>%
collect() %>%
dplyr::rename(batch_id = pk)

sample_info_df = sample_df %>%
left_join(batch_df) %>%
select(fastq_id, visit, data_dir, library_id, batch_id) %>%
mutate(fastq_id = as.character(fastq_id))

compile_files = list.files("/mnt/scratch/llfs_rna_dna_compare_test",
"compiled_metrics.csv",
recursive = TRUE,
full.names = TRUE)

compile_files_new = compile_files[7]

names(compile_files_new) = basename(dirname(compile_files_new))

compiled_wgs_compare = map(
compile_files_new,
read_csv
) %>%
bind_rows(.id='data_dir') %>%
dplyr::rename(fastq_id = rna_sample,
visit = rna_visit)

compiled_wgs_compare_upload =
compiled_wgs_compare %>%
mutate(fastq_id = as.character(fastq_id),
visit = as.character(visit)) %>%
dplyr::rename(dna_subject = dna_sample,
homo_expr_cand = homo_expr_cand_fltr,
total_variants = overlap_fltr,
matching_variants = n_match_fltr) %>%
mutate(match_ratio = matching_variants / total_variants) %>%
left_join(sample_info_df) %>%
select(library_id,
dna_subject,
chr,
total_variants,
matching_variants,
homo_expr_cand,
match_ratio)

# dbAppendTable(con, 'wgs_compare', compiled_wgs_compare_upload)


Empty file modified R/project_counts_onto_orig_pcs.R
100644 → 100755
Empty file.
Empty file modified R/remove_parameter_effects.R
100644 → 100755
Empty file.
Empty file modified R/sex_mislabel_setup.R
100644 → 100755
Empty file.
Empty file modified R/top_500_pca_for_sex_mislabels.R
100644 → 100755
Empty file.
Empty file modified R/utils-pipe.R
100644 → 100755
Empty file.
Empty file modified _pkgdown.yml
100644 → 100755
Empty file.
Empty file modified data/pca_sex_qc.rda
100644 → 100755
Empty file.
Empty file modified inst/CITATION.bib
100644 → 100755
Empty file.
Empty file modified inst/hemoglobin_genes.csv
100644 → 100755
Empty file.
Empty file modified inst/htcf.config
100644 → 100755
Empty file.
Loading

0 comments on commit 674dcac

Please sign in to comment.