reading release 20231101

cmatKhan · Nov 1, 2023 · 674dcac · 674dcac
1 parent 916338e
commit 674dcac
Show file tree

Hide file tree

Showing 74 changed files with 1,352 additions and 367 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -13,3 +13,4 @@ llfs_rnaseq_data/*
 plots/
 inst/CITATION.bib
 CITATION.cff
+tmp/
diff --git a/.Rprofile b/.Rprofile
diff --git a/.github/.gitignore b/.github/.gitignore
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,5 @@ llfs_rnaseq_data/*
 llfs_rnaseq_data_*/
 *.tar.gz
 *.csv
+tmp/*
+data/
diff --git a/CITATION.cff b/CITATION.cff
diff --git a/DESCRIPTION b/DESCRIPTION
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,44 @@
+# Use an official R base image
+FROM rocker/r-ver:4.3.1
+
+# Set maintainer label
+LABEL maintainer="[email protected]"
+
+# Install required system libraries
+RUN apt-get update && apt-get install -y \
+    libcurl4-openssl-dev \
+    libssl-dev \
+    libxml2-dev \
+    libblas-dev \
+    liblapack-dev \
+    libgd-dev \
+    gfortran \
+    gzip \
+    bzip2 \
+    xz-utils \
+    p7zip-full \
+    libsqlite3-dev \
+    libhdf5-dev \
+    libbz2-dev \
+    zlib1g-dev \
+    libcairo2-dev \
+    libxt-dev
+
+# Install renv & dependencies
+RUN R -e "install.packages('renv')"
+
+# Create a directory for your project
+WORKDIR /usr/local/src/my_project
+
+# Copy your project files to the container (assuming they are in your current directory)
+# This would typically include an renv.lock and renv/ directory if you have already snapshot your environment
+COPY . .
+
+# Restore the renv environment based on your lockfile
+# RUN R -e 'renv::restore()'
+
+# Expose port for RStudio or Shiny apps, if needed
+# EXPOSE 8787
+
+# Run a command to keep the container running, or specify a default action
+CMD ["R"]
diff --git a/LICENSE.md b/LICENSE.md
diff --git a/NAMESPACE b/NAMESPACE
diff --git a/NEWS.md b/NEWS.md
diff --git a/R/.gitignore b/R/.gitignore
diff --git a/R/batch_effect_qc_setup.R b/R/batch_effect_qc_setup.R
diff --git a/R/before_after_pca_and_plate_regression_plots.R b/R/before_after_pca_and_plate_regression_plots.R
@@ -6,7 +6,7 @@
 #' @param x_pc which PC to plot on the x axis
 #' @param y_pc which PC to plot on the Y axis
 #' @param meta sample metadata. Rows should be in same order as counts
-#' @param plate_colors vector of colors for each batch_id
+#' @param plate_colors vector of colors for each batch
 #' @param plot_title title of plot
 #' @param pc_levels a vector of PC levels, eg 1,3,6,2,4,5 which will control
 #'   the order which the PCs are plotted
@@ -22,13 +22,13 @@
 #'
 #' @export
 before_after_pca_and_plate_regression_plots <- function(counts,
-                                    pca_basis_data,
-                                    x_pc,
-                                    y_pc,
-                                    meta,
-                                    plate_colors,
-                                    plot_title,
-                                    pc_levels = NA) {
+                                                        pca_basis_data,
+                                                        x_pc,
+                                                        y_pc,
+                                                        meta,
+                                                        plate_colors,
+                                                        plot_title,
+                                                        pc_levels = NA) {
 
   counts_scaled_by_pca_basis <- scale(
     t(counts),
@@ -42,17 +42,17 @@ before_after_pca_and_plate_regression_plots <- function(counts,
   )
 
   counts_projected_onto_pca_basis_df <-
-    dplyr::as_tibble(counts_projected_onto_pca_basis, rownames = "sample") %>%
-    plyr::mutate(sample_id = as.numeric(stringr::str_remove(sample, "sample_"))) %>%
-    dplyr::select(sample_id, dplyr::all_of(paste0("PC", seq(1, 10)))) %>%
+    dplyr::as_tibble(counts_projected_onto_pca_basis, rownames = "library") %>%
+    plyr::mutate(library_id = as.numeric(stringr::str_remove(library, "library_"))) %>%
+    dplyr::select(library_id, dplyr::all_of(paste0("PC", seq(1, 10)))) %>%
     dplyr::left_join(meta)
 
   counts_projects_onto_pca_basis_pcaplot <- counts_projected_onto_pca_basis_df %>%
-    dplyr::filter(!is.na(batch_id)) %>%
-    dplyr::mutate(batch_id = as.factor(batch_id)) %>%
-    ggplot(aes_string(x_pc, y_pc, color = "batch_id")) +
+    dplyr::filter(!is.na(batch)) %>%
+    dplyr::mutate(batch = as.factor(batch)) %>%
+    ggplot(aes_string(x_pc, y_pc, color = "batch")) +
     geom_point(alpha = .5, size = 3) +
-    stat_ellipse(aes(linetype = batch_id)) +
+    stat_ellipse(aes(linetype = batch)) +
     # scale_linetype_manual(values = c(0,0,0,0,1,0,0,0,0,1,0)) +
     scale_color_manual(values = plate_colors) +
     ylim(-70, 70) +
@@ -65,7 +65,7 @@ before_after_pca_and_plate_regression_plots <- function(counts,
 
   plate_predicts_pc_rsquared <- data_for_lm %>%
     dplyr::select(dplyr::all_of(paste0("PC", seq(1, 10)))) %>% # exclude outcome, leave only predictors
-    purrr::map(~ stats::lm(.x ~ data_for_lm$batch_id, data = data_for_lm)) %>%
+    purrr::map(~ stats::lm(.x ~ data_for_lm$batch, data = data_for_lm)) %>%
     purrr::map(summary) %>%
     purrr::map_dbl("r.squared") %>%
     broom::tidy() %>%

diff --git a/R/combine_txi_objects.R b/R/combine_txi_objects.R
diff --git a/R/data.R b/R/data.R
diff --git a/R/direction_change_in_percent_intergenic.R b/R/direction_change_in_percent_intergenic.R
diff --git a/R/filter_dds_restimate_sizeFactors.R b/R/filter_dds_restimate_sizeFactors.R
diff --git a/R/import_quantification_data.R b/R/import_quantification_data.R
@@ -0,0 +1,75 @@
+library(tximport)
+library(vroom)
+
+process_tximport <- function(quant_df_path, TX_QUANTS = TRUE, tx2gene = NULL) {
+
+  if(!TX_QUANTS){
+    if(is.null(tx2gene)){
+      stop('tx2gene cannot be null if TX_QUANTS is TRUE')
+    } else if(!file.exists(tx2gene)){
+      stop('tx2gene file does not exist')
+    }
+  }
+
+  if(!file.exists(quant_df_path)){
+    stop('quant_df_path does not exist')
+  }
+
+  quant_df <- vroom::vroom(quant_df_path)
+
+  missing_cols <- setdiff(c('isoform_file', 'library_id'),
+                          colnames(quant_df))
+  if(length(missing_cols) > 0){
+    stop(paste("`", paste(missing_cols, collapse="`, `"),
+               "` must be columns in `quant_df`"))
+  }
+
+  isoform_list <- setNames(quant_df$isoform_file,
+                           paste0('library_', quant_df$library_id))
+
+  if(TX_QUANTS){
+    txi_isoform <- tximport(files = isoform_list,
+                            type = 'rsem',
+                            txIn = TRUE,
+                            txOut = TRUE,
+                            importer = vroom)
+    # Save the result
+    saveRDS(txi_isoform, file.path("txi_isoform.rds"))
+  } else {
+    tx2gene <- vroom::vroom(tx2gene)
+    txi_gene <- tximport(files = isoform_list,
+                         type = 'rsem',
+                         txIn = TRUE,
+                         txOut = FALSE,
+                         tx2gene = tx2gene,
+                         importer = vroom)
+    # Save the result
+    saveRDS(txi_gene, file.path("txi_gene.rds"))
+  }
+
+  return(invisible(NULL))  # Return nothing, to avoid cluttering output
+}
+
+# # Define your parameters
+# params_list <- list(quant_df_path = "isoforms_df_20231009.csv",
+#                     TX_QUANTS = FALSE,
+#                     tx2gene="gencode38_tx2gene_20210919.csv")
+#
+# # Specify required packages
+# required_pkgs <- c("tximport", "vroom")
+#
+# # Set your SLURM options
+# # --mem-per-cpu=10G -N 1 -n 5
+# slurm_opts <- list(time = '00:60:00',
+#                    nodes = 1,
+#                    mem='80G')
+#
+#
+# rslurm::slurm_call(
+#   process_tximport,
+#   params = params_list,
+#   jobname='tximport',
+#   pkgs=required_pkgs,
+#   slurm_options = slurm_opts,
+#   submit = FALSE
+# )
diff --git a/R/parse_all_by_all_compiled_results.R b/R/parse_all_by_all_compiled_results.R
@@ -0,0 +1,54 @@
+#'
+#' summarize an all_by_all_compiled_metrics dataframe for upload to the
+#' database
+#'
+#' @param all_by_all_df a wgs all by all compiled metrics dataframe with the
+#'   columns 'total_variants', 'library_id', 'rna_subject', 'dna_subject',
+#'   'total_variants', 'matching_variants', 'match_ratio'
+#'
+#' @import dplyr
+#'
+#' @return a dataframe suitable for upload to the full_wgs_compare table
+#'
+#' @export
+parse_all_by_all_compiled_results = function(all_by_all_df){
+
+  stopifnot(all(c('total_variants', 'library_id',
+                     'rna_subject', 'dna_subject', 'total_variants',
+                     'matching_variants', 'match_ratio') %in%
+                     colnames(all_by_all_df)))
+
+  all_by_all_df %>%
+    filter(total_variants>=100) %>%
+    group_by(library_id) %>%
+    arrange(desc(match_ratio)) %>%
+    summarize(labelled_dna_subject=first(rna_subject),
+              labelled_total_variants=
+                total_variants[rna_subject==dna_subject][1],
+              labelled_match_variants=
+                matching_variants[rna_subject==dna_subject][1],
+              labelled_match_ratio=
+                match_ratio[rna_subject == dna_subject][1],
+              emperical_best_subject=first(dna_subject),
+              emperical_best_total_variants=first(total_variants),
+              emperical_best_match_variants=first(matching_variants),
+              emperical_best_match_ratio=first(match_ratio),
+              emperical_next_subject=nth(dna_subject,2),
+              emperical_next_total_variants=nth(total_variants,2),
+              emperical_next_match_variants=nth(matching_variants,2),
+              emperical_next_match_ratio=nth(match_ratio,2),
+              chisq_labelled=chisq.test(matrix(
+                c(total_variants[rna_subject==dna_subject]-
+                    matching_variants[rna_subject==dna_subject],
+                  first(total_variants)-first(matching_variants),
+                  matching_variants[rna_subject==dna_subject],
+                  first(matching_variants)), ncol = 2))$p.value,
+              chisq_emperical=chisq.test(matrix(
+                c(first(total_variants)-first(matching_variants),
+                  nth(total_variants,2)-nth(matching_variants,2),
+                  first(matching_variants),
+                  nth(matching_variants,2)),
+                ncol = 2))$p.value)
+}
+
+
diff --git a/R/parse_wgs_compare.R b/R/parse_wgs_compare.R
@@ -0,0 +1,58 @@
+library(tidyverse)
+library(here)
+
+con = RSQLite::dbConnect(RSQLite::SQLite(),
+                         here('llfs_rnaseq_data/llfs_rnaseq_database.sqlite'))
+
+sample_df = tbl(con,'sample') %>%
+  collect() %>%
+  dplyr::rename(library_id = pk) %>%
+  select(library_id, fastq_id, visit, batch_id)
+
+batch_df = tbl(con,'batch') %>%
+  collect() %>%
+  dplyr::rename(batch_id = pk)
+
+sample_info_df = sample_df %>%
+  left_join(batch_df) %>%
+  select(fastq_id, visit, data_dir, library_id, batch_id) %>%
+  mutate(fastq_id = as.character(fastq_id))
+
+compile_files = list.files("/mnt/scratch/llfs_rna_dna_compare_test",
+                           "compiled_metrics.csv",
+                           recursive = TRUE,
+                           full.names = TRUE)
+
+compile_files_new = compile_files[7]
+
+names(compile_files_new) = basename(dirname(compile_files_new))
+
+compiled_wgs_compare = map(
+  compile_files_new,
+  read_csv
+) %>%
+  bind_rows(.id='data_dir') %>%
+  dplyr::rename(fastq_id = rna_sample,
+                visit = rna_visit)
+
+compiled_wgs_compare_upload =
+  compiled_wgs_compare %>%
+  mutate(fastq_id = as.character(fastq_id),
+         visit = as.character(visit)) %>%
+  dplyr::rename(dna_subject = dna_sample,
+                homo_expr_cand = homo_expr_cand_fltr,
+                total_variants = overlap_fltr,
+                matching_variants = n_match_fltr) %>%
+  mutate(match_ratio = matching_variants / total_variants) %>%
+  left_join(sample_info_df) %>%
+  select(library_id,
+         dna_subject,
+         chr,
+         total_variants,
+         matching_variants,
+         homo_expr_cand,
+         match_ratio)
+
+# dbAppendTable(con, 'wgs_compare', compiled_wgs_compare_upload)
+
+
diff --git a/R/project_counts_onto_orig_pcs.R b/R/project_counts_onto_orig_pcs.R
diff --git a/R/remove_parameter_effects.R b/R/remove_parameter_effects.R
diff --git a/R/sex_mislabel_setup.R b/R/sex_mislabel_setup.R
diff --git a/R/top_500_pca_for_sex_mislabels.R b/R/top_500_pca_for_sex_mislabels.R
diff --git a/R/utils-pipe.R b/R/utils-pipe.R
diff --git a/_pkgdown.yml b/_pkgdown.yml
diff --git a/data/pca_sex_qc.rda b/data/pca_sex_qc.rda
diff --git a/inst/CITATION.bib b/inst/CITATION.bib
diff --git a/inst/hemoglobin_genes.csv b/inst/hemoglobin_genes.csv
diff --git a/inst/htcf.config b/inst/htcf.config
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ llfs_rnaseq_data/* @@
     plots/
     inst/CITATION.bib
     CITATION.cff
+    tmp/