Skip to content

Commit

Permalink
properly filter MAF output in tumor-only mode
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Aug 1, 2024
1 parent 1a3480c commit 44e01a8
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 0 deletions.
1 change: 1 addition & 0 deletions pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def create_config(arg_dict, workflow = "PCGR"):
conf_options['molecular_data']['fname_cna_tsv'] = "None"
conf_options['molecular_data']['fname_expression_tsv'] = "None"
conf_options['molecular_data']['fname_expression_outliers_tsv'] = "None"
conf_options['molecular_data']['fname_maf_tsv'] = "None"
#conf_options['molecular_data']['fname_expression_csq_tsv'] = "None"
conf_options['molecular_data']['fname_expression_similarity_tsv'] = "None"
conf_options['molecular_data']['fname_tmb_tsv'] = "None"
Expand Down
2 changes: 2 additions & 0 deletions pcgr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@ def run_pcgr(input_data, output_data,conf_options):
conf_options['output_prefix'] = output_prefix
conf_options['molecular_data']['fname_mut_vcf'] = output_vcf
conf_options['molecular_data']['fname_mut_tsv'] = output_pass_tsv_gz
if conf_options['other']['vcf2maf'] == 1:
conf_options['molecular_data']['fname_maf_tsv'] = output_maf
if conf_options['somatic_snv']['tmb']['run'] == 1:
conf_options['molecular_data']['fname_tmb_tsv'] = tmb_fname
if not input_cna == 'None':
Expand Down
1 change: 1 addition & 0 deletions pcgrr/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export(exclude_non_chrom_variants)
export(expand_biomarker_items)
export(export_quarto_evars)
export(filter_eitems_by_site)
export(filter_maf_file)
export(filter_read_support)
export(generate_annotation_link)
export(generate_report)
Expand Down
5 changes: 5 additions & 0 deletions pcgrr/R/input_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,11 @@ load_somatic_snv_indel <- function(
dplyr::filter(
.data$SOMATIC_CLASSIFICATION == "SOMATIC")

## filter also MAF file if provided
pcgrr::filter_maf_file(
callset = callset,
settings = settings)

## Issue warning if clinically actionable variants are filtered
## with current filtering settings
n_actionable_filtered <-
Expand Down
132 changes: 132 additions & 0 deletions pcgrr/R/maf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#' Function that takes a MAF file generated with vcf2maf
#' and filters out variants that are presumably germline (tumor-only run)
#'
#' @param callset Callset with pre-processed somatic SNV/InDel variants
#' @param settings PCGR run/configuration settings
#'
#' @export
filter_maf_file <- function(callset, settings) {

## check if vcf2maf is TRUE
if (as.logical(settings[['conf']][['other']][['vcf2maf']]) == FALSE) {
return(0)
}

filtered_vars_maf_like <- data.frame()

if("variant" %in% names(callset)) {
if(NROW(callset[['variant']]) == 0) {
return(0)
}

pcgrr::log4r_info(paste0(
"Updating MAF file with filtered somatic SNV/InDels"))

if(all(c("CHROM",
"POS",
"REF",
"ALT") %in% colnames(callset[['variant']]))) {
filtered_vars_maf_like <- callset[['variant']] |>
dplyr::select(c("CHROM", "POS", "REF", "ALT")) |>
dplyr::mutate(Tumor_Seq_Allele1 = dplyr::case_when(
nchar(.data$REF) > 1 &
nchar(.data$ALT) == 1 &
substr(.data$REF, 1, 1) == .data$ALT ~ substr(.data$REF, 2, nchar(.data$REF)),
nchar(.data$ALT) > 1 &
nchar(.data$REF) == 1 &
substr(.data$ALT, 1, 1) == .data$REF ~ "-", # insertion
TRUE ~ .data$REF
)) |>
dplyr::mutate(Tumor_Seq_Allele2 = dplyr::case_when(
nchar(.data$REF) > 1 &
nchar(.data$ALT) == 1 &
substr(.data$REF, 1, 1) == .data$ALT ~ "-",
nchar(.data$ALT) > 1 &
nchar(.data$REF) == 1 &
substr(.data$ALT, 1, 1) == .data$REF ~ substr(.data$ALT, 2, nchar(.data$ALT)),
TRUE ~ .data$ALT
)) |>
dplyr::mutate(Variant_Type = dplyr::case_when(
nchar(.data$REF) == 1 &
nchar(.data$ALT) == 1 ~ "SNP",
nchar(.data$REF) > 1 &
nchar(.data$ALT) == 1 ~ "DEL",
nchar(.data$ALT) > 1 &
nchar(.data$REF) == 1 ~ "INS",
TRUE ~ "MNP"
)) |>
dplyr::mutate(
Chromosome = .data$CHROM,
Start_Position = dplyr::case_when(
.data$Variant_Type == "DEL" &
substr(.data$REF, 1, 1) == .data$ALT ~ .data$POS + 1,
TRUE ~ .data$POS
)
) |>
dplyr::select(
c("Chromosome",
"Start_Position",
"Tumor_Seq_Allele1",
"Tumor_Seq_Allele2",
"Variant_Type")
)
}

}

maf_data_unfiltered <- data.frame()
maf_data_header <- NULL

## check if unfiltered MAF file exists and read it - if not, return 0
if (file.exists(settings[['molecular_data']][['fname_maf_tsv']])) {
if(!(file.size(settings[['molecular_data']][['fname_maf_tsv']]) == 0)) {
maf_data_header <- readLines(
settings[['molecular_data']][['fname_maf_tsv']], n = 1)

maf_data_unfiltered <- readr::read_tsv(
settings[['molecular_data']][['fname_maf_tsv']],
show_col_types = F, col_names = T,
comment = "#", na = ""
)
} else {
pcgrr::log4r_warn("MAF file is empty - no filtering will be performed")
return(0)
}
}

if(NROW(maf_data_unfiltered) > 0 &
NROW(filtered_vars_maf_like) > 0) {
if(all(c("Chromosome",
"Start_Position",
"Tumor_Seq_Allele1",
"Tumor_Seq_Allele2",
"Variant_Type") %in% colnames(maf_data_unfiltered)) &
all(c("Chromosome",
"Start_Position",
"Tumor_Seq_Allele1",
"Tumor_Seq_Allele2",
"Variant_Type") %in% colnames(filtered_vars_maf_like))) {
maf_data_filtered <- maf_data_unfiltered |>
dplyr::semi_join(
filtered_vars_maf_like,
by = c("Chromosome", "Start_Position",
"Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
"Variant_Type")
)

if(!is.null(maf_data_header) &
NROW(maf_data_filtered) > 0) {
file.remove(settings[['molecular_data']][['fname_maf_tsv']])
writeLines(maf_data_header,
settings[['molecular_data']][['fname_maf_tsv']])
options(scipen = 999)
readr::write_tsv(
maf_data_filtered,
settings[['molecular_data']][['fname_maf_tsv']],
append = TRUE, col_names = T, quote = "none", na = "")
}
}
}

return(0)
}
18 changes: 18 additions & 0 deletions pcgrr/man/filter_maf_file.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 44e01a8

Please sign in to comment.