diff --git a/.dockstore.yml b/.dockstore.yml index ff958fe50..57004902e 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -96,3 +96,6 @@ workflows: - name: CleanupIntermediate subclass: wdl primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CleanupIntermediate.wdl +- name: GenerateMalariaReport + subclass: wdl + primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/MalariaReportsWorkflow.wdl diff --git a/.gitignore b/.gitignore index e46a4fc83..137469f0d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ venv/ .tox test/test_output/ __pycache__ +*.zip +docker/lr-malaria-reports/report-files/Senegal_Terra_Metadata.csv diff --git a/docker/lr-malaria-reports/.gitignore b/docker/lr-malaria-reports/.gitignore new file mode 100644 index 000000000..9c2b7c643 --- /dev/null +++ b/docker/lr-malaria-reports/.gitignore @@ -0,0 +1,11 @@ +templates/map.html +test_data/ +data/ +/.vscode +*_report.html +*.jpeg +*.json +*.ipynb +*.txt +*.bed.gz +test.html \ No newline at end of file diff --git a/docker/lr-malaria-reports/Dockerfile b/docker/lr-malaria-reports/Dockerfile new file mode 100644 index 000000000..a55c80ca0 --- /dev/null +++ b/docker/lr-malaria-reports/Dockerfile @@ -0,0 +1,36 @@ +# Base image +FROM continuumio/miniconda3:4.9.2 + +LABEL Bridget Knight + +# install gsutil +RUN apt-get --allow-releaseinfo-change update +RUN apt install -y curl git-lfs parallel +RUN curl https://sdk.cloud.google.com | bash + +# Setup crcmodc for gsutil: +RUN apt-get install -y gcc python3-dev python3-setuptools && \ + pip3 uninstall -y crcmod && \ + pip3 install --no-cache-dir -U crcmod + +#### Specific for google cloud support +RUN apt-get install -y gnupg2 +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \ + && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \ + && apt-get update -y \ + && apt-get install google-cloud-sdk -y + +# make sure pip is up to date +RUN pip3 install --upgrade pip==23.2.1 + +# install other packages +RUN pip3 install --no-cache-dir NumPy==1.24.4 +RUN pip3 install --no-cache-dir pandas==1.3.0 +RUN pip3 install --no-cache-dir argparse==1.4.0 folium==0.14.0 jinja2==3.1.2 \ + MarkupSafe==2.1.3 matplotlib==3.7.3 \ + plotly==5.15.0 tabulate==0.9.0 + +ENV PATH=/root/google-cloud-sdk/bin/:${PATH} + +# copy other resources +COPY ./report-files /report-files/ diff --git a/docker/lr-malaria-reports/Makefile b/docker/lr-malaria-reports/Makefile new file mode 100644 index 000000000..bbd1f6711 --- /dev/null +++ b/docker/lr-malaria-reports/Makefile @@ -0,0 +1,18 @@ +IMAGE_NAME = lr-malaria-reports +VERSION = 0.0.2 + +TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest + +all: | build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +build_no_cache: + docker build --no-cache -t $(TAG1) -t $(TAG2) . 
+ +push: + docker push $(TAG1) + docker push $(TAG2) + diff --git a/docker/lr-malaria-reports/report-files/report_gen.py b/docker/lr-malaria-reports/report-files/report_gen.py new file mode 100644 index 000000000..d6f7ab224 --- /dev/null +++ b/docker/lr-malaria-reports/report-files/report_gen.py @@ -0,0 +1,859 @@ +import os + +# Data manipulation +import numpy as np +import pandas as pd +import random as rnd +import tabulate +import re +from enum import Enum +import glob +import datetime + +# Plotting +import matplotlib.pyplot as plt +import matplotlib +import plotly.express as px +import folium +import plotly.graph_objects as go +import math +from collections import defaultdict +from matplotlib.collections import PatchCollection + +# HTML +from markupsafe import Markup +import jinja2 +import io +from io import StringIO +import base64 +import urllib.parse + +# Argument Parsing +import argparse + +''' +matplotlib render settings +''' +# Make big figures: +gFIG_SIZE_in = [14, 10] + +# Set plotting defaults: +gPLOT_PARAMS = { + "legend.fontsize": "x-large", + "figure.figsize": gFIG_SIZE_in, + "axes.labelsize": "x-large", + "axes.titlesize": "x-large", + "xtick.labelsize": "x-large", + "ytick.labelsize": "x-large" +} +matplotlib.rcParams.update(gPLOT_PARAMS) + +# Some single-place definitions of sizes for plots / figures: +gFONT_SIZE_UNITS = "pt" +gTITLE_FONT_SIZE = 36 +gAXIS_LABEL_FONT_SIZE = 24 +gTICK_LABEL_FONT_SIZE = 16 +gTEXT_FONT_SIZE = 16 + +# To track global figure number creation: +gFIG_NUM = 0 + +def fix_plot_visuals(fig, + titlesize=gTITLE_FONT_SIZE, + labelsize=gAXIS_LABEL_FONT_SIZE, + ticklabelsize=gTICK_LABEL_FONT_SIZE, + textsize=gTEXT_FONT_SIZE, + tight_rect=None): + """Fix the plot elements to be appropriate sizes for a slide / presentation.""" + + if not textsize: + textsize = ticklabelsize + + for ax in fig.get_axes(): + + for ticklabel in (ax.get_xticklabels()): + ticklabel.set_fontsize(ticklabelsize) + for ticklabel in (ax.get_yticklabels()): + ticklabel.set_fontsize(ticklabelsize) + for c in ax.get_children(): + if c.__class__ == matplotlib.text.Text: + c.set_fontsize(textsize) + + ax.xaxis.get_label().set_fontsize(labelsize) + ax.yaxis.get_label().set_fontsize(labelsize) + ax.title.set_fontsize(titlesize) + + for c in fig.get_children(): + if c.__class__ == matplotlib.legend.Legend: + c.prop.set_size(ticklabelsize) + c.get_title().set_size(ticklabelsize) + + if tight_rect: + fig.tight_layout(rect=tight_rect) + else: + fig.tight_layout() + + if fig._suptitle: + sup_title = fig._suptitle.get_text() + fig.suptitle(sup_title, fontsize=titlesize) + + # Make it so we can actually see what's happening on the plots with "dark mode": + fig.patch.set_facecolor("white") + + +''' +Coverage Plot +''' +def plot_coverage(directory, sample_name, bin_width=500): + + # only looking for .bed.gz files + ext = (".bed.gz") + + # Set up data + beds = pd.DataFrame(columns=["stop", "depth"]) + sorted_file_list = list() + + # Reading and sorting files if coverage data is available + if os.path.exists(directory): + file_list = os.listdir(directory) + sorted_file_list = sorted(file_list, key=lambda item: item.split(".")[-4].split("_")[1] if item.endswith(".bed.gz") else "00") + print(f"Files to be plotted: {sorted_file_list}") + else: + return None + + # Plot setup + fig, ax = plt.subplots(1,1, figsize=(15, 9)) + plt.title(f"Coverage Plot of Sample {sample_name}", pad=12, fontsize=12) + plt.xlabel("Contig (bp)", labelpad=10, fontsize=12) + plt.ylabel("Depth", labelpad=10, fontsize=12) + color = 
"#2494C7" + tick_positions = [] + contigs = [] + bin_max = 0 + + # Iterate over each bed.gz in test_data folder + for idx, file in enumerate(sorted_file_list): + + if file.endswith(ext): + # Reading current .bed.gz file + f = os.path.join(directory, file) + #print(f"\n{idx} Current file: {f}") + + # Create DataFrame per .bed.gz + bed_df = pd.read_csv(f, delimiter="\\t", names=["contig", "start", "stop", "depth"], engine="python") + + # Get bins + start_min = bed_df["start"].min() + stop_max = bed_df["stop"].max() + bins = np.arange(start_min, stop_max+1, bin_width) + values = np.zeros(len(bins)) + + # Iterrate through each DataFrame and populate bins + #print("Populating bins...") + for _, row in bed_df.iterrows(): + avg = (row["stop"]+row["start"])/2 + index = int(np.floor(avg/bin_width)) + values[index]+=row["depth"] + + # Append new data to DF + #print("Plotting data...") + if(idx == 0): + ax.plot(bins, values, ".", c=color) + tick_positions.append(math.floor(stop_max/2)+bin_max) + bin_max = max(bed_df["stop"]) + else: + ax.plot(bins+bin_max, values, ".", c=color) + tick_positions.append(math.floor(stop_max/2)+bin_max) + bin_max = bin_max + max(bed_df["stop"]) + + # Saving xtick data + contigs.append(bed_df["contig"][0]) + + # Setting xticks + ax.set_xticks(ticks=tick_positions) + ax.set_xticklabels(labels=contigs, rotation=90) + ax.set_yscale("log") + fix_plot_visuals(fig) + + return fig + +''' +Drug Resistance Table +''' +def create_drug_table(file): + if not file: + resistances = ["UNDETERMINED","UNDETERMINED","UNDETERMINED","UNDETERMINED","UNDETERMINED","UNDETERMINED"] + else: + data = open(file, 'r').read() + resistances = list(get_drug_resistance(data, None, None, do_print=True)) + + resistances_tbl = pd.DataFrame(columns = ["Chloroquine", "Pyrimethamine", "Sulfadoxine", "Mefloquine", "Artemisinin", "Piperaquine"]) + resistances = map(str, resistances) + resistances = [s.replace('DrugSensitivity.','') for s in list(resistances)] + + return resistances + +# Set up some functions to determine drug sensitivity: +# vars for markers in drug_resistance_report.txt +present = '+' +absent = '-' + +pchange_regex = re.compile(r"""p\.([A-z]+)([0-9]+)([A-z]+)""") +AA_3_2 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'} + +DrugSensitivity = Enum("DrugSensitivity", ["UNDETERMINED", "SENSITIVE", "RESISTANT"]) + +def parse_pchange(pchange): + old, pos, new = pchange_regex.match(pchange).groups() + return AA_3_2[old.upper()], int(pos), AA_3_2[new.upper()] + +def get_chloroquine_sensitivity(dr_report): +# Locus utilized: PF3D7_0709000 (crt) +# Codon: 76 +# Workflow: +# Step Genetic change Interpretation Classification +# 1 76 K/T heterozygote Heterozygous mutant Undetermined +# 2 76 missing Missing Undetermined +# 3 K76 Wild type Sensitive +# 4 76T Mutant Resistant +# 5 otherwise Unknown mutant Undetermined + + for line in StringIO(dr_report): + line = ' '.join(line.split()) + # We only care about this locus for this drug: + if line.startswith("pfcrt PF3D7_0709000"): + gene, locus, pchange, marker = line.strip().split(" ") + old, pos, new = parse_pchange(pchange) + if pos == 76: + if (old == "L") and (new == "T") and (marker == absent): + return DrugSensitivity.SENSITIVE + elif (new == "T") and (marker == present): + return DrugSensitivity.RESISTANT + + return DrugSensitivity.UNDETERMINED + +def 
get_pyrimethamine_sensitivity(dr_report): +# Locus utilized: PF3D7_0417200 (dhfr) +# Codon: 108 +# Workflow: +# Step Genetic change Interpretation Classification +# 1 108 S/N heterozygote Heterozygous mutant Undetermined +# 2 108 missing Missing Undetermined +# 3 S108 Wild type Sensitive +# 4 108N Mutant Resistant +# 5 otherwise Unknown mutant Undetermined + + for line in StringIO(dr_report): + line = ' '.join(line.split()) + # We only care about this locus for this drug: + if line.startswith("pfdhfr PF3D7_0417200"): + gene, locus, pchange, marker = line.strip().split(" ") + old, pos, new = parse_pchange(pchange) + if pos == 108: + if (old == "S") and (new == "N") and (marker == absent): + return DrugSensitivity.SENSITIVE + elif (old == "S") and (new == "N") and (marker == present): + return DrugSensitivity.RESISTANT + elif (new == "N") and (marker == present): + return DrugSensitivity.RESISTANT + elif marker == absent: + return DrugSensitivity.SENSITIVE + + return DrugSensitivity.UNDETERMINED + +def get_sulfadoxine_sensitivity(dr_report): +# Locus utilized: PF3D7_0810800 (dhps) +# Codon: 437 +# Workflow: +# Step Genetic change Interpretation Classification +# 1 437 A/G heterozygote Heterozygous mutant Undetermined +# 2 437 missing Missing Undetermined +# 3 A437 Wild type Sensitive +# 4 437G Mutant Resistant +# 5 otherwise Unknown mutant Undetermined + + for line in StringIO(dr_report): + line = ' '.join(line.split()) + + # We only care about this locus for this drug: + if line.startswith("pfdhps PF3D7_0810800"): + gene, locus, pchange, marker = line.strip().split(" ") + old, pos, new = parse_pchange(pchange) + if pos == 437: + if (old == "A") and (new == "G") and (marker == absent): + return DrugSensitivity.SENSITIVE + elif (old == "A") and (new == "G") and (marker == present): + return DrugSensitivity.RESISTANT + elif (new == "G") and (marker == present): + return DrugSensitivity.RESISTANT + elif marker == absent: + return DrugSensitivity.SENSITIVE + + return DrugSensitivity.UNDETERMINED + +def get_mefloquine_sensitivity(dr_report): +# Locus utilized: PF3D7_0523000 (mdr1) +# Codons: Amplification status of whole gene +# Workflow: +# Step Genetic change Interpretation Classification +# 1 Missing Missing Undetermined +# 2 Heterozygous duplication Heterozygous mutant Undetermined +# 3 Single copy Wild type Sensitive +# 4 Multiple copies Mutant Resistant + + # Currently we can't determine this. + # We need to get CNV calling working first. 
+ + return DrugSensitivity.UNDETERMINED + +def get_artemisinin_sensitivity(dr_report): +# Locus utilized: PF3D7_1343700 (kelch13) +# Codons: 349-726 (BTB/POZ and propeller domains) +# Workflow: +# Step Genetic change Interpretation Classification +# 1 Homozygous non-synonymous mutations in the kelch13 BTB/POZ and propeller +# domain classified by the World Health Organisation as associated with delayed +# parasite clearance +# Mutant – associated with delayed clearance Resistant +# 2 Heterozygous non-synonymous mutations in the kelch13 BTB/POZ and +# propeller domain classified by the World Health Organisation as associated +# with delayed parasite clearance +# Mutant - heterozygous Undetermined +# 3 578S as homozygous Mutant - not associated Sensitive +# 4 Any missing call in amino acids 349-726 Missing Undetermined +# 5 No non-reference calls in amino acids 349-726 Wild-type Sensitive +# 6 otherwise Mutant - not in WHO list Undetermined + + has_variants = False + + for line in StringIO(dr_report): + line = ' '.join(line.split()) + # We only care about this locus for this drug: + if line.startswith("pfkelch13 PF3D7_1343700"): + gene, locus, pchange, marker = line.strip().split(" ") + old, pos, new = parse_pchange(pchange) + + has_non_ref = False + has_variants = False + if 349 <= pos <= 726: + if (old != new) and (marker == present): + return DrugSensitivity.RESISTANT + elif (new == "S") and (marker == present): + return DrugSensitivity.SENSITIVE + elif marker == present: + has_non_ref = True + has_variants = True + if (has_variants) and (not has_non_ref): + return DrugSensitivity.SENSITIVE + + return DrugSensitivity.UNDETERMINED + +def get_piperaquine_sensitivity(dr_report): +# Loci utilized: PF3D7_1408000 (plasmepsin 2) and PF3D7_1408100 (plasmepsin 3) +# Codons: Amplification status of both genes +# Workflow: +# Step Genetic change Interpretation Classification +# 1 Missing Missing Undetermined +# 2 Heterozygous duplication Heterozygous mutant Undetermined +# 3 Single copy Wild type Sensitive +# 4 Multiple copies Mutant Resistant + + # Currently we can't determine this. + # We need to get CNV calling working first. 
+ + return DrugSensitivity.UNDETERMINED + +def get_drug_resistance(data, sample_id, sample_df, do_print=False): + + chloroquine = get_chloroquine_sensitivity(data) + pyrimethamine = get_pyrimethamine_sensitivity(data) + sulfadoxine = get_sulfadoxine_sensitivity(data) + mefloquine = get_mefloquine_sensitivity(data) + artemisinin = get_artemisinin_sensitivity(data) + piperaquine = get_piperaquine_sensitivity(data) + + return chloroquine, pyrimethamine, sulfadoxine, mefloquine, artemisinin, piperaquine + +def plot_dr_bubbles(dr_report_file): + # Set up dataframes using a single report: + # Download the file contents: + with open(dr_report_file, 'r') as f: + dr_report_contents = f.read() + + # Set up our data: + dataframe_dict = dict() + drug_resistance_df = pd.DataFrame(columns=["Sample", "Chloroquine", "Pyrimethamine", "Sulfadoxine", "Mefloquine", "Artemisinin", "Piperaquine"]) + + # Get the raw info here: + last_gene = None + cur_df = None + for line in dr_report_contents.split("\n"): + if len(line) > 1: + gene, loc, variant, presence = line.split(" ") + if gene != last_gene: + # must make a new dataframe + if last_gene: + dataframe_dict[last_gene] = cur_df + cur_df = pd.DataFrame({'Sample': pd.Series(dtype='str')}) + cur_df[variant] = pd.Series(dtype='bool') + last_gene = gene + dataframe_dict[gene] = cur_df + + # Get the raw info here: + sample_id = dr_report_file[dr_report_file.find("/SEN_")+1:dr_report_file.find("_ALL_scored")] + + dr_info = defaultdict(list) + for line in dr_report_contents.split("\n"): + if len(line) > 1: + gene, loc, variant, presence = line.split(" ") + presence = presence == "present" + + dr_info[gene].append([variant, presence]) + + # Now process it into dataframes: + for gene, variant_info in dr_info.items(): + + # set up place to put new markers: + gene_df = dataframe_dict[gene] + columns = list(gene_df.columns) + + gene_df.loc[len(gene_df.index)] = [sample_id] + (len(columns)-1) * [None] + + # Now add the markers: + for v, presence in variant_info: + gene_df.loc[len(gene_df.index)-1, v] = presence + + # Set up logistics of our data: + nrows = len(next(iter(dataframe_dict.values()))) + ncols = sum([len(d.columns) for d in dataframe_dict.values()]) + + aa_dict = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', + 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', + 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', + 'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'} + aa_dict_rev = {v:k for k, v in aa_dict.items()} + + pchange_string_re = re.compile(r'P\.([A-Z]+)(\d+)([A-Z]+)') + + xlabels = [] + for df in dataframe_dict.values(): + for variant in df.columns[1:]: + m = pchange_string_re.match(variant.upper()) + xlabels.append(f"{aa_dict[m[1]]}{m[2]}{aa_dict[m[3]]}") + + # Now set up the circles we will plot: + print("Computing plots") + radius = 0.4 + circles = [] + x_tick_pos = [] + x_gene_offset = 0 + for gene_index, (g, df) in enumerate(dataframe_dict.items()): + y = nrows-1 + for row_index, row in df.iterrows(): + x = 0 + x_gene_offset + for col_index, col in enumerate(row[1:]): + edgecolor = [0]*3 + facecolor = [1,0,0] if col else [1]*3 + c = plt.Circle((x+gene_index, y), facecolor=facecolor, edgecolor=edgecolor) + c.set(radius=radius) + circles.append(c) + x_tick_pos.append(x+gene_index) + x += 1 + y -= 1 + x_gene_offset += len(df.columns)-1 + + x_tick_pos = sorted(list(set(x_tick_pos))) + + ############################ + + print("Generating Plots") + fig, ax = plt.subplots() + + col = PatchCollection(circles, 
match_original=True) + ax.add_collection(col) + + # Now set the gene labels: + x_gene_offset = 0 + for gene_index, (g, df) in enumerate(dataframe_dict.items()): + line_x = [x_gene_offset, x_gene_offset+len(df.columns)-2] + lbl_x = sum(line_x)/2 + plt.text(lbl_x, nrows+2, g, ha="center", fontfamily="monospace", size="large") + plt.plot(line_x, [nrows+1]*2, '-k', linewidth=2) + x_gene_offset += len(df.columns) + + ax.set(xticks=x_tick_pos, + yticks=[], + xticklabels=xlabels, + yticklabels=[]) + + + ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='center') + + ax.axis([-1, ncols-1, -1, nrows+3]) + ax.set_aspect(aspect='equal') + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['bottom'].set_visible(False) + ax.spines['left'].set_visible(False) + + return fig + +''' +Map +''' +def create_map(coordinates, sample_name): + m = folium.Map(location=coordinates, zoom_start = 10) + + if coordinates != [0,0]: + folium.Marker(location=coordinates, popup = ('Sample: '+sample_name), icon=folium.Icon(color='red',prefix='fa',icon='circle'), parse_html=True).add_to(m) + + m.get_root().width = "491px" + m.get_root().height = "420px" + map_html = m.get_root()._repr_html_() + + return map_html + +''' +Location Info +''' +def capitalize_proper_nouns(text): + """Capitalizes the first letter of each word in a string, assuming all words are proper nouns.""" + if type(text) == str: + words = re.split(r'(\s+|-|\')', text) + capitalized_words = [] + for i, word in enumerate(words): + if i % 2 == 0: + capitalized_words.append(word.capitalize()) + else: + capitalized_words.append(word.replace('-', '. ').capitalize()) + return ''.join(capitalized_words) + else: + return text + +def parse_location_info(metadata_file): + """Parses location data by matching three letter code to region, district, site, and type, in order.""" + facility_types = {"C": "Clinic", "SS": "Sentinel Site", "SI": "Special Interest", + "PP": "PECADOM Plus", "C + PP": "Clinic, PECADOM Plus", "SS + PP": "Sentinel Site, PECADOM Plus"} + + metadata = pd.read_csv(metadata_file) + metadata = metadata.replace(np.nan, "Unspecified").replace(facility_types) + metadata.iloc[:, 1:4] = metadata.iloc[:, 1:4].applymap(capitalize_proper_nouns) + + # Make dictionary matching code to region, district, site, and type, in order + loc_dict = metadata.groupby("CODE")[list(metadata.columns[1:5])].agg(list).to_dict("index") + return loc_dict + +def extract_code(sample_name): + """Extracts the three-letter location code from this sample's name.""" + code = (sample_name.split("_")[2]).split(".")[0] + return code + +def get_sample_loc_info(loc_dict, sample_name, code = None): + extracted_code = extract_code(sample_name) + entry = next((loc_dict[c] for c in (code, extracted_code) if c in loc_dict), None) + if entry: + return [entry["SITE"][0], entry["REGION"][0], entry["DISTRICT"][0], entry["TYPE"][0]] + else: + return ["Unspecified"] * 4 + +''' +Quality Report (FastQC or ONT QC) +''' +def read_qc_report(directory): + ''' + Function to read HTML from FastQC or ONT QC report as string to pass to Jinja template IFrame + ''' + if(os.path.exists(directory)): + # There should only be one file in the directory (report-files/data/quality_report) + for file in os.listdir(directory): + print(f"{directory} + {file}") + with open(os.path.join(directory, file), "r", encoding="utf-8") as f: + html = f.read() + else: + html = None + + return html + +''' +Snapshots +''' +def get_bed_info(bed_file): + ''' + Input: Bed file containing drug 
resistance loci and their names. + Output: A list containing the names of all the drug resistance regions and a list of loci in the format of start-end. + ''' + column_names = ["chromosome", "start", "end", "name"] + bed = pd.read_csv(bed_file, sep="\s+", header=None, names=column_names) + bed["start"] = pd.Series(map(format_long_number, bed["start"])) + bed["end"] = pd.Series(map(format_long_number, bed["end"])) + + bed["locus"] = bed["start"].apply(str) + "-" + bed["end"].apply(str) + + return list(bed.name), list(bed.locus) + +''' +Utility +''' +def plot_to_b64(plot, bbox_inches=None): + ''' + Input: A matplotlib plot. + Output: The b64 string conversion of the input plot. + ''' + # Create I/O buffer to save image in bytes + stringIObytes = io.BytesIO() + plot.savefig(stringIObytes, format="jpeg", bbox_inches = bbox_inches) + + # Retrieve byte-string and encode it in base64 + stringIObytes.seek(0) + plot_b64 = base64.b64encode(stringIObytes.read()) + + return plot_b64 + +def img_to_b64(img): + with open(img, "rb") as image_file: + return base64.b64encode(image_file.read()) + +def format_dates(date_string): + ''' + Input: A string of numbers in the form of YYYYMMDD, such as 20240528 (May 28th, 2024). + Output: A formatted version of the input string using slashes to separate year, month, and day or "N/A". + ''' + if date_string in ["", None, "N/A"]: + return "Unknown" + else: + date = datetime.date(int(date_string[:4]), int(date_string[4:6]), int(date_string[6:])) + month = date.strftime("%B") + return f"{month} {date_string[6:]}, {date_string[:4]}" + +def check_unknown(data): + ''' + Input: A variable of any type to check if it is valid for passing to the report. + Output: The variable or "N/A". + ''' + return "Unknown" if data in [None, "None", "Unknown", "N/A", 0] else data + +def format_long_number(number): + return '{:,}'.format(number) + +''' +Report Generation & Templating +''' +class Sample: + ''' + This class holds all variables that will be used on the summary page of the report. + + Additionally, this class holds variables that are used on both the summary and analysis pages, + such as the sample name. + ''' + + def __init__(self, sample_name, hrp2, hrp3, drug_res, info, _map, location_info, qc_pass, dr_bubbles, tech_flag): + '''This function defines the class variables and retrieves them from their respective functions.''' + + self.sample_name = sample_name + self.hrp2 = hrp2 + self.hrp3 = hrp3 + self.drug_res = drug_res + self.info = info + self.map = _map + self.location_info = location_info + self.qc_pass = qc_pass + self.dr_bubbles = dr_bubbles + self.tech_flag = tech_flag + +class Analysis: + ''' + This class holds all variables used on the analysis page of the report. + ''' + + def __init__(self, sequencing_summary, qscore_reads, qscore_scores, barcode, coverage_plot, snapshots, res_loci_names, res_loci): + '''This function defines the variables used and retrieves them from their respective functions.''' + + self.sequencing_summary = sequencing_summary + self.scores = qscore_scores + self.reads = qscore_reads + self.barcode = barcode + self.coverage_plot = coverage_plot + self.snapshots = snapshots + self.res_loci_names = res_loci_names + self.res_loci = res_loci + +class QCReport: + def __init__(self, qc_report_html): + self.qc_report_html = qc_report_html + +def prepare_summary_data(arg_dict): + ''' + Gathers all data needed for summary page from inputs. 
+ ''' + + sample_name = arg_dict['sample_name'] + + upload_date = arg_dict['upload_date'][0] + collection_date = arg_dict['collection_date'] + sequencing_date = arg_dict['sequencing_date'] + species = ' '.join(arg_dict['species']) + location_table = arg_dict["location_table"] + sample_code = arg_dict["code"] + tech_flag = arg_dict["tech_flag"] + + info = [upload_date, format_dates(collection_date), format_dates(sequencing_date), species, round(arg_dict['aligned_coverage'], 2), check_unknown(arg_dict['aligned_read_length']), + check_unknown(arg_dict['pct_properly_paired_reads']), 0, round(arg_dict['read_qual_mean'], 2)] + processed_info = [item if item not in ["", None] else "N/A" for item in info] + + qc_pass = arg_dict["qc_pass"] + if (qc_pass == "true"): + qc_pass = "PASS" + elif (qc_pass == "false"): + qc_pass = "FAIL" + + resistances = create_drug_table(None if arg_dict["drug_resistance_text"] in [None, "None", ""] else arg_dict["drug_resistance_text"]) + resistance_bubbles = plot_dr_bubbles(arg_dict["drug_resistance_text"]) + resistance_bubbles_b64 = plot_to_b64(resistance_bubbles, "tight") + + loc_dict = parse_location_info(location_table) + + location_info = [check_unknown(round(arg_dict['latitude'], 2)), check_unknown(round(arg_dict['longitude'], 2)), arg_dict['location'], + *get_sample_loc_info(loc_dict, sample_name, sample_code)] + coordinates = [arg_dict['latitude'], arg_dict['longitude']] + _map = create_map(coordinates, sample_name) + + HRP2 = arg_dict['HRP2'] + HRP3 = arg_dict['HRP3'] + + return Sample(sample_name, HRP2, HRP3, resistances, processed_info, _map, location_info, qc_pass, resistance_bubbles_b64, tech_flag) + +def prepare_analysis_data(arg_dict): + ''' + Gathers all data needed for analysis page from inputs. + ''' + + frac_bases = check_unknown(arg_dict["fraction_aligned_bases"]) + + sequencing_summary = [0, 0, format_long_number(arg_dict['aligned_bases']), format_long_number(arg_dict['aligned_reads']), + str(round(frac_bases, 4)*100)[:5], round(arg_dict['average_identity'], 2)] + + barcode = arg_dict['barcode'] + + qscorex = [5, 7, 10, 12, 15] # available q-score measures are predetermined + qscorey = [arg_dict['num_reads_q5'], arg_dict['num_reads_q7'], arg_dict['num_reads_q10'], arg_dict['num_reads_q12'], arg_dict['num_reads_q15']] + + # Create coverage plot and convert it to base64 + coverage_bin_size = arg_dict["coverage_bin_size"] + coverage_plot = plot_coverage("/report-files/data/coverage", arg_dict["sample_name"], coverage_bin_size) # default bin size = 750 + coverage_b64 = plot_to_b64(coverage_plot) if coverage_plot not in ["", None] else None + + # IGV Snapshots + snapshots = str(arg_dict["snapshots"]) + + snapshots = snapshots.split(",") + snapshots_b64 = [img_to_b64(image) for image in snapshots] + + bed_file = open(arg_dict["regions_bed"], "r") + loci_names, loci = get_bed_info(bed_file) + + return Analysis(sequencing_summary, qscorey, qscorex, barcode, coverage_b64, snapshots_b64, loci_names, loci) + +def create_report(sample, analysis, qc_report): + ''' + This function handles the final steps of report generation. + + It pools all variables and data needed for the report and creates two objects: + analysis and summary. These objects hold the variables and are passed to their respective + page templates (analysis or summary). 
+ ''' + + print("path: "+os.path.dirname(os.path.realpath(__file__))) + print("cwd: "+os.getcwd()) + + # creating summary page + templateLoader = jinja2.FileSystemLoader(searchpath='/report-files/templates/') + templateEnv = jinja2.Environment(loader=templateLoader) + + # Add zip to Jinja2 namespace + templateEnv.globals.update(zip=zip) + + TEMPLATE_FILE = 'report.html' # may need to change if file is moved + template = templateEnv.get_template(TEMPLATE_FILE) + output = template.render(sample=sample, analysis=analysis, qc_report = qc_report) + + file_name = os.path.join(os.getcwd(),sample.sample_name+'_lrma_report.html') + print(file_name) + + with open(file_name, 'w') as fh: + fh.write(output) + + print('Report generated!') + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + + ''' Summary Page ''' + parser.add_argument("--barcode", help="barcode of the sample", default="Unknown") + + # Sample Info + parser.add_argument("--sample_name", help="name of sequenced sample", required=True) + parser.add_argument("--upload_date", help="date sample was uploaded", nargs='+', required=True) + parser.add_argument("--collection_date", help="date sample was collected", required=True, default="Unknown", nargs="?") + parser.add_argument("--sequencing_date", help="date sample was sequenced", default="Unknown", nargs="?") + parser.add_argument("--species", help="species of sample", nargs='+', default="P. falciparum") + parser.add_argument("--aligned_coverage", help="number of times the bases in the sequenced reads cover the target genome", required=True, type=float) + parser.add_argument("--aligned_read_length", help="number at which 50%\ of the read lengths are longer than this value", type=float) + parser.add_argument("--pct_properly_paired_reads", help="percent of reads that are properly paired", type=float) + parser.add_argument("--read_qual_mean", help="mean measure of the uncertainty of base calls", required=True, type=float, default="Unknown", nargs="?") + + # Drug Resistance + parser.add_argument("--drug_resistance_text", help="path of text file used for determining and displaying drug resistances", default=None) + parser.add_argument("--HRP2", help="value denoting whether the HRP2 marker is present or not -- true or false", default="Unknown") + parser.add_argument("--HRP3", help="value denoting whether the HRP3 marker is present or not -- true or false", default="Unknown") + + # Map + parser.add_argument("--longitude", help="longitude value of where the sample was collected", type=float, default=0) + parser.add_argument("--latitude", help="latitude value of where the sample collected", type=float, default=0) + parser.add_argument("--location", help="location where the sample was collected", default="Unknown") + parser.add_argument("--location_table", help="table with information about sample location related to its 3-letter code", required=True) + parser.add_argument("--code", help="three letter code corresponding to the site this sample was collected at") + + # QC Status + parser.add_argument("--qc_pass", help="status to determine whether or not the sequencing run passes quality control standards", required=True) + + ''' Analysis Page ''' + # Q-Scores Plot + parser.add_argument("--num_reads_q5", help="the number of reads where the probability of a given base call being wrong is approximately 1 in 3", required=True) + parser.add_argument("--num_reads_q7", help="the number of reads where the probability of a given base call being wrong is approximately 1 in 5", required=True) + 
parser.add_argument("--num_reads_q10", help="the number of reads where the probability of a given base call being wrong is 1 in 10", required=True) + parser.add_argument("--num_reads_q12", help="the number of reads where the probability of a given base call being wrong is approximately 1 in 16", required=True) + parser.add_argument("--num_reads_q15", help="the number of reads where the probability of a given base call being wrong is approximately 1 in 32", required=True) + + # Sequencing Summary + parser.add_argument("--aligned_bases", help="total number of bases aligned to the reference genome", required=True, type=float) + parser.add_argument("--aligned_reads", help="total number of reads aligned to the reference genome", required=True, type=float) + parser.add_argument("--fraction_aligned_bases", help="number of bases aligned out of all bases sequenced", required=True, type=float) + parser.add_argument("--average_identity", help="", required=True, type=float) # check + + # Coverage Plot + parser.add_argument("--fastqc_path", help="location of fastqc_report file; used to locate BAM files for coverage report generation") + parser.add_argument("--coverage_bin_size", help="number to use as size of bins for coverage plot generation; default is 1500", default=500, required=False, type=int) + + # Snapshots + parser.add_argument("--snapshots", required=True) + parser.add_argument("--regions_bed") + + parser.add_argument("--tech_flag", help="string denoting if the sample was sequenced using long read or short read techniques") + + # parse given arguments + args = parser.parse_args() + arg_dict = vars(args) + + qc_report_html = read_qc_report("/report-files/data/quality_report") + + summary = prepare_summary_data(arg_dict) + analysis = prepare_analysis_data(arg_dict) + qc_report = QCReport(qc_report_html) + + create_report(summary, analysis, qc_report) + + + + diff --git a/docker/lr-malaria-reports/report-files/templates/plots.js b/docker/lr-malaria-reports/report-files/templates/plots.js new file mode 100644 index 000000000..1fc831e58 --- /dev/null +++ b/docker/lr-malaria-reports/report-files/templates/plots.js @@ -0,0 +1,66 @@ +QSCORES = document.getElementById('qscores'); +data = document.querySelector("meta[name='qscores_y']").content; +console.log(data); + +function createQScoresPlot(data) { + var layout = { + width: 500, + height: 350, + margin: { + l: 95, + r: 80, + b: 90, + t: 10 + }, + paper_bgcolor:'rgba(0,0,0,0)', + plot_bgcolor:'rgba(0,0,0,0)', + modebar: { + // vertical modebar button layout + orientation: 'v', + // for demonstration purposes + bgcolor: 'white', + color: '#a7a8a8', + activecolor: '#9ED3CD' + }, + + // move the legend inside the plot on the top right + xaxis: { + title: { + text: 'Q Score', + font: { + family: 'Rubik', + size: 18, + color: '#7f7f7f' + } + }, + }, + yaxis: { + title: { + text: 'Number of Reads', + font: { + family: 'Rubik', + size: 18, + color: '#7f7f7f' + } + } + } + + } + + var config = { + displayModeBar: true // or 'true' for always visible + }; + + var data = data; // string + data = data.replace(/["']/g, ""); + y_values = JSON.parse(data); + + Plotly.plot(QSCORES, [{ + type: 'scatter', + x: [5, 7, 10, 12, 15], + y: y_values + }], layout, config) +} + +createQScoresPlot(data); +QSCORES.style.zIndex="100"; \ No newline at end of file diff --git a/docker/lr-malaria-reports/report-files/templates/report.html b/docker/lr-malaria-reports/report-files/templates/report.html new file mode 100644 index 000000000..5d4088154 --- /dev/null +++ 
b/docker/lr-malaria-reports/report-files/templates/report.html @@ -0,0 +1,2211 @@ + + + + + + + + + + + + LRMA Report + + + + + + + + + + + + + + + + +
+ + +
+ + +

Sample Name: + {{ sample.sample_name }}

+
+
+
+ + + + + + + +
+
+
+
+
+
Sample Info +
+ i + + High-level data and metadata about this sample. + +
+
+
+ +
+
+ +
+ +
+ +
+ +
+ +
+ +
+
{{ sample.info[0] }} +
+ +
{{ sample.info[1] }} +
+ +
{{ sample.info[2] }} +
+ +
{{ sample.info[3] }} +
+
+
+ Year
+ Date of Collection
+ Date of Sequencing
+ Species +
+
+
+ +
+
+
+
+
+ Location Info +
+ i + + Information about where the sample was collected. + +
+
+
+
+
+
+
+
+
+
+
+
+
+
Country +
+ +
Latitude +
+ +
Longitude +
+ +
Health Facility
+ +
Region
+ +
District
+ +
Type
+
+
+
{{ sample.location_info[2] }} +
+ +
{{ sample.location_info[0] }} +
+ +
{{ sample.location_info[1] }} +
+ +
{{ sample.location_info[3] }} +
+ +
{{ sample.location_info[4] }} +
+ +
{{ sample.location_info[5] }} +
+ +
{{ sample.location_info[6] }} +
+
+
+
+
Predicted RDT Region Presence +
+ i + + The presence or absence of the HRP-2 and HRP-3 regions is shown below: "+" indicates that the region is present; "-" indicates that it is absent. 

+ Presence or absence is determined by examining the coverage for this sample. If the coverage over a region is less than 5% of the coverage of the containing contig, that region is said to be deleted (for HRP-2: Pf3D7_08_v3:1373212-1376988; for HRP-3: Pf3D7_13_v3:2840236-2842840). Otherwise the region is counted as present. 
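A minimal sketch of that presence call, assuming the mean depths over the region and over its containing contig have already been computed; the function name and arguments are illustrative and are not part of this template or of report_gen.py:

    def call_rdt_region(region_mean_depth, contig_mean_depth, threshold=0.05):
        # "-" (deleted) when the region's coverage falls below 5% of the
        # coverage of the containing contig; "+" (present) otherwise.
        if contig_mean_depth <= 0:
            return "-"
        return "+" if region_mean_depth >= threshold * contig_mean_depth else "-"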
+
+
+
+
+
+

{{ sample.hrp2 }}
HRP-2 Result

+
+

{{ sample.hrp3 }}
HRP-3 Result

+
+ +
+
+ +
+
+
+ +
+
Map
+
+ i + + Interactive map showing where the sample was collected. + +
+
+ {{ sample.map|safe }} +
+
+
+ +
+
+
+
Predicted Drug Sensitivities +
+ i + + Sensitivity and resistance to widely used anti-malarial drugs are shown in the table. +

+ The presence of individual markers (i.e. variants / mutation events / protein changes) is shown in the plot. Red markers indicate the presence of a potential drug-resistance-related mutation event in this sample; white markers indicate the absence of such a marker. Amino acid changes are indicated by the vertical labels, grouped by gene (column group labels). 

+ The drug resistance markers shown below are defined in the methods of the Pf7 callset from the Sanger Institute. Each listed drug has a rubric by which resistance or sensitivity is determined based on the presence or absence of specific genotypes at known protein change sites. 

+ For more information, see: + Pf7 Resistance Classification +
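As a worked example of one such rubric, here is a sketch of the chloroquine rule summarized in the comments of report_gen.py in this PR (crt codon 76); the input is an already-extracted homozygous amino-acid call at codon 76, which is a simplification of the report lines the script actually parses:

    def classify_chloroquine(codon76_call):
        # K76 wild type                               -> SENSITIVE
        # 76T homozygous mutant                       -> RESISTANT
        # heterozygous, missing, or any other mutant  -> UNDETERMINED
        if codon76_call == "K":
            return "SENSITIVE"
        if codon76_call == "T":
            return "RESISTANT"
        return "UNDETERMINED"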
+
+
+
+
+ + + + + + + + + + + + + + + + + {% for item in sample.drug_res %} + {% if item == "RESISTANT" %} + + {% elif item == "SENSITIVE" %} + + {% else %} + + {% endif %} + {% endfor %} + + +
 ChloroquinePyrimethamineSulfadoxineMefloquineArtemisininPiperaquine
Sensitivity {{ item }} {{ item }} {{ item }}
+
+
+
+
+
+
+
Marker Absent
+
+
+
Marker Present
+
+
+ +
+
+
+
QC Info +
+ i + + Metrics about the quality of this sample. + +
+
+
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+
{{ sample.info[4] }} +
+ +
{{ sample.info[5]|int }} +
+ +
+ {% if sample.info[6] != "N/A" %} + {{ sample.info[6] }}% + {% else %} + {{ sample.info[6] }} + {% endif %} +
+ +
{{ sample.info[8] }} +
+
+
+ Aligned Fold Coverage
+ Aligned Read Length
+ Percent Properly Paired Reads
+ Read Quality Mean +
+
+
+ + + + +
+ + + + +
+
+
+ + + + + + +
+ + {% if qc_report.qc_report_html is defined and qc_report.qc_report_html %} + + + + + + {% endif %} + + + + + + + \ No newline at end of file diff --git a/wdl/pipelines/TechAgnostic/Utility/MalariaReportsWorkflow.wdl b/wdl/pipelines/TechAgnostic/Utility/MalariaReportsWorkflow.wdl new file mode 100644 index 000000000..9b567dc60 --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/MalariaReportsWorkflow.wdl @@ -0,0 +1,186 @@ +version 1.0 +import "../../../tasks/Visualization/MalariaReports.wdl" as MRS +import "../../../tasks/Visualization/MalariaReportsSnapshots.wdl" as Snapshot + +workflow GenerateMalariaReports { + + meta { + author: "Bridget Knight" + description: "## Report Generation \n This workflow calls the Python script that generates a sequencing report." + } + + parameter_meta { + # ------ Summary Page ------ # + + # Sample Info + sample_name: "name of sequenced sample" + upload_date: "date sample was uploaded" + collection_date: "date sample was collected" + sequencing_date: "date sample was sequenced" + species: "species of sample" + aligned_coverage: "number of reads uniquely mapped to a reference" + aligned_read_length: "number at which 50% of the read lengths are longer than this value" # check + pct_properly_paired_reads: "percent of reads that are properly paired/aligned" + read_qual_mean: "mean measure of the uncertainty of base calls" + + # Drug Resistance + drug_resistance_text: "text file used for determining and displaying drug resistances" + HRP2: "value denoting whether the HRP2 marker is present or not -- true or false" + HRP3: "value denoting whether the HRP3 marker is present or not -- true or false" + + # Map + longitude: "longitude value of where the sample was taken" + latitude: "latitude value of where the sample was taken" + location: "location where the sample was taken" + code: "the three letter code denoting the corresponding health facility/site" + location_table: "table containing the region, district, site, and facility type for this sample's location" + + # QC Status + qc_pass: "status to determine whether or not the sequencing run passes quality control standards" + + # ------ Analysis Page ------ # + barcode: "the barcode of the sample" + + # Q-Scores Plot + num_reads_q5: "the number of reads where the probability of a given base call being wrong is approximately 1 in 3" + num_reads_q7: "the number of reads where the probability of a given base call being wrong is approximately 1 in 5" + num_reads_q10: "the number of reads where the probability of a given base call being wrong is 1 in 10" + num_reads_q12: "the number of reads where the probability of a given base call being wrong is approximately 1 in 16" + num_reads_q15: "the number of reads where the probability of a given base call being wrong is approximately 1 in 31" + + # Sequencing Summary + aligned_bases: "total number of bases aligned to the reference genome" + aligned_reads: "total number of reads aligned to the reference genome" + fraction_aligned_bases: "number of bases aligned out of all bases in the sequence" + + # Coverage Plot + fastqc_path: "directory of fastqc_report used for finding BAM files" + coverage_bin_size: "number to use as size of bins for coverage plot generation; default is 1500" + + # ------ IGV Snapshots ------ # + regions_bed: "GCS path to bed file containing drug resistance loci" + ref_map_file: "table indicating reference sequence and auxillary file locations" + aligned_bam: "GCS path to aligned bam" + aligned_bai: "GCS path to aligned bai" + gcs_out_root_dir: 
"GCS bucket to store the results" + + ont_qc_report: "ONT QC report file" + + tech_flag: "string denoting if the sample was sequenced using long read or short read techniques" + } + + input { + # ------ Summary Page ------ # + + # Sample Info + String sample_name + String upload_date + String? collection_date + String? sequencing_date + String? species + Float aligned_coverage + Float? aligned_read_length + Float? pct_properly_paired_reads + Float? read_qual_mean + + # Drug Resistance + File? drug_resistance_text + String? HRP2 + String? HRP3 + + # Map + Float? longitude + Float? latitude + String? location + String? code + File location_table + + # QC Pass + String qc_pass + + # ------ Analysis Page ------ # + + String? barcode + + # Q-Scores Plot + Int num_reads_q5 + Int num_reads_q7 + Int num_reads_q10 + Int num_reads_q12 + Int num_reads_q15 + + # Sequencing Summary + Float aligned_bases + Int aligned_reads + Float fraction_aligned_bases + Float average_identity + + # Coverage Plot -- incomplete + File? fastqc_path + Int? coverage_bin_size + + # ------ IGV Snapshots ------ # + File aligned_bam + File aligned_bai + File regions_bed + File ref_map_file + + File? ont_qc_report + + String tech_flag + } + + Map[String, String] ref_map = read_map(ref_map_file) + + call Snapshot.DrugResIGV as GenerateSnapshots { + input: + sample_name = sample_name, + fasta_path = ref_map["fasta"], + fasta_index_path = ref_map["fai"], + regions_bed = regions_bed, + aligned_bam = aligned_bam, + aligned_bai = aligned_bai + } + + call MRS.RunReportScript as RunReportScript { + input: + sample_name = sample_name, + upload_date = upload_date, + collection_date = collection_date, + sequencing_date = sequencing_date, + species = species, + aligned_coverage = aligned_coverage, + aligned_read_length = aligned_read_length, + pct_properly_paired_reads = pct_properly_paired_reads, + read_qual_mean = read_qual_mean, + drug_resistance_text = drug_resistance_text, + HRP2 = HRP2, + HRP3 = HRP3, + longitude = longitude, + latitude = latitude, + location = location, + barcode = barcode, + num_reads_q5 = num_reads_q5, + num_reads_q7 = num_reads_q7, + num_reads_q10 = num_reads_q10, + num_reads_q12 = num_reads_q12, + num_reads_q15 = num_reads_q15, + aligned_bases = aligned_bases, + aligned_reads = aligned_reads, + fraction_aligned_bases = fraction_aligned_bases, + average_identity = average_identity, + fastqc_path = fastqc_path, + ont_qc_report = ont_qc_report, + coverage_bin_size = coverage_bin_size, + snapshots = GenerateSnapshots.snapshots, + regions_bed = regions_bed, + qc_pass = qc_pass, + code = code, + location_table = location_table, + tech_flag = tech_flag + } + + output { + File report = RunReportScript.report + } +} \ No newline at end of file diff --git a/wdl/tasks/Visualization/MalariaReports.wdl b/wdl/tasks/Visualization/MalariaReports.wdl new file mode 100644 index 000000000..6ab6471de --- /dev/null +++ b/wdl/tasks/Visualization/MalariaReports.wdl @@ -0,0 +1,235 @@ +version 1.0 + +import "../../structs/Structs.wdl" +task RunReportScript { + + meta { + description: "Use RunReportScript to start the report generation process." 
+ } + + parameter_meta { + runtime_attr_override: "Override the default runtime attributes" + + # ------ Summary Page ------ # + + # Sample Info + sample_name: "name of sequenced sample" + upload_date: "date sample was uploaded" + sequencing_date: "date sample was sequenced" + collection_date: "date sample was collected" + species: "species of sample" + aligned_coverage: "number of reads uniquely mapped to a reference" + aligned_read_length: "number at which 50% of the read lengths are longer than this value" # check + pct_properly_paired_reads: "median read length" + read_qual_mean: "mean measure of the uncertainty of base calls" + + # Drug Resistance + drug_resistance_text: "text file used for determining and displaying drug resistances" + HRP2: "value denoting whether the HRP2 marker is present or not -- true or false" + HRP3: "value denoting whether the HRP3 marker is present or not -- true or false" + + # Map + longitude: "longitude value of where the sample was taken" + latitude: "latitude value of where the sample was taken" + location: "location where the sample was taken" + code: "the three letter code denoting the corresponding health facility/site" + location_table: "table containing the region, district, site, and facility type for this sample's location" + + # QC Status + qc_pass: "status to determine whether or not the sequencing run passes quality control standards" + + # ------ Analysis Page ------ # + # Main Box + barcode: "the barcode of the sample" + + # Q-Scores Plot + num_reads_q5: "the number of reads where the probability of a given base call being wrong is approximately 1 in 3" + num_reads_q7: "the number of reads where the probability of a given base call being wrong is approximately 1 in 5" + num_reads_q10: "the number of reads where the probability of a given base call being wrong is 1 in 10" + num_reads_q12: "the number of reads where the probability of a given base call being wrong is approximately 1 in 16" + + # Sequencing Summary + aligned_bases: "total number of bases aligned to the reference genome" + aligned_reads: "total number of reads aligned to the reference genome" + fraction_aligned_bases: "percentage of bases aligned out of all bases in the sequence" + + # Coverage Plot + # coverage_dir: "directory of BAM files for coverage plot generation" + fastqc_path: "directory of fastqc_report used for finding BED and BAM files" + coverage_bin_size: "number to use as size of bins for coverage plot generation; default is 1500" + + ont_qc_report: "ONT QC report file" + + tech_flag: "string denoting if the sample was sequenced using long read or short read techniques" + } + + input { + RuntimeAttr? runtime_attr_override + + # ------ Summary Page ------ # + + # Sample Info + String sample_name + String upload_date + String? collection_date + String? sequencing_date + String? species + Float aligned_coverage + Float? aligned_read_length + Float? pct_properly_paired_reads + Float? read_qual_mean + + # Drug Resistance + File? drug_resistance_text + String? HRP2 + String? HRP3 + + # Map + Float? longitude + Float? latitude + String? location + String? code + File location_table + + # QC Pass + String qc_pass + + # ------ Analysis Page ------ # + + # Barcode + String? barcode + + # Q-Scores Plot + Int num_reads_q5 + Int num_reads_q7 + Int num_reads_q10 + Int num_reads_q12 + Int num_reads_q15 + + # Sequencing Summary + Float aligned_bases + Int aligned_reads + Float fraction_aligned_bases + Float average_identity + + # Coverage Plot -- incomplete + # String? 
coverage_dir + File? fastqc_path + Int? coverage_bin_size + + # Snapshots + File regions_bed + Array[File] snapshots + + # ONT QC Report Page + File? ont_qc_report + + String tech_flag + } + + Int disk_size_gb = 20 + ceil(size(drug_resistance_text, "GB")) + + # Wrap location in quotes in case it has spaces + String wrap_location = "'~{location}'" + + String results_dir = if (defined(fastqc_path)) then sub(select_first([fastqc_path]), "results\/.*", "") else "" + String coverage_dir = "~{results_dir}results/SRFlowcell/~{sample_name}/metrics/coverage/" + String coverage_regex = "~{coverage_dir}*?[0-9]_v3.regions.bed.gz" + + String quality_report = select_first([fastqc_path, ont_qc_report, ""]) + + command <<< + set -euxo + pwd + ls + + if [[ ~{coverage_regex} == "gs://"* ]]; then + echo "Retrieving BED files..." + echo ~{coverage_dir} + mkdir -p /report-files/data/coverage + gsutil ls ~{coverage_regex} > filelist.txt + echo "COPYING..." + cat filelist.txt | gsutil -m cp -I /report-files/data/coverage + else + echo "No BED files found." + fi + + if [ ! -z ~{quality_report} ]; then + echo "Retrieving quality report..." + mkdir -p /report-files/data/quality_report + echo ~{quality_report} + gsutil cp ~{quality_report} /report-files/data/quality_report + else + echo "No quality report found." + fi + + #mkdir -p /report-files/data/quality_report + #echo ~{fastqc_path} + #gsutil cp ~{fastqc_path} /report-files/data/quality_report + + echo "CREATING SUMMARY REPORT..." + python3 /report-files/report_gen.py \ + --sample_name ~{sample_name} \ + --upload_date ~{upload_date} \ + --collection_date ~{default="Unknown" collection_date} \ + --sequencing_date ~{default="Unknown" sequencing_date} \ + --species ~{default="P. falciparum" species} \ + --aligned_coverage ~{aligned_coverage} \ + --aligned_read_length ~{default=0 aligned_read_length} \ + --pct_properly_paired_reads ~{default=0 pct_properly_paired_reads} \ + --read_qual_mean ~{read_qual_mean} \ + --drug_resistance_text ~{default="None" drug_resistance_text} \ + --HRP2 ~{default="Unknown" HRP2} \ + --HRP3 ~{default="Unknown" HRP3} \ + --longitude ~{default=0 longitude} \ + --latitude ~{default=0 latitude} \ + --location ~{default="Unknown" wrap_location} \ + --barcode ~{default="Unknown" barcode} \ + --num_reads_q5 ~{num_reads_q5} \ + --num_reads_q7 ~{num_reads_q7} \ + --num_reads_q10 ~{num_reads_q10} \ + --num_reads_q12 ~{num_reads_q12} \ + --num_reads_q15 ~{num_reads_q15} \ + --aligned_bases ~{aligned_bases} \ + --aligned_reads ~{aligned_reads} \ + --fraction_aligned_bases ~{fraction_aligned_bases} \ + --average_identity ~{average_identity} \ + --coverage_bin_size ~{default=750 coverage_bin_size} \ + --snapshots ~{sep="," snapshots} \ + --regions_bed ~{regions_bed} \ + --qc_pass ~{default="N/A" qc_pass} \ + --code ~{default="" code} \ + --location_table ~{location_table} \ + --tech_flag ~{tech_flag} + echo "DONE!" 
+ >>> + + output { + File report = "~{sample_name}_lrma_report.html" + } + + + # ----------------------------------------------------------- # + # Runtime Config + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-malaria-reports:0.0.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + +} diff --git a/wdl/tasks/Visualization/MalariaReportsSnapshots.wdl b/wdl/tasks/Visualization/MalariaReportsSnapshots.wdl new file mode 100644 index 000000000..8eaf51ce6 --- /dev/null +++ b/wdl/tasks/Visualization/MalariaReportsSnapshots.wdl @@ -0,0 +1,81 @@ +version 1.0 + +import "../../structs/Structs.wdl" +task DrugResIGV { + + meta { + description: "DrugResIGV generates IGV snapshots for the different drug resistance loci for malaria." + } + + parameter_meta { + runtime_attr_override: "Override the default runtime attributes" + + regions_bed: "GCS path to bed file containing drug resistance loci" + fasta_path: "GCS path to fasta file" + fasta_index_path: "GCS path to fasta.fai file" + aligned_bam: "GCS path to aligned bam" + aligned_bai: "GCS path to aligned bai" + sample_name: "name of the sample sequenced" + } + + input { + RuntimeAttr? runtime_attr_override + + File aligned_bam + File aligned_bai + + File fasta_path + File fasta_index_path + File regions_bed + + String sample_name + } + + Int disk_size_gb = 20 + ceil(size(aligned_bam, "GB")) + + + command <<< + set -euxo pipefail + pwd + ls + + mkdir -p out + + echo "CREATING SNAPSHOTS..." + make_IGV_snapshots.py \ + -g ~{fasta_path} \ + ~{aligned_bam} \ + -r ~{regions_bed} \ + -nf4 -o out + >>> + + output { + Array[File] snapshots = glob("out/*.png") + File batch_script = "out/IGV_snapshots.bat" + } + + + # ----------------------------------------------------------- # + # Runtime Config + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "stevekm/igv-snapshot-automator" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + +}
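For reference, GenerateMalariaReports loads ref_map_file with read_map() and looks up the "fasta" and "fai" keys, so the file is expected to be a two-column, tab-separated table along the following lines; the bucket and file names are placeholders, not paths used by this PR:

    fasta	gs://<your-bucket>/references/Pfalciparum3D7_Genome.fasta
    fai	gs://<your-bucket>/references/Pfalciparum3D7_Genome.fasta.fai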