diff --git a/.dockstore.yml b/.dockstore.yml
index ff958fe50..57004902e 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -96,3 +96,6 @@ workflows:
- name: CleanupIntermediate
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CleanupIntermediate.wdl
+- name: GenerateMalariaReport
+ subclass: wdl
+ primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/MalariaReportsWorkflow.wdl
diff --git a/.gitignore b/.gitignore
index e46a4fc83..137469f0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ venv/
.tox
test/test_output/
__pycache__
+*.zip
+docker/lr-malaria-reports/report-files/Senegal_Terra_Metadata.csv
diff --git a/docker/lr-malaria-reports/.gitignore b/docker/lr-malaria-reports/.gitignore
new file mode 100644
index 000000000..9c2b7c643
--- /dev/null
+++ b/docker/lr-malaria-reports/.gitignore
@@ -0,0 +1,11 @@
+templates/map.html
+test_data/
+data/
+/.vscode
+*_report.html
+*.jpeg
+*.json
+*.ipynb
+*.txt
+*.bed.gz
+test.html
\ No newline at end of file
diff --git a/docker/lr-malaria-reports/Dockerfile b/docker/lr-malaria-reports/Dockerfile
new file mode 100644
index 000000000..a55c80ca0
--- /dev/null
+++ b/docker/lr-malaria-reports/Dockerfile
@@ -0,0 +1,36 @@
+# Base image
+FROM continuumio/miniconda3:4.9.2
+
+LABEL maintainer="Bridget Knight"
+
+# install gsutil
+RUN apt-get --allow-releaseinfo-change update
+RUN apt-get install -y curl git-lfs parallel
+RUN curl https://sdk.cloud.google.com | bash
+
+# Setup crcmodc for gsutil:
+RUN apt-get install -y gcc python3-dev python3-setuptools && \
+ pip3 uninstall -y crcmod && \
+ pip3 install --no-cache-dir -U crcmod
+
+#### Specific for google cloud support
+RUN apt-get install -y gnupg2
+RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \
+ && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \
+ && apt-get update -y \
+ && apt-get install google-cloud-sdk -y
+
+# make sure pip is up to date
+RUN pip3 install --upgrade pip==23.2.1
+
+# install other packages
+RUN pip3 install --no-cache-dir NumPy==1.24.4
+RUN pip3 install --no-cache-dir pandas==1.3.0
+RUN pip3 install --no-cache-dir argparse==1.4.0 folium==0.14.0 jinja2==3.1.2 \
+ MarkupSafe==2.1.3 matplotlib==3.7.3 \
+ plotly==5.15.0 tabulate==0.9.0
+
+ENV PATH=/root/google-cloud-sdk/bin/:${PATH}
+
+# copy other resources
+COPY ./report-files /report-files/
diff --git a/docker/lr-malaria-reports/Makefile b/docker/lr-malaria-reports/Makefile
new file mode 100644
index 000000000..bbd1f6711
--- /dev/null
+++ b/docker/lr-malaria-reports/Makefile
@@ -0,0 +1,18 @@
+IMAGE_NAME = lr-malaria-reports
+VERSION = 0.0.2
+
+TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION)
+TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest
+
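+# Plain `make` builds the image with both tags and pushes them; use
+# `make build_no_cache` to rebuild from scratch without reusing cached layers.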
+all: | build push
+
+build:
+ docker build -t $(TAG1) -t $(TAG2) .
+
+build_no_cache:
+ docker build --no-cache -t $(TAG1) -t $(TAG2) .
+
+push:
+ docker push $(TAG1)
+ docker push $(TAG2)
+
diff --git a/docker/lr-malaria-reports/report-files/report_gen.py b/docker/lr-malaria-reports/report-files/report_gen.py
new file mode 100644
index 000000000..d6f7ab224
--- /dev/null
+++ b/docker/lr-malaria-reports/report-files/report_gen.py
@@ -0,0 +1,859 @@
+import os
+
+# Data manipulation
+import numpy as np
+import pandas as pd
+import random as rnd
+import tabulate
+import re
+from enum import Enum
+import glob
+import datetime
+
+# Plotting
+import matplotlib.pyplot as plt
+import matplotlib
+import plotly.express as px
+import folium
+import plotly.graph_objects as go
+import math
+from collections import defaultdict
+from matplotlib.collections import PatchCollection
+
+# HTML
+from markupsafe import Markup
+import jinja2
+import io
+from io import StringIO
+import base64
+import urllib.parse
+
+# Argument Parsing
+import argparse
+
+'''
+matplotlib render settings
+'''
+# Make big figures:
+gFIG_SIZE_in = [14, 10]
+
+# Set plotting defaults:
+gPLOT_PARAMS = {
+ "legend.fontsize": "x-large",
+ "figure.figsize": gFIG_SIZE_in,
+ "axes.labelsize": "x-large",
+ "axes.titlesize": "x-large",
+ "xtick.labelsize": "x-large",
+ "ytick.labelsize": "x-large"
+}
+matplotlib.rcParams.update(gPLOT_PARAMS)
+
+# Some single-place definitions of sizes for plots / figures:
+gFONT_SIZE_UNITS = "pt"
+gTITLE_FONT_SIZE = 36
+gAXIS_LABEL_FONT_SIZE = 24
+gTICK_LABEL_FONT_SIZE = 16
+gTEXT_FONT_SIZE = 16
+
+# To track global figure number creation:
+gFIG_NUM = 0
+
+def fix_plot_visuals(fig,
+ titlesize=gTITLE_FONT_SIZE,
+ labelsize=gAXIS_LABEL_FONT_SIZE,
+ ticklabelsize=gTICK_LABEL_FONT_SIZE,
+ textsize=gTEXT_FONT_SIZE,
+ tight_rect=None):
+ """Fix the plot elements to be appropriate sizes for a slide / presentation."""
+
+ if not textsize:
+ textsize = ticklabelsize
+
+ for ax in fig.get_axes():
+
+ for ticklabel in (ax.get_xticklabels()):
+ ticklabel.set_fontsize(ticklabelsize)
+ for ticklabel in (ax.get_yticklabels()):
+ ticklabel.set_fontsize(ticklabelsize)
+ for c in ax.get_children():
+ if c.__class__ == matplotlib.text.Text:
+ c.set_fontsize(textsize)
+
+ ax.xaxis.get_label().set_fontsize(labelsize)
+ ax.yaxis.get_label().set_fontsize(labelsize)
+ ax.title.set_fontsize(titlesize)
+
+ for c in fig.get_children():
+ if c.__class__ == matplotlib.legend.Legend:
+ c.prop.set_size(ticklabelsize)
+ c.get_title().set_size(ticklabelsize)
+
+ if tight_rect:
+ fig.tight_layout(rect=tight_rect)
+ else:
+ fig.tight_layout()
+
+ if fig._suptitle:
+ sup_title = fig._suptitle.get_text()
+ fig.suptitle(sup_title, fontsize=titlesize)
+
+ # Make it so we can actually see what's happening on the plots with "dark mode":
+ fig.patch.set_facecolor("white")
+
+
+'''
+Coverage Plot
+'''
+def plot_coverage(directory, sample_name, bin_width=500):
+
+ # only looking for .bed.gz files
+ ext = (".bed.gz")
+
+ # Set up data
+ beds = pd.DataFrame(columns=["stop", "depth"])
+ sorted_file_list = list()
+
+ # Reading and sorting files if coverage data is available
+ if os.path.exists(directory):
+ file_list = os.listdir(directory)
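+        # Sort the per-contig coverage files numerically by contig number; file names are
+        # assumed to look something like "Pf3D7_01_v3.regions.bed.gz" (hypothetical example),
+        # so item.split(".")[-4].split("_")[1] pulls out the "01".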
+ sorted_file_list = sorted(file_list, key=lambda item: item.split(".")[-4].split("_")[1] if item.endswith(".bed.gz") else "00")
+ print(f"Files to be plotted: {sorted_file_list}")
+ else:
+ return None
+
+ # Plot setup
+ fig, ax = plt.subplots(1,1, figsize=(15, 9))
+ plt.title(f"Coverage Plot of Sample {sample_name}", pad=12, fontsize=12)
+ plt.xlabel("Contig (bp)", labelpad=10, fontsize=12)
+ plt.ylabel("Depth", labelpad=10, fontsize=12)
+ color = "#2494C7"
+ tick_positions = []
+ contigs = []
+ bin_max = 0
+
+ # Iterate over each bed.gz in test_data folder
+ for idx, file in enumerate(sorted_file_list):
+
+ if file.endswith(ext):
+ # Reading current .bed.gz file
+ f = os.path.join(directory, file)
+ #print(f"\n{idx} Current file: {f}")
+
+ # Create DataFrame per .bed.gz
+ bed_df = pd.read_csv(f, delimiter="\\t", names=["contig", "start", "stop", "depth"], engine="python")
+
+ # Get bins
+ start_min = bed_df["start"].min()
+ stop_max = bed_df["stop"].max()
+ bins = np.arange(start_min, stop_max+1, bin_width)
+ values = np.zeros(len(bins))
+
+ # Iterrate through each DataFrame and populate bins
+ #print("Populating bins...")
+ for _, row in bed_df.iterrows():
+ avg = (row["stop"]+row["start"])/2
+ index = int(np.floor(avg/bin_width))
+ values[index]+=row["depth"]
+
+ # Append new data to DF
+ #print("Plotting data...")
+ if(idx == 0):
+ ax.plot(bins, values, ".", c=color)
+ tick_positions.append(math.floor(stop_max/2)+bin_max)
+ bin_max = max(bed_df["stop"])
+ else:
+ ax.plot(bins+bin_max, values, ".", c=color)
+ tick_positions.append(math.floor(stop_max/2)+bin_max)
+ bin_max = bin_max + max(bed_df["stop"])
+
+ # Saving xtick data
+ contigs.append(bed_df["contig"][0])
+
+ # Setting xticks
+ ax.set_xticks(ticks=tick_positions)
+ ax.set_xticklabels(labels=contigs, rotation=90)
+ ax.set_yscale("log")
+ fix_plot_visuals(fig)
+
+ return fig
+
+'''
+Drug Resistance Table
+'''
+def create_drug_table(file):
+ if not file:
+ resistances = ["UNDETERMINED","UNDETERMINED","UNDETERMINED","UNDETERMINED","UNDETERMINED","UNDETERMINED"]
+ else:
+ data = open(file, 'r').read()
+ resistances = list(get_drug_resistance(data, None, None, do_print=True))
+
+ resistances_tbl = pd.DataFrame(columns = ["Chloroquine", "Pyrimethamine", "Sulfadoxine", "Mefloquine", "Artemisinin", "Piperaquine"])
+ resistances = map(str, resistances)
+ resistances = [s.replace('DrugSensitivity.','') for s in list(resistances)]
+
+ return resistances
+
+# Set up some functions to determine drug sensitivity:
+# vars for markers in drug_resistance_report.txt
+present = '+'
+absent = '-'
+
+pchange_regex = re.compile(r"""p\.([A-Za-z]+)([0-9]+)([A-Za-z]+)""")
+AA_3_2 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}
+
+DrugSensitivity = Enum("DrugSensitivity", ["UNDETERMINED", "SENSITIVE", "RESISTANT"])
+
+def parse_pchange(pchange):
+ old, pos, new = pchange_regex.match(pchange).groups()
+ return AA_3_2[old.upper()], int(pos), AA_3_2[new.upper()]
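+# For example (illustrative input), parse_pchange("p.Lys76Thr") returns ("K", 76, "T").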
+
+def get_chloroquine_sensitivity(dr_report):
+# Locus utilized: PF3D7_0709000 (crt)
+# Codon: 76
+# Workflow:
+# Step Genetic change Interpretation Classification
+# 1 76 K/T heterozygote Heterozygous mutant Undetermined
+# 2 76 missing Missing Undetermined
+# 3 K76 Wild type Sensitive
+# 4 76T Mutant Resistant
+# 5 otherwise Unknown mutant Undetermined
+
+ for line in StringIO(dr_report):
+ line = ' '.join(line.split())
+ # We only care about this locus for this drug:
+ if line.startswith("pfcrt PF3D7_0709000"):
+ gene, locus, pchange, marker = line.strip().split(" ")
+ old, pos, new = parse_pchange(pchange)
+ if pos == 76:
+                if (old == "K") and (new == "T") and (marker == absent):
+ return DrugSensitivity.SENSITIVE
+ elif (new == "T") and (marker == present):
+ return DrugSensitivity.RESISTANT
+
+ return DrugSensitivity.UNDETERMINED
+
+def get_pyrimethamine_sensitivity(dr_report):
+# Locus utilized: PF3D7_0417200 (dhfr)
+# Codon: 108
+# Workflow:
+# Step Genetic change Interpretation Classification
+# 1 108 S/N heterozygote Heterozygous mutant Undetermined
+# 2 108 missing Missing Undetermined
+# 3 S108 Wild type Sensitive
+# 4 108N Mutant Resistant
+# 5 otherwise Unknown mutant Undetermined
+
+ for line in StringIO(dr_report):
+ line = ' '.join(line.split())
+ # We only care about this locus for this drug:
+ if line.startswith("pfdhfr PF3D7_0417200"):
+ gene, locus, pchange, marker = line.strip().split(" ")
+ old, pos, new = parse_pchange(pchange)
+ if pos == 108:
+ if (old == "S") and (new == "N") and (marker == absent):
+ return DrugSensitivity.SENSITIVE
+ elif (old == "S") and (new == "N") and (marker == present):
+ return DrugSensitivity.RESISTANT
+ elif (new == "N") and (marker == present):
+ return DrugSensitivity.RESISTANT
+ elif marker == absent:
+ return DrugSensitivity.SENSITIVE
+
+ return DrugSensitivity.UNDETERMINED
+
+def get_sulfadoxine_sensitivity(dr_report):
+# Locus utilized: PF3D7_0810800 (dhps)
+# Codon: 437
+# Workflow:
+# Step Genetic change Interpretation Classification
+# 1 437 A/G heterozygote Heterozygous mutant Undetermined
+# 2 437 missing Missing Undetermined
+# 3 A437 Wild type Sensitive
+# 4 437G Mutant Resistant
+# 5 otherwise Unknown mutant Undetermined
+
+ for line in StringIO(dr_report):
+ line = ' '.join(line.split())
+
+ # We only care about this locus for this drug:
+ if line.startswith("pfdhps PF3D7_0810800"):
+ gene, locus, pchange, marker = line.strip().split(" ")
+ old, pos, new = parse_pchange(pchange)
+ if pos == 437:
+ if (old == "A") and (new == "G") and (marker == absent):
+ return DrugSensitivity.SENSITIVE
+ elif (old == "A") and (new == "G") and (marker == present):
+ return DrugSensitivity.RESISTANT
+ elif (new == "G") and (marker == present):
+ return DrugSensitivity.RESISTANT
+ elif marker == absent:
+ return DrugSensitivity.SENSITIVE
+
+ return DrugSensitivity.UNDETERMINED
+
+def get_mefloquine_sensitivity(dr_report):
+# Locus utilized: PF3D7_0523000 (mdr1)
+# Codons: Amplification status of whole gene
+# Workflow:
+# Step Genetic change Interpretation Classification
+# 1 Missing Missing Undetermined
+# 2 Heterozygous duplication Heterozygous mutant Undetermined
+# 3 Single copy Wild type Sensitive
+# 4 Multiple copies Mutant Resistant
+
+ # Currently we can't determine this.
+ # We need to get CNV calling working first.
+
+ return DrugSensitivity.UNDETERMINED
+
+def get_artemisinin_sensitivity(dr_report):
+# Locus utilized: PF3D7_1343700 (kelch13)
+# Codons: 349-726 (BTB/POZ and propeller domains)
+# Workflow:
+# Step Genetic change Interpretation Classification
+# 1 Homozygous non-synonymous mutations in the kelch13 BTB/POZ and propeller
+# domain classified by the World Health Organisation as associated with delayed
+# parasite clearance
+# Mutant – associated with delayed clearance Resistant
+# 2 Heterozygous non-synonymous mutations in the kelch13 BTB/POZ and
+# propeller domain classified by the World Health Organisation as associated
+# with delayed parasite clearance
+# Mutant - heterozygous Undetermined
+# 3 578S as homozygous Mutant - not associated Sensitive
+# 4 Any missing call in amino acids 349-726 Missing Undetermined
+# 5 No non-reference calls in amino acids 349-726 Wild-type Sensitive
+# 6 otherwise Mutant - not in WHO list Undetermined
+
+    has_variants = False
+    has_non_ref = False
+
+ for line in StringIO(dr_report):
+ line = ' '.join(line.split())
+ # We only care about this locus for this drug:
+ if line.startswith("pfkelch13 PF3D7_1343700"):
+ gene, locus, pchange, marker = line.strip().split(" ")
+ old, pos, new = parse_pchange(pchange)
+
+ if 349 <= pos <= 726:
+ if (old != new) and (marker == present):
+ return DrugSensitivity.RESISTANT
+ elif (new == "S") and (marker == present):
+ return DrugSensitivity.SENSITIVE
+ elif marker == present:
+ has_non_ref = True
+ has_variants = True
+ if (has_variants) and (not has_non_ref):
+ return DrugSensitivity.SENSITIVE
+
+ return DrugSensitivity.UNDETERMINED
+
+def get_piperaquine_sensitivity(dr_report):
+# Loci utilized: PF3D7_1408000 (plasmepsin 2) and PF3D7_1408100 (plasmepsin 3)
+# Codons: Amplification status of both genes
+# Workflow:
+# Step Genetic change Interpretation Classification
+# 1 Missing Missing Undetermined
+# 2 Heterozygous duplication Heterozygous mutant Undetermined
+# 3 Single copy Wild type Sensitive
+# 4 Multiple copies Mutant Resistant
+
+ # Currently we can't determine this.
+ # We need to get CNV calling working first.
+
+ return DrugSensitivity.UNDETERMINED
+
+def get_drug_resistance(data, sample_id, sample_df, do_print=False):
+
+ chloroquine = get_chloroquine_sensitivity(data)
+ pyrimethamine = get_pyrimethamine_sensitivity(data)
+ sulfadoxine = get_sulfadoxine_sensitivity(data)
+ mefloquine = get_mefloquine_sensitivity(data)
+ artemisinin = get_artemisinin_sensitivity(data)
+ piperaquine = get_piperaquine_sensitivity(data)
+
+ return chloroquine, pyrimethamine, sulfadoxine, mefloquine, artemisinin, piperaquine
+
+def plot_dr_bubbles(dr_report_file):
+ # Set up dataframes using a single report:
+ # Download the file contents:
+ with open(dr_report_file, 'r') as f:
+ dr_report_contents = f.read()
+
+ # Set up our data:
+ dataframe_dict = dict()
+ drug_resistance_df = pd.DataFrame(columns=["Sample", "Chloroquine", "Pyrimethamine", "Sulfadoxine", "Mefloquine", "Artemisinin", "Piperaquine"])
+
+ # Get the raw info here:
+ last_gene = None
+ cur_df = None
+ for line in dr_report_contents.split("\n"):
+ if len(line) > 1:
+ gene, loc, variant, presence = line.split(" ")
+ if gene != last_gene:
+ # must make a new dataframe
+ if last_gene:
+ dataframe_dict[last_gene] = cur_df
+ cur_df = pd.DataFrame({'Sample': pd.Series(dtype='str')})
+ cur_df[variant] = pd.Series(dtype='bool')
+ last_gene = gene
+ dataframe_dict[gene] = cur_df
+
+ # Get the raw info here:
+ sample_id = dr_report_file[dr_report_file.find("/SEN_")+1:dr_report_file.find("_ALL_scored")]
+
+ dr_info = defaultdict(list)
+ for line in dr_report_contents.split("\n"):
+ if len(line) > 1:
+ gene, loc, variant, presence = line.split(" ")
+ presence = presence == "present"
+
+ dr_info[gene].append([variant, presence])
+
+ # Now process it into dataframes:
+ for gene, variant_info in dr_info.items():
+
+ # set up place to put new markers:
+ gene_df = dataframe_dict[gene]
+ columns = list(gene_df.columns)
+
+ gene_df.loc[len(gene_df.index)] = [sample_id] + (len(columns)-1) * [None]
+
+ # Now add the markers:
+ for v, presence in variant_info:
+ gene_df.loc[len(gene_df.index)-1, v] = presence
+
+ # Set up logistics of our data:
+ nrows = len(next(iter(dataframe_dict.values())))
+ ncols = sum([len(d.columns) for d in dataframe_dict.values()])
+
+ aa_dict = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
+ 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
+ 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
+ 'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}
+ aa_dict_rev = {v:k for k, v in aa_dict.items()}
+
+ pchange_string_re = re.compile(r'P\.([A-Z]+)(\d+)([A-Z]+)')
+
+ xlabels = []
+ for df in dataframe_dict.values():
+ for variant in df.columns[1:]:
+ m = pchange_string_re.match(variant.upper())
+ xlabels.append(f"{aa_dict[m[1]]}{m[2]}{aa_dict[m[3]]}")
+
+ # Now set up the circles we will plot:
+ print("Computing plots")
+ radius = 0.4
+ circles = []
+ x_tick_pos = []
+ x_gene_offset = 0
+ for gene_index, (g, df) in enumerate(dataframe_dict.items()):
+ y = nrows-1
+ for row_index, row in df.iterrows():
+ x = 0 + x_gene_offset
+ for col_index, col in enumerate(row[1:]):
+ edgecolor = [0]*3
+ facecolor = [1,0,0] if col else [1]*3
+ c = plt.Circle((x+gene_index, y), facecolor=facecolor, edgecolor=edgecolor)
+ c.set(radius=radius)
+ circles.append(c)
+ x_tick_pos.append(x+gene_index)
+ x += 1
+ y -= 1
+ x_gene_offset += len(df.columns)-1
+
+ x_tick_pos = sorted(list(set(x_tick_pos)))
+
+ ############################
+
+ print("Generating Plots")
+ fig, ax = plt.subplots()
+
+ col = PatchCollection(circles, match_original=True)
+ ax.add_collection(col)
+
+ # Now set the gene labels:
+ x_gene_offset = 0
+ for gene_index, (g, df) in enumerate(dataframe_dict.items()):
+ line_x = [x_gene_offset, x_gene_offset+len(df.columns)-2]
+ lbl_x = sum(line_x)/2
+ plt.text(lbl_x, nrows+2, g, ha="center", fontfamily="monospace", size="large")
+ plt.plot(line_x, [nrows+1]*2, '-k', linewidth=2)
+ x_gene_offset += len(df.columns)
+
+ ax.set(xticks=x_tick_pos,
+ yticks=[],
+ xticklabels=xlabels,
+ yticklabels=[])
+
+
+ ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='center')
+
+ ax.axis([-1, ncols-1, -1, nrows+3])
+ ax.set_aspect(aspect='equal')
+
+ ax.spines['top'].set_visible(False)
+ ax.spines['right'].set_visible(False)
+ ax.spines['bottom'].set_visible(False)
+ ax.spines['left'].set_visible(False)
+
+ return fig
+
+'''
+Map
+'''
+def create_map(coordinates, sample_name):
+ m = folium.Map(location=coordinates, zoom_start = 10)
+
+ if coordinates != [0,0]:
+ folium.Marker(location=coordinates, popup = ('Sample: '+sample_name), icon=folium.Icon(color='red',prefix='fa',icon='circle'), parse_html=True).add_to(m)
+
+ m.get_root().width = "491px"
+ m.get_root().height = "420px"
+ map_html = m.get_root()._repr_html_()
+
+ return map_html
+
+'''
+Location Info
+'''
+def capitalize_proper_nouns(text):
+ """Capitalizes the first letter of each word in a string, assuming all words are proper nouns."""
+ if type(text) == str:
+ words = re.split(r'(\s+|-|\')', text)
+ capitalized_words = []
+ for i, word in enumerate(words):
+ if i % 2 == 0:
+ capitalized_words.append(word.capitalize())
+ else:
+ capitalized_words.append(word.replace('-', '. ').capitalize())
+ return ''.join(capitalized_words)
+ else:
+ return text
+
+def parse_location_info(metadata_file):
+ """Parses location data by matching three letter code to region, district, site, and type, in order."""
+ facility_types = {"C": "Clinic", "SS": "Sentinel Site", "SI": "Special Interest",
+ "PP": "PECADOM Plus", "C + PP": "Clinic, PECADOM Plus", "SS + PP": "Sentinel Site, PECADOM Plus"}
+
+ metadata = pd.read_csv(metadata_file)
+ metadata = metadata.replace(np.nan, "Unspecified").replace(facility_types)
+ metadata.iloc[:, 1:4] = metadata.iloc[:, 1:4].applymap(capitalize_proper_nouns)
+
+ # Make dictionary matching code to region, district, site, and type, in order
+ loc_dict = metadata.groupby("CODE")[list(metadata.columns[1:5])].agg(list).to_dict("index")
+ return loc_dict
+
+def extract_code(sample_name):
+ """Extracts the three-letter location code from this sample's name."""
+ code = (sample_name.split("_")[2]).split(".")[0]
+ return code
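+# For example (illustrative sample name), extract_code("SEN_2024_ABC.1") returns "ABC".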
+
+def get_sample_loc_info(loc_dict, sample_name, code = None):
+ extracted_code = extract_code(sample_name)
+ entry = next((loc_dict[c] for c in (code, extracted_code) if c in loc_dict), None)
+ if entry:
+ return [entry["SITE"][0], entry["REGION"][0], entry["DISTRICT"][0], entry["TYPE"][0]]
+ else:
+ return ["Unspecified"] * 4
+
+'''
+Quality Report (FastQC or ONT QC)
+'''
+def read_qc_report(directory):
+ '''
+ Function to read HTML from FastQC or ONT QC report as string to pass to Jinja template IFrame
+ '''
+    html = None
+
+    if os.path.exists(directory):
+        # There should only be one file in the directory (report-files/data/quality_report)
+        for file in os.listdir(directory):
+            print(f"{directory} + {file}")
+            with open(os.path.join(directory, file), "r", encoding="utf-8") as f:
+                html = f.read()
+
+    return html
+
+'''
+Snapshots
+'''
+def get_bed_info(bed_file):
+ '''
+ Input: Bed file containing drug resistance loci and their names.
+ Output: A list containing the names of all the drug resistance regions and a list of loci in the format of start-end.
+ '''
+ column_names = ["chromosome", "start", "end", "name"]
+    bed = pd.read_csv(bed_file, sep=r"\s+", header=None, names=column_names)
+ bed["start"] = pd.Series(map(format_long_number, bed["start"]))
+ bed["end"] = pd.Series(map(format_long_number, bed["end"]))
+
+ bed["locus"] = bed["start"].apply(str) + "-" + bed["end"].apply(str)
+
+ return list(bed.name), list(bed.locus)
+
+'''
+Utility
+'''
+def plot_to_b64(plot, bbox_inches=None):
+ '''
+ Input: A matplotlib plot.
+ Output: The b64 string conversion of the input plot.
+ '''
+ # Create I/O buffer to save image in bytes
+ stringIObytes = io.BytesIO()
+ plot.savefig(stringIObytes, format="jpeg", bbox_inches = bbox_inches)
+
+ # Retrieve byte-string and encode it in base64
+ stringIObytes.seek(0)
+ plot_b64 = base64.b64encode(stringIObytes.read())
+
+ return plot_b64
+
+def img_to_b64(img):
+ with open(img, "rb") as image_file:
+ return base64.b64encode(image_file.read())
+
+def format_dates(date_string):
+ '''
+ Input: A string of numbers in the form of YYYYMMDD, such as 20240528 (May 28th, 2024).
+    Output: A human-readable date such as "May 28, 2024", or "Unknown" if no date was given.
+ '''
+ if date_string in ["", None, "N/A"]:
+ return "Unknown"
+ else:
+ date = datetime.date(int(date_string[:4]), int(date_string[4:6]), int(date_string[6:]))
+ month = date.strftime("%B")
+ return f"{month} {date_string[6:]}, {date_string[:4]}"
+
+def check_unknown(data):
+ '''
+ Input: A variable of any type to check if it is valid for passing to the report.
+    Output: The variable, or "Unknown" if it is missing or invalid.
+ '''
+ return "Unknown" if data in [None, "None", "Unknown", "N/A", 0] else data
+
+def format_long_number(number):
+ return '{:,}'.format(number)
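+# For example, format_long_number(1373212) returns "1,373,212".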
+
+'''
+Report Generation & Templating
+'''
+class Sample:
+ '''
+ This class holds all variables that will be used on the summary page of the report.
+
+ Additionally, this class holds variables that are used on both the summary and analysis pages,
+ such as the sample name.
+ '''
+
+ def __init__(self, sample_name, hrp2, hrp3, drug_res, info, _map, location_info, qc_pass, dr_bubbles, tech_flag):
+ '''This function defines the class variables and retrieves them from their respective functions.'''
+
+ self.sample_name = sample_name
+ self.hrp2 = hrp2
+ self.hrp3 = hrp3
+ self.drug_res = drug_res
+ self.info = info
+ self.map = _map
+ self.location_info = location_info
+ self.qc_pass = qc_pass
+ self.dr_bubbles = dr_bubbles
+ self.tech_flag = tech_flag
+
+class Analysis:
+ '''
+ This class holds all variables used on the analysis page of the report.
+ '''
+
+ def __init__(self, sequencing_summary, qscore_reads, qscore_scores, barcode, coverage_plot, snapshots, res_loci_names, res_loci):
+ '''This function defines the variables used and retrieves them from their respective functions.'''
+
+ self.sequencing_summary = sequencing_summary
+ self.scores = qscore_scores
+ self.reads = qscore_reads
+ self.barcode = barcode
+ self.coverage_plot = coverage_plot
+ self.snapshots = snapshots
+ self.res_loci_names = res_loci_names
+ self.res_loci = res_loci
+
+class QCReport:
+ def __init__(self, qc_report_html):
+ self.qc_report_html = qc_report_html
+
+def prepare_summary_data(arg_dict):
+ '''
+ Gathers all data needed for summary page from inputs.
+ '''
+
+ sample_name = arg_dict['sample_name']
+
+ upload_date = arg_dict['upload_date'][0]
+ collection_date = arg_dict['collection_date']
+ sequencing_date = arg_dict['sequencing_date']
+ species = ' '.join(arg_dict['species'])
+ location_table = arg_dict["location_table"]
+ sample_code = arg_dict["code"]
+ tech_flag = arg_dict["tech_flag"]
+
+ info = [upload_date, format_dates(collection_date), format_dates(sequencing_date), species, round(arg_dict['aligned_coverage'], 2), check_unknown(arg_dict['aligned_read_length']),
+ check_unknown(arg_dict['pct_properly_paired_reads']), 0, round(arg_dict['read_qual_mean'], 2)]
+ processed_info = [item if item not in ["", None] else "N/A" for item in info]
+
+ qc_pass = arg_dict["qc_pass"]
+ if (qc_pass == "true"):
+ qc_pass = "PASS"
+ elif (qc_pass == "false"):
+ qc_pass = "FAIL"
+
+    dr_file = None if arg_dict["drug_resistance_text"] in [None, "None", ""] else arg_dict["drug_resistance_text"]
+    resistances = create_drug_table(dr_file)
+    # Only attempt the bubble plot when a drug resistance report was actually provided
+    resistance_bubbles_b64 = plot_to_b64(plot_dr_bubbles(dr_file), "tight") if dr_file else None
+
+ loc_dict = parse_location_info(location_table)
+
+ location_info = [check_unknown(round(arg_dict['latitude'], 2)), check_unknown(round(arg_dict['longitude'], 2)), arg_dict['location'],
+ *get_sample_loc_info(loc_dict, sample_name, sample_code)]
+ coordinates = [arg_dict['latitude'], arg_dict['longitude']]
+ _map = create_map(coordinates, sample_name)
+
+ HRP2 = arg_dict['HRP2']
+ HRP3 = arg_dict['HRP3']
+
+ return Sample(sample_name, HRP2, HRP3, resistances, processed_info, _map, location_info, qc_pass, resistance_bubbles_b64, tech_flag)
+
+def prepare_analysis_data(arg_dict):
+ '''
+ Gathers all data needed for analysis page from inputs.
+ '''
+
+ frac_bases = check_unknown(arg_dict["fraction_aligned_bases"])
+
+ sequencing_summary = [0, 0, format_long_number(arg_dict['aligned_bases']), format_long_number(arg_dict['aligned_reads']),
+ str(round(frac_bases, 4)*100)[:5], round(arg_dict['average_identity'], 2)]
+
+ barcode = arg_dict['barcode']
+
+ qscorex = [5, 7, 10, 12, 15] # available q-score measures are predetermined
+ qscorey = [arg_dict['num_reads_q5'], arg_dict['num_reads_q7'], arg_dict['num_reads_q10'], arg_dict['num_reads_q12'], arg_dict['num_reads_q15']]
+
+ # Create coverage plot and convert it to base64
+ coverage_bin_size = arg_dict["coverage_bin_size"]
+ coverage_plot = plot_coverage("/report-files/data/coverage", arg_dict["sample_name"], coverage_bin_size) # default bin size = 750
+ coverage_b64 = plot_to_b64(coverage_plot) if coverage_plot not in ["", None] else None
+
+ # IGV Snapshots
+ snapshots = str(arg_dict["snapshots"])
+
+ snapshots = snapshots.split(",")
+ snapshots_b64 = [img_to_b64(image) for image in snapshots]
+
+    with open(arg_dict["regions_bed"], "r") as bed_file:
+        loci_names, loci = get_bed_info(bed_file)
+
+ return Analysis(sequencing_summary, qscorey, qscorex, barcode, coverage_b64, snapshots_b64, loci_names, loci)
+
+def create_report(sample, analysis, qc_report):
+ '''
+ This function handles the final steps of report generation.
+
+ It pools all variables and data needed for the report and creates two objects:
+ analysis and summary. These objects hold the variables and are passed to their respective
+ page templates (analysis or summary).
+ '''
+
+ print("path: "+os.path.dirname(os.path.realpath(__file__)))
+ print("cwd: "+os.getcwd())
+
+ # creating summary page
+ templateLoader = jinja2.FileSystemLoader(searchpath='/report-files/templates/')
+ templateEnv = jinja2.Environment(loader=templateLoader)
+
+ # Add zip to Jinja2 namespace
+ templateEnv.globals.update(zip=zip)
+
+ TEMPLATE_FILE = 'report.html' # may need to change if file is moved
+ template = templateEnv.get_template(TEMPLATE_FILE)
+ output = template.render(sample=sample, analysis=analysis, qc_report = qc_report)
+
+ file_name = os.path.join(os.getcwd(),sample.sample_name+'_lrma_report.html')
+ print(file_name)
+
+ with open(file_name, 'w') as fh:
+ fh.write(output)
+
+ print('Report generated!')
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser()
+
+ ''' Summary Page '''
+ parser.add_argument("--barcode", help="barcode of the sample", default="Unknown")
+
+ # Sample Info
+ parser.add_argument("--sample_name", help="name of sequenced sample", required=True)
+ parser.add_argument("--upload_date", help="date sample was uploaded", nargs='+', required=True)
+ parser.add_argument("--collection_date", help="date sample was collected", required=True, default="Unknown", nargs="?")
+ parser.add_argument("--sequencing_date", help="date sample was sequenced", default="Unknown", nargs="?")
+ parser.add_argument("--species", help="species of sample", nargs='+', default="P. falciparum")
+ parser.add_argument("--aligned_coverage", help="number of times the bases in the sequenced reads cover the target genome", required=True, type=float)
+ parser.add_argument("--aligned_read_length", help="number at which 50%\ of the read lengths are longer than this value", type=float)
+ parser.add_argument("--pct_properly_paired_reads", help="percent of reads that are properly paired", type=float)
+ parser.add_argument("--read_qual_mean", help="mean measure of the uncertainty of base calls", required=True, type=float, default="Unknown", nargs="?")
+
+ # Drug Resistance
+ parser.add_argument("--drug_resistance_text", help="path of text file used for determining and displaying drug resistances", default=None)
+ parser.add_argument("--HRP2", help="value denoting whether the HRP2 marker is present or not -- true or false", default="Unknown")
+ parser.add_argument("--HRP3", help="value denoting whether the HRP3 marker is present or not -- true or false", default="Unknown")
+
+ # Map
+ parser.add_argument("--longitude", help="longitude value of where the sample was collected", type=float, default=0)
+ parser.add_argument("--latitude", help="latitude value of where the sample collected", type=float, default=0)
+ parser.add_argument("--location", help="location where the sample was collected", default="Unknown")
+ parser.add_argument("--location_table", help="table with information about sample location related to its 3-letter code", required=True)
+ parser.add_argument("--code", help="three letter code corresponding to the site this sample was collected at")
+
+ # QC Status
+ parser.add_argument("--qc_pass", help="status to determine whether or not the sequencing run passes quality control standards", required=True)
+
+ ''' Analysis Page '''
+ # Q-Scores Plot
+ parser.add_argument("--num_reads_q5", help="the number of reads where the probability of a given base call being wrong is approximately 1 in 3", required=True)
+ parser.add_argument("--num_reads_q7", help="the number of reads where the probability of a given base call being wrong is approximately 1 in 5", required=True)
+ parser.add_argument("--num_reads_q10", help="the number of reads where the probability of a given base call being wrong is 1 in 10", required=True)
+ parser.add_argument("--num_reads_q12", help="the number of reads where the probability of a given base call being wrong is approximately 1 in 16", required=True)
+ parser.add_argument("--num_reads_q15", help="the number of reads where the probability of a given base call being wrong is approximately 1 in 32", required=True)
+
+ # Sequencing Summary
+ parser.add_argument("--aligned_bases", help="total number of bases aligned to the reference genome", required=True, type=float)
+ parser.add_argument("--aligned_reads", help="total number of reads aligned to the reference genome", required=True, type=float)
+ parser.add_argument("--fraction_aligned_bases", help="number of bases aligned out of all bases sequenced", required=True, type=float)
+ parser.add_argument("--average_identity", help="", required=True, type=float) # check
+
+ # Coverage Plot
+ parser.add_argument("--fastqc_path", help="location of fastqc_report file; used to locate BAM files for coverage report generation")
+ parser.add_argument("--coverage_bin_size", help="number to use as size of bins for coverage plot generation; default is 1500", default=500, required=False, type=int)
+
+ # Snapshots
+ parser.add_argument("--snapshots", required=True)
+ parser.add_argument("--regions_bed")
+
+ parser.add_argument("--tech_flag", help="string denoting if the sample was sequenced using long read or short read techniques")
+
+ # parse given arguments
+ args = parser.parse_args()
+ arg_dict = vars(args)
+
+ qc_report_html = read_qc_report("/report-files/data/quality_report")
+
+ summary = prepare_summary_data(arg_dict)
+ analysis = prepare_analysis_data(arg_dict)
+ qc_report = QCReport(qc_report_html)
+
+ create_report(summary, analysis, qc_report)
+
+
+
+
diff --git a/docker/lr-malaria-reports/report-files/templates/plots.js b/docker/lr-malaria-reports/report-files/templates/plots.js
new file mode 100644
index 000000000..1fc831e58
--- /dev/null
+++ b/docker/lr-malaria-reports/report-files/templates/plots.js
@@ -0,0 +1,66 @@
+var QSCORES = document.getElementById('qscores');
+var data = document.querySelector("meta[name='qscores_y']").content;
+console.log(data);
+
+function createQScoresPlot(data) {
+ var layout = {
+ width: 500,
+ height: 350,
+ margin: {
+ l: 95,
+ r: 80,
+ b: 90,
+ t: 10
+ },
+ paper_bgcolor:'rgba(0,0,0,0)',
+ plot_bgcolor:'rgba(0,0,0,0)',
+ modebar: {
+ // vertical modebar button layout
+ orientation: 'v',
+ // for demonstration purposes
+ bgcolor: 'white',
+ color: '#a7a8a8',
+ activecolor: '#9ED3CD'
+ },
+
+ // move the legend inside the plot on the top right
+ xaxis: {
+ title: {
+ text: 'Q Score',
+ font: {
+ family: 'Rubik',
+ size: 18,
+ color: '#7f7f7f'
+ }
+ },
+ },
+ yaxis: {
+ title: {
+ text: 'Number of Reads',
+ font: {
+ family: 'Rubik',
+ size: 18,
+ color: '#7f7f7f'
+ }
+ }
+ }
+
+ }
+
+ var config = {
+ displayModeBar: true // or 'true' for always visible
+ };
+
+    // The meta tag content arrives as a quoted string such as "[10, 20, 30, 40, 50]",
+    // so strip the quotes before parsing it into an array of read counts.
+    data = data.replace(/["']/g, "");
+    var y_values = JSON.parse(data);
+
+    Plotly.newPlot(QSCORES, [{
+ type: 'scatter',
+ x: [5, 7, 10, 12, 15],
+ y: y_values
+ }], layout, config)
+}
+
+createQScoresPlot(data);
+QSCORES.style.zIndex="100";
\ No newline at end of file
diff --git a/docker/lr-malaria-reports/report-files/templates/report.html b/docker/lr-malaria-reports/report-files/templates/report.html
new file mode 100644
index 000000000..5d4088154
--- /dev/null
+++ b/docker/lr-malaria-reports/report-files/templates/report.html
@@ -0,0 +1,2211 @@
+
+
+
+
+
+
+
+
+
+
+
+ LRMA Report
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Sample Info
+
+ i
+
+ High-level data and metadata about this sample.
+
+
+
+
+
+
+
+
+
{{ sample.info[0] }}
+
+
+
{{ sample.info[1] }}
+
+
+
{{ sample.info[2] }}
+
+
+
{{ sample.info[3] }}
+
+
+
+ Year
+ Date of Collection
+ Date of Sequencing
+ Species
+
+
+
+
+
+
+
+
+
+ Location Info
+
+ i
+
+ Information about where the sample was collected.
+
+
+
+
+
+
Country
+
+
+
Latitude
+
+
+
Longitude
+
+
+
Health Facility
+
+
Region
+
+
District
+
+
Type
+
+
+
{{ sample.location_info[2] }}
+
+
+
{{ sample.location_info[0] }}
+
+
+
{{ sample.location_info[1] }}
+
+
+
{{ sample.location_info[3] }}
+
+
+
{{ sample.location_info[4] }}
+
+
+
{{ sample.location_info[5] }}
+
+
+
{{ sample.location_info[6] }}
+
+
+
+
+
Predicted RDT Region Presence
+
+ i
+
+ The presence of the HRP-2 and HRP-3 regions is presented below. "+" indicates the presence of either region ; "-" indicates the absence of the region.
+
+ Presence or absence is determined by examining the coverage for this sample. If the coverage over a region is less than 5% of the coverage of the containing contig, that region is said to be deleted (for HRP-2: Pf3D7_08_V3:1373212-1376988; for HRP-3: Pf3D7_13_v3:2840236-2842840). Otherwise the region is counted as present.
+
+
+
+
+
+
+
{{ sample.hrp2 }} HRP-2 Result
+
+
{{ sample.hrp3 }} HRP-3 Result
+
+
+
+
+
+
+
+
+
Map
+
+ i
+
+ Interactive map showing where the sample was collected.
+
+
+
+ {{ sample.map|safe }}
+
+
+
+
+
+
+
+
Predicted Drug Sensitivities
+
+
+
+
+
+
+
+
+
+ Chloroquine
+ Pyrimethamine
+ Sulfadoxine
+ Mefloquine
+ Artemisinin
+ Piperaquine
+
+
+
+
+ Sensitivity
+ {% for item in sample.drug_res %}
+ {% if item == "RESISTANT" %}
+ {{ item }}
+ {% elif item == "SENSITIVE" %}
+ {{ item }}
+ {% else %}
+ {{ item }}
+ {% endif %}
+ {% endfor %}
+
+
+
+
+
+
+
+
+
+
+
+
+
QC Info
+
+ i
+
+ Metrics about the quality of this sample.
+
+
+
+
+
+
+
+
+
{{ sample.info[4] }}
+
+
+
{{ sample.info[5]|int }}
+
+
+
+ {% if sample.info[6] != "N/A" %}
+ {{ sample.info[6] }}%
+ {% else %}
+ {{ sample.info[6] }}
+ {% endif %}
+
+
+
{{ sample.info[8] }}
+
+
+
+ Aligned Fold Coverage
+ Aligned Read Length
+ Percent Properly Paired Reads
+ Read Quality Mean
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{{ analysis.barcode }}
+
+
+
24 SNP Molecular Barcode
+
+
+
+
Sequencing Summary
+
+ i
+
+ High-level sequencing information about this sample.
+
+ Aligned Bases : Total number of bases successfully aligned to the reference genome.
+ Aligned Reads : Total number of reads aligned successfully to the reference genome.
+ Fraction of Aligned Bases : Fraction of the total number of bases sequenced that were aligned successfully to the reference genome.
+ Average Identity : Measure of nucleotide-level similarity between the sample and the reference genome.
+
+
+
{{ analysis.sequencing_summary[2]|string|replace(".0","") }} bp
+
{{ analysis.sequencing_summary[3]|string|replace(".0","") }}
+
{{ analysis.sequencing_summary[4] }}%
+
{{ analysis.sequencing_summary[5] }}%
+
+
+
Aligned Bases
+
Aligned Reads
+
Fraction of Aligned Bases
+
Average Identity
+
+
+
+
+
+
+ {% if analysis.coverage_plot is defined and analysis.coverage_plot %}
+
+ {% endif %}
+
+
+
+
IGV Snapshots
+
+ i
+
+ Screenshots of IGV plots around genes of interest.
+
+ Click on the main image to enlarge it. Click on other images to select them. Click on arrows to view more regions to select.
+
+
+
+
+
+
+
Locus: {{ analysis.res_loci_names[0] }}
,
{{ analysis.res_loci[0]|string|replace(".0","") }}
+
+
+
+
+ ◄
+
+
+ {% for image, name in zip(analysis.snapshots, analysis.res_loci_names) %}
+
+
+
+
+
{{ name }}
+
+ {% endfor %}
+
+
+ ▶
+
+
+
+
+
+
+
+
+
+
+
+
+ {% if qc_report.qc_report_html is defined and qc_report.qc_report_html %}
+
+
+
+
+
+ {% endif %}
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/wdl/pipelines/TechAgnostic/Utility/MalariaReportsWorkflow.wdl b/wdl/pipelines/TechAgnostic/Utility/MalariaReportsWorkflow.wdl
new file mode 100644
index 000000000..9b567dc60
--- /dev/null
+++ b/wdl/pipelines/TechAgnostic/Utility/MalariaReportsWorkflow.wdl
@@ -0,0 +1,186 @@
+version 1.0
+import "../../../tasks/Visualization/MalariaReports.wdl" as MRS
+import "../../../tasks/Visualization/MalariaReportsSnapshots.wdl" as Snapshot
+
+workflow GenerateMalariaReports {
+
+ meta {
+ author: "Bridget Knight"
+ description: "## Report Generation \n This workflow calls the Python script that generates a sequencing report."
+ }
+
+ parameter_meta {
+ # ------ Summary Page ------ #
+
+ # Sample Info
+ sample_name: "name of sequenced sample"
+ upload_date: "date sample was uploaded"
+ collection_date: "date sample was collected"
+ sequencing_date: "date sample was sequenced"
+ species: "species of sample"
+ aligned_coverage: "number of reads uniquely mapped to a reference"
+ aligned_read_length: "number at which 50% of the read lengths are longer than this value" # check
+ pct_properly_paired_reads: "percent of reads that are properly paired/aligned"
+ read_qual_mean: "mean measure of the uncertainty of base calls"
+
+ # Drug Resistance
+ drug_resistance_text: "text file used for determining and displaying drug resistances"
+ HRP2: "value denoting whether the HRP2 marker is present or not -- true or false"
+ HRP3: "value denoting whether the HRP3 marker is present or not -- true or false"
+
+ # Map
+ longitude: "longitude value of where the sample was taken"
+ latitude: "latitude value of where the sample was taken"
+ location: "location where the sample was taken"
+ code: "the three letter code denoting the corresponding health facility/site"
+ location_table: "table containing the region, district, site, and facility type for this sample's location"
+
+ # QC Status
+ qc_pass: "status to determine whether or not the sequencing run passes quality control standards"
+
+ # ------ Analysis Page ------ #
+ barcode: "the barcode of the sample"
+
+ # Q-Scores Plot
+ num_reads_q5: "the number of reads where the probability of a given base call being wrong is approximately 1 in 3"
+ num_reads_q7: "the number of reads where the probability of a given base call being wrong is approximately 1 in 5"
+ num_reads_q10: "the number of reads where the probability of a given base call being wrong is 1 in 10"
+ num_reads_q12: "the number of reads where the probability of a given base call being wrong is approximately 1 in 16"
+ num_reads_q15: "the number of reads where the probability of a given base call being wrong is approximately 1 in 31"
+
+ # Sequencing Summary
+ aligned_bases: "total number of bases aligned to the reference genome"
+ aligned_reads: "total number of reads aligned to the reference genome"
+ fraction_aligned_bases: "number of bases aligned out of all bases in the sequence"
+
+ # Coverage Plot
+ fastqc_path: "directory of fastqc_report used for finding BAM files"
+ coverage_bin_size: "number to use as size of bins for coverage plot generation; default is 1500"
+
+ # ------ IGV Snapshots ------ #
+ regions_bed: "GCS path to bed file containing drug resistance loci"
+ ref_map_file: "table indicating reference sequence and auxillary file locations"
+ aligned_bam: "GCS path to aligned bam"
+ aligned_bai: "GCS path to aligned bai"
+
+ ont_qc_report: "ONT QC report file"
+
+ tech_flag: "string denoting if the sample was sequenced using long read or short read techniques"
+ }
+
+ input {
+ # ------ Summary Page ------ #
+
+ # Sample Info
+ String sample_name
+ String upload_date
+ String? collection_date
+ String? sequencing_date
+ String? species
+ Float aligned_coverage
+ Float? aligned_read_length
+ Float? pct_properly_paired_reads
+ Float? read_qual_mean
+
+ # Drug Resistance
+ File? drug_resistance_text
+ String? HRP2
+ String? HRP3
+
+ # Map
+ Float? longitude
+ Float? latitude
+ String? location
+ String? code
+ File location_table
+
+ # QC Pass
+ String qc_pass
+
+ # ------ Analysis Page ------ #
+
+ String? barcode
+
+ # Q-Scores Plot
+ Int num_reads_q5
+ Int num_reads_q7
+ Int num_reads_q10
+ Int num_reads_q12
+ Int num_reads_q15
+
+ # Sequencing Summary
+ Float aligned_bases
+ Int aligned_reads
+ Float fraction_aligned_bases
+ Float average_identity
+
+ # Coverage Plot -- incomplete
+ File? fastqc_path
+ Int? coverage_bin_size
+
+ # ------ IGV Snapshots ------ #
+ File aligned_bam
+ File aligned_bai
+ File regions_bed
+ File ref_map_file
+
+ File? ont_qc_report
+
+ String tech_flag
+ }
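+
+    # A sketch of what a Cromwell/Terra inputs JSON for this workflow might look like
+    # (values are hypothetical, and only a subset of the required inputs is shown):
+    # {
+    #   "GenerateMalariaReports.sample_name":    "SEN_2024_ABC.1",
+    #   "GenerateMalariaReports.upload_date":    "2024",
+    #   "GenerateMalariaReports.aligned_bam":    "gs://my-bucket/SEN_2024_ABC.1.bam",
+    #   "GenerateMalariaReports.aligned_bai":    "gs://my-bucket/SEN_2024_ABC.1.bam.bai",
+    #   "GenerateMalariaReports.ref_map_file":   "gs://my-bucket/ref_map.tsv",
+    #   "GenerateMalariaReports.regions_bed":    "gs://my-bucket/drug_res_loci.bed",
+    #   "GenerateMalariaReports.location_table": "gs://my-bucket/Senegal_Terra_Metadata.csv",
+    #   "GenerateMalariaReports.qc_pass":        "true",
+    #   "GenerateMalariaReports.tech_flag":      "long read"
+    # }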
+
+ Map[String, String] ref_map = read_map(ref_map_file)
+
+ call Snapshot.DrugResIGV as GenerateSnapshots {
+ input:
+ sample_name = sample_name,
+ fasta_path = ref_map["fasta"],
+ fasta_index_path = ref_map["fai"],
+ regions_bed = regions_bed,
+ aligned_bam = aligned_bam,
+ aligned_bai = aligned_bai
+ }
+
+ call MRS.RunReportScript as RunReportScript {
+ input:
+ sample_name = sample_name,
+ upload_date = upload_date,
+ collection_date = collection_date,
+ sequencing_date = sequencing_date,
+ species = species,
+ aligned_coverage = aligned_coverage,
+ aligned_read_length = aligned_read_length,
+ pct_properly_paired_reads = pct_properly_paired_reads,
+ read_qual_mean = read_qual_mean,
+ drug_resistance_text = drug_resistance_text,
+ HRP2 = HRP2,
+ HRP3 = HRP3,
+ longitude = longitude,
+ latitude = latitude,
+ location = location,
+ barcode = barcode,
+ num_reads_q5 = num_reads_q5,
+ num_reads_q7 = num_reads_q7,
+ num_reads_q10 = num_reads_q10,
+ num_reads_q12 = num_reads_q12,
+ num_reads_q15 = num_reads_q15,
+ aligned_bases = aligned_bases,
+ aligned_reads = aligned_reads,
+ fraction_aligned_bases = fraction_aligned_bases,
+ average_identity = average_identity,
+ fastqc_path = fastqc_path,
+ ont_qc_report = ont_qc_report,
+ coverage_bin_size = coverage_bin_size,
+ snapshots = GenerateSnapshots.snapshots,
+ regions_bed = regions_bed,
+ qc_pass = qc_pass,
+ code = code,
+ location_table = location_table,
+ tech_flag = tech_flag
+ }
+
+ output {
+ File report = RunReportScript.report
+ }
+}
\ No newline at end of file
diff --git a/wdl/tasks/Visualization/MalariaReports.wdl b/wdl/tasks/Visualization/MalariaReports.wdl
new file mode 100644
index 000000000..6ab6471de
--- /dev/null
+++ b/wdl/tasks/Visualization/MalariaReports.wdl
@@ -0,0 +1,235 @@
+version 1.0
+
+import "../../structs/Structs.wdl"
+task RunReportScript {
+
+ meta {
+ description: "Use RunReportScript to start the report generation process."
+ }
+
+ parameter_meta {
+ runtime_attr_override: "Override the default runtime attributes"
+
+ # ------ Summary Page ------ #
+
+ # Sample Info
+ sample_name: "name of sequenced sample"
+ upload_date: "date sample was uploaded"
+ sequencing_date: "date sample was sequenced"
+ collection_date: "date sample was collected"
+ species: "species of sample"
+ aligned_coverage: "number of reads uniquely mapped to a reference"
+ aligned_read_length: "number at which 50% of the read lengths are longer than this value" # check
+ pct_properly_paired_reads: "median read length"
+ read_qual_mean: "mean measure of the uncertainty of base calls"
+
+ # Drug Resistance
+ drug_resistance_text: "text file used for determining and displaying drug resistances"
+ HRP2: "value denoting whether the HRP2 marker is present or not -- true or false"
+ HRP3: "value denoting whether the HRP3 marker is present or not -- true or false"
+
+ # Map
+ longitude: "longitude value of where the sample was taken"
+ latitude: "latitude value of where the sample was taken"
+ location: "location where the sample was taken"
+ code: "the three letter code denoting the corresponding health facility/site"
+ location_table: "table containing the region, district, site, and facility type for this sample's location"
+
+ # QC Status
+ qc_pass: "status to determine whether or not the sequencing run passes quality control standards"
+
+ # ------ Analysis Page ------ #
+ # Main Box
+ barcode: "the barcode of the sample"
+
+ # Q-Scores Plot
+ num_reads_q5: "the number of reads where the probability of a given base call being wrong is approximately 1 in 3"
+ num_reads_q7: "the number of reads where the probability of a given base call being wrong is approximately 1 in 5"
+ num_reads_q10: "the number of reads where the probability of a given base call being wrong is 1 in 10"
+        num_reads_q12: "the number of reads where the probability of a given base call being wrong is approximately 1 in 16"
+        num_reads_q15: "the number of reads where the probability of a given base call being wrong is approximately 1 in 32"
+
+ # Sequencing Summary
+ aligned_bases: "total number of bases aligned to the reference genome"
+ aligned_reads: "total number of reads aligned to the reference genome"
+ fraction_aligned_bases: "percentage of bases aligned out of all bases in the sequence"
+
+ # Coverage Plot
+ # coverage_dir: "directory of BAM files for coverage plot generation"
+ fastqc_path: "directory of fastqc_report used for finding BED and BAM files"
+ coverage_bin_size: "number to use as size of bins for coverage plot generation; default is 1500"
+
+ ont_qc_report: "ONT QC report file"
+
+ tech_flag: "string denoting if the sample was sequenced using long read or short read techniques"
+ }
+
+ input {
+ RuntimeAttr? runtime_attr_override
+
+ # ------ Summary Page ------ #
+
+ # Sample Info
+ String sample_name
+ String upload_date
+ String? collection_date
+ String? sequencing_date
+ String? species
+ Float aligned_coverage
+ Float? aligned_read_length
+ Float? pct_properly_paired_reads
+ Float? read_qual_mean
+
+ # Drug Resistance
+ File? drug_resistance_text
+ String? HRP2
+ String? HRP3
+
+ # Map
+ Float? longitude
+ Float? latitude
+ String? location
+ String? code
+ File location_table
+
+ # QC Pass
+ String qc_pass
+
+ # ------ Analysis Page ------ #
+
+ # Barcode
+ String? barcode
+
+ # Q-Scores Plot
+ Int num_reads_q5
+ Int num_reads_q7
+ Int num_reads_q10
+ Int num_reads_q12
+ Int num_reads_q15
+
+ # Sequencing Summary
+ Float aligned_bases
+ Int aligned_reads
+ Float fraction_aligned_bases
+ Float average_identity
+
+ # Coverage Plot -- incomplete
+ # String? coverage_dir
+ File? fastqc_path
+ Int? coverage_bin_size
+
+ # Snapshots
+ File regions_bed
+ Array[File] snapshots
+
+ # ONT QC Report Page
+ File? ont_qc_report
+
+ String tech_flag
+ }
+
+ Int disk_size_gb = 20 + ceil(size(drug_resistance_text, "GB"))
+
+ # Wrap location in quotes in case it has spaces
+ String wrap_location = "'~{location}'"
+
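+    # Derive the coverage directory from the FastQC report path. For a hypothetical
+    # fastqc_path such as "gs://bucket/results/SRFlowcell/SAMPLE/report/fastqc_report.html",
+    # results_dir becomes "gs://bucket/" and coverage_regex then matches the per-contig
+    # "*_v3.regions.bed.gz" files under .../results/SRFlowcell/<sample_name>/metrics/coverage/.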
+ String results_dir = if (defined(fastqc_path)) then sub(select_first([fastqc_path]), "results\/.*", "") else ""
+ String coverage_dir = "~{results_dir}results/SRFlowcell/~{sample_name}/metrics/coverage/"
+ String coverage_regex = "~{coverage_dir}*?[0-9]_v3.regions.bed.gz"
+
+ String quality_report = select_first([fastqc_path, ont_qc_report, ""])
+
+ command <<<
+        set -euxo pipefail
+ pwd
+ ls
+
+ if [[ ~{coverage_regex} == "gs://"* ]]; then
+ echo "Retrieving BED files..."
+ echo ~{coverage_dir}
+ mkdir -p /report-files/data/coverage
+ gsutil ls ~{coverage_regex} > filelist.txt
+ echo "COPYING..."
+ cat filelist.txt | gsutil -m cp -I /report-files/data/coverage
+ else
+ echo "No BED files found."
+ fi
+
+        if [ -n "~{quality_report}" ]; then
+ echo "Retrieving quality report..."
+ mkdir -p /report-files/data/quality_report
+ echo ~{quality_report}
+ gsutil cp ~{quality_report} /report-files/data/quality_report
+ else
+ echo "No quality report found."
+ fi
+
+ #mkdir -p /report-files/data/quality_report
+ #echo ~{fastqc_path}
+ #gsutil cp ~{fastqc_path} /report-files/data/quality_report
+
+ echo "CREATING SUMMARY REPORT..."
+ python3 /report-files/report_gen.py \
+ --sample_name ~{sample_name} \
+ --upload_date ~{upload_date} \
+ --collection_date ~{default="Unknown" collection_date} \
+ --sequencing_date ~{default="Unknown" sequencing_date} \
+ --species ~{default="P. falciparum" species} \
+ --aligned_coverage ~{aligned_coverage} \
+ --aligned_read_length ~{default=0 aligned_read_length} \
+ --pct_properly_paired_reads ~{default=0 pct_properly_paired_reads} \
+            --read_qual_mean ~{default=0 read_qual_mean} \
+ --drug_resistance_text ~{default="None" drug_resistance_text} \
+ --HRP2 ~{default="Unknown" HRP2} \
+ --HRP3 ~{default="Unknown" HRP3} \
+ --longitude ~{default=0 longitude} \
+ --latitude ~{default=0 latitude} \
+ --location ~{default="Unknown" wrap_location} \
+ --barcode ~{default="Unknown" barcode} \
+ --num_reads_q5 ~{num_reads_q5} \
+ --num_reads_q7 ~{num_reads_q7} \
+ --num_reads_q10 ~{num_reads_q10} \
+ --num_reads_q12 ~{num_reads_q12} \
+ --num_reads_q15 ~{num_reads_q15} \
+ --aligned_bases ~{aligned_bases} \
+ --aligned_reads ~{aligned_reads} \
+ --fraction_aligned_bases ~{fraction_aligned_bases} \
+ --average_identity ~{average_identity} \
+ --coverage_bin_size ~{default=750 coverage_bin_size} \
+ --snapshots ~{sep="," snapshots} \
+ --regions_bed ~{regions_bed} \
+ --qc_pass ~{default="N/A" qc_pass} \
+            --code "~{default="" code}" \
+ --location_table ~{location_table} \
+ --tech_flag ~{tech_flag}
+ echo "DONE!"
+ >>>
+
+ output {
+ File report = "~{sample_name}_lrma_report.html"
+ }
+
+
+ # ----------------------------------------------------------- #
+ # Runtime Config
+
+ RuntimeAttr default_attr = object {
+ cpu_cores: 1,
+ mem_gb: 16,
+ disk_gb: disk_size_gb,
+ boot_disk_gb: 10,
+ preemptible_tries: 2,
+ max_retries: 1,
+ docker: "us.gcr.io/broad-dsp-lrma/lr-malaria-reports:0.0.2"
+ }
+ RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+ runtime {
+ cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+ memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+ disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+ bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+ preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+ maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+ docker: select_first([runtime_attr.docker, default_attr.docker])
+ }
+
+}
diff --git a/wdl/tasks/Visualization/MalariaReportsSnapshots.wdl b/wdl/tasks/Visualization/MalariaReportsSnapshots.wdl
new file mode 100644
index 000000000..8eaf51ce6
--- /dev/null
+++ b/wdl/tasks/Visualization/MalariaReportsSnapshots.wdl
@@ -0,0 +1,81 @@
+version 1.0
+
+import "../../structs/Structs.wdl"
+task DrugResIGV {
+
+ meta {
+ description: "DrugResIGV generates IGV snapshots for the different drug resistance loci for malaria."
+ }
+
+ parameter_meta {
+ runtime_attr_override: "Override the default runtime attributes"
+
+ regions_bed: "GCS path to bed file containing drug resistance loci"
+ fasta_path: "GCS path to fasta file"
+ fasta_index_path: "GCS path to fasta.fai file"
+ aligned_bam: "GCS path to aligned bam"
+ aligned_bai: "GCS path to aligned bai"
+ sample_name: "name of the sample sequenced"
+ }
+
+ input {
+ RuntimeAttr? runtime_attr_override
+
+ File aligned_bam
+ File aligned_bai
+
+ File fasta_path
+ File fasta_index_path
+ File regions_bed
+
+ String sample_name
+ }
+
+ Int disk_size_gb = 20 + ceil(size(aligned_bam, "GB"))
+
+
+ command <<<
+ set -euxo pipefail
+ pwd
+ ls
+
+ mkdir -p out
+
+ echo "CREATING SNAPSHOTS..."
+ make_IGV_snapshots.py \
+ -g ~{fasta_path} \
+ ~{aligned_bam} \
+ -r ~{regions_bed} \
+ -nf4 -o out
+ >>>
+
+ output {
+ Array[File] snapshots = glob("out/*.png")
+ File batch_script = "out/IGV_snapshots.bat"
+ }
+
+
+ # ----------------------------------------------------------- #
+ # Runtime Config
+
+ RuntimeAttr default_attr = object {
+ cpu_cores: 1,
+ mem_gb: 16,
+ disk_gb: disk_size_gb,
+ boot_disk_gb: 10,
+ preemptible_tries: 2,
+ max_retries: 1,
+ docker: "stevekm/igv-snapshot-automator"
+ }
+ RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+ runtime {
+ cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+ memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+ disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+ bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+ preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+ maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+ docker: select_first([runtime_attr.docker, default_attr.docker])
+ }
+
+}