Skip to content

Commit

Permalink
Added the fastp log to the final report
Browse files Browse the repository at this point in the history
  • Loading branch information
GallVp committed Oct 31, 2024
1 parent 14ca56e commit d4624de
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 22 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6. Added a sequence labels table below the HiC contact map [#147](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/147)
7. Added parameter `hic_samtools_ext_args` and set its default value to `-F 3852` [#159](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/159)
8. Added the HiC QC report to the final report so that users don't have to navigate to the results folder [#162](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/162)
9. Added the fastp log to the final report [#163](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/163)

### `Fixed`

Expand Down
54 changes: 54 additions & 0 deletions bin/report_modules/parsers/hic_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,50 @@
from report_modules.parsers.parsing_commons import sort_list_of_results


def colorize_fastp_log(log: Path):
    """Render a fastp log file as a colour-coded HTML ``<pre>`` block.

    Every line of the log is stripped of surrounding whitespace, HTML-escaped
    and wrapped in a ``<span>``. Lines that match one of the known fastp
    section markers get an inline ``color`` style; all other lines get a
    plain ``<span>``.

    Args:
        log: Path to the fastp log file. It is read as text in one go.

    Returns:
        A string that starts with ``<pre>`` and ends with ``</pre>`` and
        holds one ``<span>...</span>`` per log line, each followed by a
        newline.
    """
    # Function-scope import so the block does not depend on the module's
    # top-level import list.
    from html import escape

    section_colors = {
        "adapter": "color: blue;",
        "before_filtering": "color: goldenrod;",
        "after_filtering": "color: green;",
        "filtering_result": "color: green;",
        "duplication": "color: red;",
        "fastp": "color: gray;",
        "version": "color: blue;",
    }

    # Checked in insertion order; the first pattern that matches wins.
    patterns = {
        "adapter": re.compile(r"Detecting adapter sequence for read\d..."),
        "before_filtering": re.compile(r"Read\d before filtering:"),
        "after_filtering": re.compile(r"Read\d after filtering:"),
        "filtering_result": re.compile(r"Filtering result:"),
        "duplication": re.compile(r"Duplication rate:"),
        "fastp": re.compile(r"fastp --in"),
        "version": re.compile(r"fastp v"),
    }

    def _render(line):
        """Return the ``<span>`` markup for a single raw log line."""
        # The log is free text (it echoes the command line and file names),
        # so '&', '<' and '>' must be escaped before being placed in HTML.
        # Quotes are left alone because the text is element content, not an
        # attribute value.
        text = escape(line.strip(), quote=False)

        # Match against the raw line so escaping can never affect detection.
        for section, pattern in patterns.items():
            if pattern.search(line):
                return f"<span style='{section_colors[section]}'>{text}</span>"

        # Default styling for lines outside the known sections.
        return f"<span>{text}</span>"

    # split("\n") (not splitlines) keeps the original behaviour of emitting
    # an empty span for a trailing newline or an empty file.
    body = "".join(f"{_render(line)}\n" for line in log.read_text().split("\n"))

    return f"<pre>\n{body}</pre>"


def parse_hic_folder(folder_name="hic_outputs"):
dir = os.getcwdb().decode()
hic_folder_path = Path(f"{dir}/{folder_name}")
Expand Down Expand Up @@ -44,6 +88,15 @@ def parse_hic_folder(folder_name="hic_outputs"):
if re.match(rf"[\S]+\.on\.{tag}_qc_report\.pdf", x.name)
][0]

# Get FASTP log if it is there
fastp_log = [x for x in hic_folder_path.glob("*.log")]

if fastp_log != []:
fastp_log = fastp_log[0]
fastp_log = colorize_fastp_log(fastp_log)
else:
fastp_log = None

data["HIC"].append(
{
"hap": tag,
Expand All @@ -57,6 +110,7 @@ def parse_hic_folder(folder_name="hic_outputs"):
showindex=False,
),
"hicqc_report_pdf": os.path.basename(str(hicqc_report)),
"fastp_log": fastp_log,
}
)

Expand Down
59 changes: 43 additions & 16 deletions bin/report_modules/templates/hic/hic.html
Original file line number Diff line number Diff line change
@@ -1,18 +1,45 @@
<div id="HIC" class="tabcontent" style="display: none">
<div class="section-para-wrapper">
<p class="section-para">
Hi-C contact mapping experiments measure the frequency of physical contact between loci in the genome. The
resulting dataset, called a “contact map,” is represented using a two-dimensional heatmap where the
intensity of each pixel indicates the frequency of contact between a pair of loci.
</p>
<p class="section-para"><b>Reference:</b></p>
<p class="section-para">
Robinson JT, Turner D, Durand NC, Thorvaldsdóttir H, Mesirov JP, Aiden EL. Juicebox.js Provides a
Cloud-Based Visualization System for Hi-C Data. Cell Syst. 2018 Feb 28;6(2):256-258.e1.
<a href="https://doi.org/10.1016/j.cels.2018.01.001" target="_blank">10.1016/j.cels.2018.01.001</a>. Epub
2018 Feb 7. PMID: 29428417; PMCID: PMC6047755.
</p>
<p class="section-para"><b>Version: {{ all_stats_dicts['VERSIONS']['JUICEBOX_JS'] }}</b></p>
</div>
{% include 'hic/dropdown.html' %} {% include 'hic/report_contents.html' %}
<div class="section-para-wrapper">
<p class="section-para">
Hi-C contact mapping experiments measure the frequency of physical contact between loci in the genome. The
resulting dataset, called a “contact map,” is represented using a two-dimensional heatmap where the
intensity of each pixel indicates the frequency of contact between a pair of loci.
</p>
<p class="section-para"><b>References:</b></p>

<p class="section-para">
<b>fastp</b> Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu, fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics,
Volume 34, Issue 17, September 2018, Pages i884–i890, <a href="https://doi.org/10.1093/bioinformatics/bty560"
target="_blank">10.1093/bioinformatics/bty560</a>
</p>

<p class="section-para">
<b>BWA</b> Li, H. (2013). Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv preprint <a
href="https://arxiv.org/abs/1303.3997" target="_blank">arXiv: 1303.3997</a>.
</p>

<p class="section-para">
<b>SAMBLASTER</b> Gregory G. Faust, Ira M. Hall, SAMBLASTER: fast duplicate marking and structural variant read extraction,
Bioinformatics, Volume 30, Issue 17, September 2014, Pages 2503–2505, <a
href="https://doi.org/10.1093/bioinformatics/btu314" target="_blank">10.1093/bioinformatics/btu314</a>
</p>

<p class="section-para">
<b>SAMtools</b> Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham,
Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li, Twelve years of SAMtools and BCFtools, GigaScience,
Volume
10, Issue 2, February 2021, giab008, <a href="https://doi.org/10.1093/gigascience/giab008"
target="_blank">10.1093/gigascience/giab008</a>
</p>

<p class="section-para">
<b>Juicebox.js</b> Robinson JT, Turner D, Durand NC, Thorvaldsdóttir H, Mesirov JP, Aiden EL. Juicebox.js Provides a
Cloud-Based Visualization System for Hi-C Data. Cell Syst. 2018 Feb 28;6(2):256-258.e1.
<a href="https://doi.org/10.1016/j.cels.2018.01.001" target="_blank">10.1016/j.cels.2018.01.001</a>. Epub
2018 Feb 7. PMID: 29428417; PMCID: PMC6047755.
</p>

<p class="section-para"><b>Version: {{ all_stats_dicts['VERSIONS']['JUICEBOX_JS'] }}</b></p>
</div>
{% include 'hic/dropdown.html' %} {% include 'hic/report_contents.html' %}
</div>
10 changes: 8 additions & 2 deletions bin/report_modules/templates/hic/report_contents.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
<div class="iframe-wrapper-hic">
<iframe src="./hic/{{ all_stats_dicts['HIC'][item]['hic_html_file_name'] }}" width="700px" height="850px"></iframe>
</div>
</div>
<div class="results-section">
<div class="section-para-wrapper">
<p class="section-para"><b>Sequence labels and lengths</b></p>
</div>
Expand All @@ -22,6 +20,14 @@
<div class="iframe-wrapper">
<iframe src="./hic/hicqc/{{ all_stats_dicts['HIC'][item]['hicqc_report_pdf'] }}" width="100%" height="100%"></iframe>
</div>
{% if all_stats_dicts['HIC'][item]['fastp_log'] is not none %}
<div class="section-para-wrapper">
<p class="section-para"><b>fastp log</b></p>
</div>
<div class="section-para-wrapper">
{{ all_stats_dicts['HIC'][item]['fastp_log'] }}
</div>
{% endif %}
</div>
</div>
{% if vars.update({'is_first': False}) %} {% endif %} {% endfor %}
Binary file added docs/images/fastp.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 3 additions & 2 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,9 @@ Kraken2 [assigns taxonomic labels](https://ccb.jhu.edu/software/kraken2/) to seq
Hi-C contact mapping experiments measure the frequency of physical contact between loci in the genome. The resulting dataset, called a “contact map,” is represented using a [two-dimensional heatmap](https://github.com/igvteam/juicebox.js) where the intensity of each pixel indicates the frequency of contact between a pair of loci.

<div align="center">
<img src="images/hicqc.png" alt="AssemblyQC - HiC QC report" width="44.59%">
<img src="images/hic_map.png" alt="AssemblyQC - HiC interactive contact map" width="40%">
<img src="images/fastp.png" alt="AssemblyQC - fastp log for HiC reads" width="31%">
<img src="images/hicqc.png" alt="AssemblyQC - HiC QC report" width="46.6%">
<img src="images/hic_map.png" alt="AssemblyQC - HiC interactive contact map" width="50%">
<hr>
<em>AssemblyQC - HiC results</em>
</div>
Expand Down
6 changes: 4 additions & 2 deletions subworkflows/local/fq2hic.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ workflow FQ2HIC {
1 // min_trimmed_reads
)

ch_fastp_log = FASTQ_FASTQC_UMITOOLS_FASTP.out.trim_log
ch_trim_reads = FASTQ_FASTQC_UMITOOLS_FASTP.out.reads
ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions)

Expand Down Expand Up @@ -64,7 +65,7 @@ workflow FQ2HIC {

HICQC ( ch_bam_and_ref.map { meta3, bam, fa -> [ meta3, bam ] } )

ch_hicqc_pdf = HICQC.out.pdf
ch_hicqc_pdf = HICQC.out.pdf
ch_versions = ch_versions.mix(HICQC.out.versions)

// MODULE: MAKEAGPFROMFASTA | AGP2ASSEMBLY | ASSEMBLY2BEDPE
Expand Down Expand Up @@ -96,7 +97,8 @@ workflow FQ2HIC {
ch_versions = ch_versions.mix(HIC2HTML.out.versions.first())

emit:
hicqc_pdf = ch_hicqc_pdf
fastp_log = ch_fastp_log
hicqc_pdf = ch_hicqc_pdf
hic = ch_hic
html = HIC2HTML.out.html
assembly = AGP2ASSEMBLY.out.assembly
Expand Down
4 changes: 4 additions & 0 deletions workflows/assemblyqc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ workflow ASSEMBLYQC {
params.hic_skip_fastqc
)

ch_hic_fastp_log = FQ2HIC.out.fastp_log
ch_hicqc_pdf = FQ2HIC.out.hicqc_pdf
ch_hic_html = FQ2HIC.out.html
ch_hic_assembly = FQ2HIC.out.assembly
Expand All @@ -600,6 +601,9 @@ workflow ASSEMBLYQC {
| mix(
ch_hicqc_pdf.map { meta, pdf -> pdf }
)
| mix(
ch_hic_fastp_log.map { meta, log -> log }
)
ch_versions = ch_versions.mix(FQ2HIC.out.versions)

// SUBWORKFLOW: FASTA_SYNTENY
Expand Down

0 comments on commit d4624de

Please sign in to comment.