From 9448824bdeb873f05cef39662fd2fe809fec486c Mon Sep 17 00:00:00 2001 From: Retype GitHub Action Date: Thu, 11 Jul 2024 14:03:15 +0000 Subject: [PATCH] Refreshes Retype-generated documentation. Process triggered by pdimens. --- 404.html | 8 ++--- blog/filteringsnps/index.html | 10 +++--- blog/index.html | 8 ++--- blog/softwareenvironments/index.html | 10 +++--- categories/guides/index.html | 8 ++--- categories/index.html | 8 ++--- commonoptions/index.html | 10 +++--- development/index.html | 10 +++--- haplotagdata/index.html | 31 ++++++++++++++---- index.html | 27 +++------------ install/index.html | 10 +++--- issues/index.html | 10 +++--- modules/align/bwa/index.html | 10 +++--- modules/align/ema/index.html | 10 +++--- modules/align/index.html | 8 ++--- modules/align/strobe/index.html | 10 +++--- modules/deconvolve/index.html | 10 +++--- modules/demultiplex/index.html | 10 +++--- modules/impute/index.html | 10 +++--- modules/other/index.html | 10 +++--- modules/phase/index.html | 10 +++--- modules/preflight/index.html | 10 +++--- modules/qc/index.html | 10 +++--- modules/simulate/index.html | 8 ++--- .../simulate/simulate-linkedreads/index.html | 10 +++--- modules/simulate/simulate-variants/index.html | 10 +++--- modules/snp/index.html | 10 +++--- modules/sv/index.html | 8 ++--- modules/sv/leviathan/index.html | 10 +++--- modules/sv/naibr/index.html | 10 +++--- resources/js/config.js | 2 +- resources/js/search.json | 2 +- sitemap.xml.gz | Bin 576 -> 577 bytes snakemake/index.html | 10 +++--- software/index.html | 8 ++--- 35 files changed, 173 insertions(+), 173 deletions(-) diff --git a/404.html b/404.html index 9207e76d6..b8089cd07 100644 --- a/404.html +++ b/404.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
diff --git a/blog/filteringsnps/index.html b/blog/filteringsnps/index.html index 9f4275cb6..6f33ccce8 100644 --- a/blog/filteringsnps/index.html +++ b/blog/filteringsnps/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + +
diff --git a/blog/index.html b/blog/index.html index e0526a36d..9091b89e1 100644 --- a/blog/index.html +++ b/blog/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/blog/softwareenvironments/index.html b/blog/softwareenvironments/index.html index 874a11110..653ac78c4 100644 --- a/blog/softwareenvironments/index.html +++ b/blog/softwareenvironments/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + +
diff --git a/categories/guides/index.html b/categories/guides/index.html index bf57ac0d3..be80e8411 100644 --- a/categories/guides/index.html +++ b/categories/guides/index.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
diff --git a/categories/index.html b/categories/index.html index f98091388..825e8c15a 100644 --- a/categories/index.html +++ b/categories/index.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
diff --git a/commonoptions/index.html b/commonoptions/index.html index 8a3ae1e11..9e42d3f49 100644 --- a/commonoptions/index.html +++ b/commonoptions/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/development/index.html b/development/index.html index 6abf9e89f..2530420e7 100644 --- a/development/index.html +++ b/development/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + +
diff --git a/haplotagdata/index.html b/haplotagdata/index.html index 118afd787..11d6e91ac 100644 --- a/haplotagdata/index.html +++ b/haplotagdata/index.html @@ -4,7 +4,7 @@ - + @@ -12,7 +12,7 @@ Haplotag data | Harpy haplotag - + @@ -21,24 +21,24 @@ - + - + - + - + - + @@ -273,6 +273,23 @@

Haplotag data

# What is haplotagging?

Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long-range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by Meier et al. to:

1. sequence a large number of samples
2. achieve high molecular resolution
3. do both within a reasonable budget

If you don't have haplotagged data, then Harpy will likely be of little to no use to you. See the haplotagging site for more information about haplotagging and why you might consider it for your study system.
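Concretely, haplotag barcodes follow the AxxCxxBxxDxx format and travel in the BX:Z: tag of the read header, which is where Harpy expects to find them. Below is a minimal sketch of such a FASTQ header, reusing the A01C33B41D93 barcode shown elsewhere in these docs; the instrument name, run coordinates, and index sequence are invented placeholders:

    @INSTRUMENT1:42:FLOWCELL1:1:1101:1000:2000 1:N:0:ACGTACGT BX:Z:A01C33B41D93

Everything before the BX:Z: tag is an ordinary Illumina read identifier; the BX:Z:A01C33B41D93 comment is the part Harpy reads and carries through its workflows.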

# diff --git a/index.html b/index.html index d7ecb48a0..15494ba34 100644 --- a/index.html +++ b/index.html @@ -4,7 +4,7 @@ - + @@ -34,11 +34,11 @@ - + - + - + @@ -277,29 +277,12 @@


Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the workflow decision-making, but as a user, you just interact with it like a normal command-line program. Harpy uses both well-known and niche programs to take raw haplotagging sequences and process them into called SNP genotypes (or haplotypes) or large structural variants (inversions, deletions, duplications). Most settings are pre-configured, and those you can modify are set at the command line.
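As a sketch of that interaction (the --help behavior and the align bwa -o align-bwa fragment are taken from elsewhere in these docs; the trailing arguments are left elided rather than guessed):

    harpy --help                      # print the docstring to your terminal
    harpy align bwa --help            # usage for a single module
    harpy align bwa -o align-bwa ...  # a previously-run example cited in these docs

You pick a module, set the handful of options it exposes, and Snakemake handles the file and job dependencies under the hood.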

# What is haplotagging?

Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long-range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by Meier et al. to:

1. sequence a large number of samples
2. achieve high molecular resolution
3. do both within a reasonable budget

If you don't have haplotagged data, then Harpy will likely be of little to no use to you. See the haplotagging site for more information about haplotagging and why you might consider it for your study system.

# diff --git a/install/index.html b/install/index.html index 639fc499b..691cc9789 100644 --- a/install/index.html +++ b/install/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/issues/index.html b/issues/index.html index aa4bd32e3..54a194c5b 100644 --- a/issues/index.html +++ b/issues/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/modules/align/bwa/index.html b/modules/align/bwa/index.html index b9ad013f2..78cda26d0 100644 --- a/modules/align/bwa/index.html +++ b/modules/align/bwa/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/align/ema/index.html b/modules/align/ema/index.html index ba0f00b73..70c5333bf 100644 --- a/modules/align/ema/index.html +++ b/modules/align/ema/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/align/index.html b/modules/align/index.html index a149f9a40..f1dd89f81 100644 --- a/modules/align/index.html +++ b/modules/align/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/modules/align/strobe/index.html b/modules/align/strobe/index.html index d3835f0a6..db396f67d 100644 --- a/modules/align/strobe/index.html +++ b/modules/align/strobe/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/deconvolve/index.html b/modules/deconvolve/index.html index e66ad0de4..1e76c6672 100644 --- a/modules/deconvolve/index.html +++ b/modules/deconvolve/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + + diff --git a/modules/demultiplex/index.html b/modules/demultiplex/index.html index 277d62328..aec5a8c22 100644 --- a/modules/demultiplex/index.html +++ b/modules/demultiplex/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/impute/index.html b/modules/impute/index.html index 1168d9a9c..3b34cbd9a 100644 --- a/modules/impute/index.html +++ b/modules/impute/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/other/index.html b/modules/other/index.html index 4bbdd7d18..86cfe4c7e 100644 --- a/modules/other/index.html +++ b/modules/other/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/modules/phase/index.html b/modules/phase/index.html index ca1dbe234..4ece70662 100644 --- a/modules/phase/index.html +++ b/modules/phase/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/preflight/index.html b/modules/preflight/index.html index 8c403d1df..bf0533d18 100644 --- a/modules/preflight/index.html +++ b/modules/preflight/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + +
diff --git a/modules/qc/index.html b/modules/qc/index.html index f221af81e..971c1ad9e 100644 --- a/modules/qc/index.html +++ b/modules/qc/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/simulate/index.html b/modules/simulate/index.html index f3fc37fcf..2757e80cc 100644 --- a/modules/simulate/index.html +++ b/modules/simulate/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/modules/simulate/simulate-linkedreads/index.html b/modules/simulate/simulate-linkedreads/index.html index 12ed5b150..7482f2504 100644 --- a/modules/simulate/simulate-linkedreads/index.html +++ b/modules/simulate/simulate-linkedreads/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/simulate/simulate-variants/index.html b/modules/simulate/simulate-variants/index.html index d52993c19..d2f225357 100644 --- a/modules/simulate/simulate-variants/index.html +++ b/modules/simulate/simulate-variants/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + + diff --git a/modules/snp/index.html b/modules/snp/index.html index 957035bfc..9c254e485 100644 --- a/modules/snp/index.html +++ b/modules/snp/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/sv/index.html b/modules/sv/index.html index ec6e254de..23844a403 100644 --- a/modules/sv/index.html +++ b/modules/sv/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/modules/sv/leviathan/index.html b/modules/sv/leviathan/index.html index 1de83e238..986165eea 100644 --- a/modules/sv/leviathan/index.html +++ b/modules/sv/leviathan/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/modules/sv/naibr/index.html b/modules/sv/naibr/index.html index 9b8600605..7c033dfbd 100644 --- a/modules/sv/naibr/index.html +++ b/modules/sv/naibr/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/resources/js/config.js b/resources/js/config.js index 31b2b6b5f..b741220db 100644 --- a/resources/js/config.js +++ b/resources/js/config.js @@ -1 +1 @@ -var __DOCS_CONFIG__ = {"id":"WFnv5tAvV+qU2qxTVZoO7KmTB230vOGyWS5","key":"5IlfN2BN3KYbH+NU6iGH78HIAosTuTSqHNih9mMb+/g.rgRv0B60+78hlQCsFGxj/xdSjxpVdVOGWDZNkC7zTFOM3hamXckax+n5a7Fm+KUvbBaTdlzleCEz09n+ZLc3/A.89","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.5.0.774020233777","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"modules","l":"Modules","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"issues","l":"Common Issues","s":""},{"n":"snakemake","l":"Snakemake things","s":""},{"n":"software","l":"Software","s":""},{"n":"development","l":"Development","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"softwareenvironments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filteringsnps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default 
value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}}; +var __DOCS_CONFIG__ = {"id":"CVx230Vf2/ARyeLObJ/v92FMrtlhFSGGtae","key":"2+uWk848hrBDMQbUnAjVF1swOR4BRxur8NbwMIkTC0A.g1XEqr2kdNyJ87sw267PbAUjULizld6C8s3a63TUs45Cn9TKZ//55MYfivAoIY9cJOJgijTm+/wWX9DZy4F6BA.88","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.5.0.774021772995","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"modules","l":"Modules","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"issues","l":"Common Issues","s":""},{"n":"snakemake","l":"Snakemake things","s":""},{"n":"software","l":"Software","s":""},{"n":"development","l":"Development","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"softwareenvironments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filteringsnps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just 
now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}}; diff --git a/resources/js/search.json b/resources/js/search.json index 45e457a43..ba1c1cf73 100644 --- a/resources/js/search.json +++ b/resources/js/search.json @@ -1 +1 @@ -[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Linked-Read Workflow","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install HARPY","p":["Harpy is now hosted on Bioconda! That means to install it, you just need to have mamba(or conda) on your Linux-based system and install it with a simple command. You can install Harpy into an existing environment or create a new one for it (recommended)."]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to 1"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. 
Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to 1"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to 1"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Resolve clashing barcodes from different molecules"]},{"l":"Resolve clashing barcodes from different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). 
Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. In the event you want to continue a failed or manually terminated workflow without overwriting the workflow"]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"stitchparams","p":["Create a template parameter file for the module. The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. 
The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance dilineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expect the incoming data to follow particular formats (plural, unfortunately). These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analyses is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genomic. 
If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additonal SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as 16-basepair nucleotide sequences, one per line. If not provided, Harpy will download the standard 10X Genomics 4M-with-alts-february-2016.txt"]},{"l":"10X to Haplotag conversion","p":["Harpy will convert the simulated 10X-style reads, where the 16-basepair barcode is at the beginning of read 1, to haplotag format, where the barcode is coded in the sequence header under the"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Conveniently, we provide a calculator to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are serveral differences between the submodule command line options, each has available all the like other Harpy modules. Each requires and input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the submodules to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each submodule has a --heterozygosity parameter where you can specify the heterozygosity of an intended diploid genome, should you use the resulting VCF(s) to simulate variants again."]},{"l":"Simulate Diploid Assembly","p":["Here is a simple but realistic workflow of creating a diploid assembly with simulated variants. Due to the roundabout complexity of the process, attempts were made to use color to help keep track of the"]},{"l":"Step 1","p":["Simulate random variants onto your haploid assembly with --heterozygosity(-z) set above 0. 
We aren't interested in the resulting genome, but rather the positions of the variants"]},{"l":"Step 2","p":["Use the resulting hap1 and hap2 VCF files to simulate those same variants, but shuffled into homozygotes and heterozygotes, onto the original haploid genome, creating two haplotype"]},{"l":"Step 3","p":["Use the one of the new genome haplotypes for simulating other kinds of variants. Again, use --heterozygosity(-z) with a value greater than 0. Like Step 1, we're only interested in the haplotype VCF files (positions of variants) and not the resulting"]},{"l":"Step 4","p":["Use the resulting haplotype VCFs to simulate known variants onto the haplotype genomes from Step 2."]},{"l":"Step 5","p":["Repeat Step 3 and Step 4 to your heart's content."]}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers computes certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. 
Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. The --vcf option is optional and not used by NAIBR. However, to use with"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must be phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["Like mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accomodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. 
As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Common Issues","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to highlight common issues you may experience during analysis and ways to address these issues."]},{"l":"Problem installing with conda","p":["Conda is an awesome package manager, but it's slow and uses a ton of memory as dependencies increase. Harpy has a lot of dependencies and you might stall out conda trying to install it. Use mamba instead-- it'll work where conda fails."]},{"l":"Failures during imputation or phasing","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. We don't know why, but they do."]},{"i":"alignment-file-name-and-id-tag-mismatch","l":"Alignment file name and ID: tag mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Adding Snakamake parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but every module of Harpy (except"]},{"l":"Common use cases","p":["You likely wont need to invoke --snakemake very often, if ever. However, here examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Developing Harpy","p":["Harpy is an open source program written using a combination of BASH, R, RMarkdown, Python, and Snakemake. This page provides information on Harpy's development and how to contribute to it, if you were inclined to do so."]},{"l":"Installing Harpy for development","p":["The process follows cloning the harpy repository, installing the preconfigured conda environment, and running the resources/buildlocal.sh script to move all the necessary files to the"]},{"i":"harpys-components","l":"Harpy's components"},{"l":"source code","p":["Harpy runs in two stages:"]},{"l":"Bioconda recipe","p":["For the ease of installation for end-users, Harpy has a recipe and build script in Bioconda, which makes it available for download and installation. 
A copy of the recipe is also"]},{"l":"The Harpy repository"},{"l":"structure","p":["Harpy exists as a Git repository and has 5 standard branches that are used in specific ways during development. Git is a popular version control system and discussing its use is out of the scope of this documentation, however there is no"]},{"l":"development workflow","p":["The dev workflow is reasonably standard:"]},{"l":"containerization","p":["As of Harpy v1.0, the software dependencies that the Snakemake workflows use are pre-configured as a Docker image that is uploaded to Dockerhub. Updating or editing this container can be done automatically or manually."]},{"l":"automatically","p":["The testing GitHub Action will automatically create a Dockerfile with (a hidden harpy command) and build a new Docker container, then upload it to dockerhub with the latest tag. This process is triggered on"]},{"l":"manually","p":["The dockerfile for that container is created by using a hidden harpy command"]},{"l":"Automations"},{"l":"Testing","p":["CI ( C ontinuous I ntegration) is a term describing automated actions that do things to/with your code and are triggered by how you interact with a repository. Harpy has a series of GitHub Actions triggered by interactions with the"]},{"l":"Releases","p":["There is an automation that gets triggered every time Harpy is tagged with the new version. It strips out the unnecessary files and will upload a cleaned tarball to the new release (reducing filesize by orders of magnitude). The automation will also"]}],[{"l":"Blog"}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["An conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda enviroments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. 
That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. Conversely, a maximum cut off is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold with which to remove sites below that threshold. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]] \ No newline at end of file +[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Linked-Read Workflow","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install HARPY","p":["Harpy is now hosted on Bioconda! That means to install it, you just need to have mamba(or conda) on your Linux-based system and install it with a simple command. 
You can install Harpy into an existing environment or create a new one for it (recommended)."]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to 1"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to 1"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. 
See"]},{"l":"Quality filtering","p":["The --quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to 1"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Resolve clashing barcodes from different molecules"]},{"l":"Resolve clashing barcodes from different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. 
In the event you want to continue a failed or manually terminated workflow without overwriting the workflow"]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level or on samples pooled as populations."]},{"l":"stitchparams","p":["Create a template parameter file for the impute module. The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures the relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the common options, the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run phasing on all the samples present in the INPUTS, but other times you may want to only phase the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance delineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were while still considering them as having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software, and each of these programs expects the incoming data to follow particular formats (plural, unfortunately). These formatting opinions/specifics are at the mercy of the original developers, and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the common options, these modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analysis is to remove these adapters and trim poor-quality data. 
You can remove adapters,"]},{"l":"Running Options","p":["In addition to the common options, the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via simuG for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome. Harpy incorporates LRSIM to generate linked reads from a diploid genome. If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the common options, the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes (HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a "single individual" (i.e. use this module once), then you might want no additional SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend to simulate "multiple individuals" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some "natural" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as 16-basepair nucleotide sequences, one per line. If not provided, Harpy will download the standard 10X Genomics 4M-with-alts-february-2016.txt"]},{"l":"10X to Haplotag conversion","p":["Harpy will convert the simulated 10X-style reads, where the 16-basepair barcode is at the beginning of read 1, to haplotag format, where the barcode is coded in the sequence header under the"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs, --partitions, and --molecules-per. Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Conveniently, we provide a calculator to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are several differences between the submodule command line options, each has all the common options available like other Harpy modules. 
Each requires an input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the submodules to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each submodule has a --heterozygosity parameter where you can specify the heterozygosity of an intended diploid genome, should you use the resulting VCF(s) to simulate variants again."]},{"l":"Simulate Diploid Assembly","p":["Here is a simple but realistic workflow of creating a diploid assembly with simulated variants. Due to the roundabout complexity of the process, attempts were made to use color to help keep track of the"]},{"l":"Step 1","p":["Simulate random variants onto your haploid assembly with --heterozygosity (-z) set above 0. We aren't interested in the resulting genome, but rather the positions of the variants"]},{"l":"Step 2","p":["Use the resulting hap1 and hap2 VCF files to simulate those same variants, but shuffled into homozygotes and heterozygotes, onto the original haploid genome, creating two haplotype"]},{"l":"Step 3","p":["Use one of the new genome haplotypes for simulating other kinds of variants. Again, use --heterozygosity (-z) with a value greater than 0. Like Step 1, we're only interested in the haplotype VCF files (positions of variants) and not the resulting"]},{"l":"Step 4","p":["Use the resulting haplotype VCFs to simulate known variants onto the haplotype genomes from Step 2."]},{"l":"Step 5","p":["Repeat Step 3 and Step 4 to your heart's content."]}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, you can use those alignment files (.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the common options, the module is configured using these command-line arguments:"]},{"l":"regions","p":["The --regions (-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers compute certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The snp module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype-phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. 
The EMA aligner does not report split-read alignments; instead, it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the common options, the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the common options, the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still be considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants with fewer false"]},{"l":"optional vcf file","p":["To get the best variant calling performance out of NAIBR, provide phased bam files as input. The --vcf option is optional and not used by NAIBR. However, to use with"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and, most importantly, it must contain phased haplotypes. There are various ways to haplotype SNPs, but you can use the phase module to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long-range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx (or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. 
is good, but the authors (and others) have been working to refine this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["As mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the qc module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accommodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked-read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assuming that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. This folder is both necessary for the module to run and very useful for understanding what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder exists to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Common Issues","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to highlight common issues you may experience during analysis and ways to address these issues."]},{"l":"Problem installing with conda","p":["Conda is an awesome package manager, but it's slow and uses a ton of memory as dependencies increase. Harpy has a lot of dependencies and you might stall out conda trying to install it. Use mamba instead-- it'll work where conda fails."]},{"l":"Failures during imputation or phasing","p":["If you use bamutils clipOverlap on alignments that are used for the impute or phase modules, they will cause both programs to error. We don't know why, but they do."]},{"i":"alignment-file-name-and-id-tag-mismatch","l":"Alignment file name and ID: tag mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Adding Snakemake parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. 
Most of these details have been abstracted away from the end-user, but every module of Harpy (except"]},{"l":"Common use cases","p":["You likely won't need to invoke --snakemake very often, if ever. However, here are examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Developing Harpy","p":["Harpy is an open-source program written using a combination of BASH, R, RMarkdown, Python, and Snakemake. This page provides information on Harpy's development and how to contribute to it, if you are inclined to do so."]},{"l":"Installing Harpy for development","p":["The process involves cloning the harpy repository, installing the preconfigured conda environment, and running the resources/buildlocal.sh script to move all the necessary files to the"]},{"i":"harpys-components","l":"Harpy's components"},{"l":"source code","p":["Harpy runs in two stages:"]},{"l":"Bioconda recipe","p":["For ease of installation for end-users, Harpy has a recipe and build script in Bioconda, which makes it available for download and installation. A copy of the recipe is also"]},{"l":"The Harpy repository"},{"l":"structure","p":["Harpy exists as a Git repository and has 5 standard branches that are used in specific ways during development. Git is a popular version control system and discussing its use is out of the scope of this documentation; however, there is no"]},{"l":"development workflow","p":["The dev workflow is reasonably standard:"]},{"l":"containerization","p":["As of Harpy v1.0, the software dependencies that the Snakemake workflows use are pre-configured as a Docker image that is uploaded to Dockerhub. Updating or editing this container can be done automatically or manually."]},{"l":"automatically","p":["The testing GitHub Action will automatically create a Dockerfile with (a hidden harpy command) and build a new Docker container, then upload it to Dockerhub with the latest tag. This process is triggered on"]},{"l":"manually","p":["The dockerfile for that container is created by using a hidden harpy command"]},{"l":"Automations"},{"l":"Testing","p":["CI (Continuous Integration) is a term describing automated actions that do things to/with your code and are triggered by how you interact with a repository. Harpy has a series of GitHub Actions triggered by interactions with the"]},{"l":"Releases","p":["There is an automation that gets triggered every time Harpy is tagged with a new version. It strips out the unnecessary files and will upload a cleaned tarball to the new release (reducing filesize by orders of magnitude). The automation will also"]}],[{"l":"Blog"}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy: using a container with the necessary software environments in it (the default), or with local conda environments (with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["A conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. 
These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda environments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done, is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher-quality genotype calls to remove false positives. The HTSlib guide suggests at least 50 (e.g. -i 'QUAL>=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. Conversely, a maximum cutoff is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold and remove sites below it. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. 
The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]] \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 4c286fb195bb25e14968079ba41a3b3689cd7a76..0cb9e9d96eacadc062fa1fe6b55f0b1ad990e4fb 100644 GIT binary patch literal 577 zcmV-H0>1qpiwFP!000001I(DslA16JO(k``i@v?S%+}F!G1+!b zc|p?3VHjdBNwnPh*tnYcvIz&CT~4wj&ZCbpn^Qxrx*Itx&STO%&)NBE zR7|GR@k!%0>?uBNHD2Aait(ga|}|rd8dSDT*+ELBG_ArpjzfwD%skO z1cdAfOJ*DHF<_uYe+4tt2nPWJudqtAog!(KI6?4IES}f)k+9B1=pYpd7YcZQgE&80 zv`Uml_eS-YRQMlWyB{F6DFZp1^u802xkM4&9LAwqiwFP!000001I(DsZlf?1h4*=ihouFM zM#XGCAD=XC!=5r_F~<8az*e#q+Q+q|rrXETu!cMP&bDlfI>&QQt*MG-vdkCBYLTX| zX|~B$n`HSt#GS8x#9Akflk{GMH{P=A(FnlK1YacCD^q_CC(Zwi6BK*E@RX))jW(9T z#QJ~NDT!A(2CU&$8RB)ti6gikL0nRM;z~h~mKdaP^G*rRxRSNLN3gdNLG`f4QpwhK zBp_suSTfsij{yU@#0AVyBOC+_yuvEcc8a7`;sn73t_4bBQ9lI+_!Qpjbl?xF-yp*Zde7&?97hp`fmo zA%mlFOnu5%Ai>Om2JSFSpqe>(*#%7KUOJ;gAYsw~T-bdNp#rtCw_X-90Rop{;40Az zW$ZYW_k?w;nI~uVm~;R)L}8_AF87dTEd$ritF>Kr7Cm>8gK{U~O46953No)={R diff --git a/snakemake/index.html b/snakemake/index.html index 6d09a9b2a..cd64e7751 100644 --- a/snakemake/index.html +++ b/snakemake/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/software/index.html b/software/index.html index 8c118336d..2e011451a 100644 --- a/software/index.html +++ b/software/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +