diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..87b4bc6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.log +/log \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3258aef --- /dev/null +++ b/LICENSE @@ -0,0 +1,31 @@ +BSD 3-Clause License + +Copyright (c) 2024, Ivan William Harsono + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the disclaimer +below) provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
diff --git a/Prerequisite.md b/Prerequisite.md new file mode 100644 index 0000000..33f5721 --- /dev/null +++ b/Prerequisite.md @@ -0,0 +1,41 @@ +## Download latest Docker +Source : https://docs.docker.com/engine/install/ubuntu/ +### Add Docker's official GPG key: +```bash +sudo apt-get update +sudo apt-get install ca-certificates curl +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc +``` + +### Add the repository to Apt sources: +```bash +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update +``` + +### Install latest version +```bash +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +``` + +### Dry run using Hello World +```bash +sudo docker run hello-world +``` + + +## Download Anaconda 2023.09 (recommended for a cleaner Python environment; some subtools require Python support) +The latest version as of 14-02-2024 is 2023.09 + +### Download and follow instructions from bash script +Note: choose yes for the option that adds Anaconda to PATH +```bash +wget https://repo.anaconda.com/archive/Anaconda3-2023.09-0-Linux-x86_64.sh +chmod a+x ./Anaconda3-2023.09-0-Linux-x86_64.sh +bash Anaconda3-2023.09-0-Linux-x86_64.sh +``` diff --git a/README.md b/README.md new file mode 100644 index 0000000..20a44ad --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# IDeRare + +IDeRare or *"Indonesia Exome Rare Disease Variant Discovery Pipeline"* is a simple, ready-to-use variant discovery pipeline for discovering rare disease variants from exome sequencing data. + +**Note:** the IDeRare paper is currently under consideration for journal submission.
+ +## Description +- This pipeline is designed to be used in a Linux environment +- The original paper may have used different versions of the tools; the prerequisites used in this pipeline are the latest versions of the tools +- This pipeline was designed and tested with Indonesian rare disease trio patients, but it should also be usable for general rare disease variant discovery from exome sequencing data, given paired-end .fq.gz files and HPO terms +- Ensure you have at least 250GB free for database and application setup, and 100GB free for each trio family exome set +- The paths in ```iderare.yml``` assume that all folders are stored in the ```Downloads``` folder, with the subfolders ```Database``` (for RefSeq, dbNSFP, dbSNP, ClinVar), ```Sandbox``` (for applications and their databases) and ```IDeRare``` (git cloned folder) + +## Quick Install +1. Clone this repository +```bash +git clone https://github.com/ivanwilliammd/IDeRare +``` +2. Have a Linux environment (Ubuntu 22.04 LTS or an Ubuntu-like distro is recommended) +3. Install [Docker](https://docs.docker.com/engine/install/ubuntu/) and [Anaconda - optional](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html) - see [Prerequisite.md](Prerequisite.md) for more details +4. Run the dependency installation script and the database script +```bash +source install_dependencies.sh +source database.sh +``` +5. Set the data directory, reference files and trio information in ```iderare.yml```.

+**Note**: all exome files should be located in the ```input/A_FASTQ``` folder under the absolute path set by ```data_dir``` in ```iderare.yml``` +

+![File Structure](picture/01.png) +![Example File](picture/02.png) +1. Run the bash script +```bash +source iderare.sh +``` \ No newline at end of file diff --git a/convert/GCF_000001405.40_GRCh38.p14_assembly_report_revised_snpsift.chrnames b/convert/GCF_000001405.40_GRCh38.p14_assembly_report_revised_snpsift.chrnames new file mode 100644 index 0000000..bb0e6bc --- /dev/null +++ b/convert/GCF_000001405.40_GRCh38.p14_assembly_report_revised_snpsift.chrnames @@ -0,0 +1,709 @@ +NC_000001.11 1 +NC_000002.12 2 +NC_000003.12 3 +NC_000004.12 4 +NC_000005.10 5 +NC_000006.12 6 +NC_000007.14 7 +NC_000008.11 8 +NC_000009.12 9 +NC_000010.11 10 +NC_000011.10 11 +NC_000012.12 12 +NC_000013.11 13 +NC_000014.9 14 +NC_000015.10 15 +NC_000016.10 16 +NC_000017.11 17 +NC_000018.10 18 +NC_000019.10 19 +NC_000020.11 20 +NC_000021.9 21 +NC_000022.11 22 +NC_000023.11 X +NC_000024.10 Y +NT_187361.1 HSCHR1_CTG1_UNLOCALIZED +NT_187362.1 HSCHR1_CTG2_UNLOCALIZED +NT_187363.1 HSCHR1_CTG3_UNLOCALIZED +NT_187364.1 HSCHR1_CTG4_UNLOCALIZED +NT_187365.1 HSCHR1_CTG5_UNLOCALIZED +NT_187366.1 HSCHR1_CTG6_UNLOCALIZED +NT_187367.1 HSCHR1_CTG7_UNLOCALIZED +NT_187368.1 HSCHR1_CTG8_UNLOCALIZED +NT_187369.1 HSCHR1_CTG9_UNLOCALIZED +NT_187370.1 HSCHR2_RANDOM_CTG1 +NT_187371.1 HSCHR2_RANDOM_CTG2 +NT_167215.1 HSCHR3UN_CTG2 +NT_113793.3 HSCHR4_RANDOM_CTG4 +NT_113948.1 HSCHR5_RANDOM_CTG1 +NT_187372.1 HSCHR9_UNLOCALIZED_CTG1 +NT_187373.1 HSCHR9_UNLOCALIZED_CTG2 +NT_187374.1 HSCHR9_UNLOCALIZED_CTG3 +NT_187375.1 HSCHR9_UNLOCALIZED_CTG4 +NT_113796.3 HSCHR14_CTG1_UNLOCALIZED +NT_113888.1 HSCHR14_CTG4_UNLOCALIZED +NT_167219.1 HSCHR14_CTG2_UNLOCALIZED +NT_187377.1 HSCHR14_CTG3_UNLOCALIZED +NT_187378.1 HSCHR14_CTG5_UNLOCALIZED +NT_187379.1 HSCHR14_CTG6_UNLOCALIZED +NT_187380.1 HSCHR14_CTG7_UNLOCALIZED +NT_187381.1 HSCHR14_CTG8_UNLOCALIZED +NT_187382.1 HSCHR15_RANDOM_CTG1 +NT_187383.1 HSCHR16_RANDOM_CTG1 +NT_113930.2 HSCHR17_RANDOM_CTG3 +NT_187384.1 HSCHR17_RANDOM_CTG4 +NT_187385.1 HSCHR17_RANDOM_CTG5 +NT_187386.1 
HSCHR22_UNLOCALIZED_CTG1 +NT_187387.1 HSCHR22_UNLOCALIZED_CTG2 +NT_187388.1 HSCHR22_UNLOCALIZED_CTG3 +NT_187390.1 HSCHR22_UNLOCALIZED_CTG5 +NT_187391.1 HSCHR22_UNLOCALIZED_CTG6 +NT_187392.1 HSCHR22_UNLOCALIZED_CTG7 +NT_187393.1 HSCHR22_UNLOCALIZED_CTG8 +NT_187394.1 HSCHR22_UNLOCALIZED_CTG9 +NT_187395.1 HSCHRY_RANDOM_CTG1 +NT_113901.1 HSCHRUN_RANDOM_CTG1 +NT_167208.1 HSCHRUN_RANDOM_CTG2 +NT_167209.1 HSCHRUN_RANDOM_CTG4 +NT_167211.2 HSCHRUN_RANDOM_CTG6 +NT_113889.1 HSCHRUN_RANDOM_CTG9 +NT_167213.1 HSCHRUN_RANDOM_CTG10 +NT_167214.1 HSCHRUN_RANDOM_CTG11 +NT_167218.1 HSCHRUN_RANDOM_CTG16 +NT_167220.1 HSCHRUN_RANDOM_CTG19 +NT_187396.1 HSCHRUN_RANDOM_100 +NT_187398.1 HSCHRUN_RANDOM_102 +NT_187397.1 HSCHRUN_RANDOM_101 +NT_187399.1 HSCHRUN_RANDOM_103 +NT_187402.1 HSCHRUN_RANDOM_106 +NT_187406.1 HSCHRUN_RANDOM_110 +NT_187405.1 HSCHRUN_RANDOM_109 +NT_187404.1 HSCHRUN_RANDOM_108 +NT_187403.1 HSCHRUN_RANDOM_107 +NT_187407.1 HSCHRUN_RANDOM_111 +NT_187401.1 HSCHRUN_RANDOM_105 +NT_187400.1 HSCHRUN_RANDOM_104 +NT_187459.1 HSCHRUN_RANDOM_167 +NT_187458.1 HSCHRUN_RANDOM_166 +NT_187461.1 HSCHRUN_RANDOM_169 +NT_187460.1 HSCHRUN_RANDOM_168 +NT_187462.1 HSCHRUN_RANDOM_170 +NT_187465.1 HSCHRUN_RANDOM_173 +NT_187466.1 HSCHRUN_RANDOM_174 +NT_187463.1 HSCHRUN_RANDOM_171 +NT_187464.1 HSCHRUN_RANDOM_172 +NT_187469.1 HSCHRUN_RANDOM_177 +NT_187467.1 HSCHRUN_RANDOM_175 +NT_187468.1 HSCHRUN_RANDOM_176 +NT_187470.1 HSCHRUN_RANDOM_178 +NT_187494.1 HSCHRUN_RANDOM_202 +NT_187491.1 HSCHRUN_RANDOM_199 +NT_187492.1 HSCHRUN_RANDOM_200 +NT_187490.1 HSCHRUN_RANDOM_198 +NT_187493.1 HSCHRUN_RANDOM_201 +NT_187489.1 HSCHRUN_RANDOM_197 +NT_187471.1 HSCHRUN_RANDOM_179 +NT_187472.1 HSCHRUN_RANDOM_180 +NT_187486.1 HSCHRUN_RANDOM_194 +NT_187488.1 HSCHRUN_RANDOM_196 +NT_187482.1 HSCHRUN_RANDOM_190 +NT_187484.1 HSCHRUN_RANDOM_192 +NT_187487.1 HSCHRUN_RANDOM_195 +NT_187480.1 HSCHRUN_RANDOM_188 +NT_187475.1 HSCHRUN_RANDOM_183 +NT_187478.1 HSCHRUN_RANDOM_186 +NT_187473.1 HSCHRUN_RANDOM_181 +NT_187474.1 HSCHRUN_RANDOM_182 
+NT_187481.1 HSCHRUN_RANDOM_189 +NT_187485.1 HSCHRUN_RANDOM_193 +NT_187483.1 HSCHRUN_RANDOM_191 +NT_187479.1 HSCHRUN_RANDOM_187 +NT_187476.1 HSCHRUN_RANDOM_184 +NT_187477.1 HSCHRUN_RANDOM_185 +NT_187409.1 HSCHRUN_RANDOM_113 +NT_187408.1 HSCHRUN_RANDOM_112 +NT_187410.1 HSCHRUN_RANDOM_114 +NT_187415.1 HSCHRUN_RANDOM_119 +NT_187412.1 HSCHRUN_RANDOM_116 +NT_187411.1 HSCHRUN_RANDOM_115 +NT_187413.1 HSCHRUN_RANDOM_117 +NT_187416.1 HSCHRUN_RANDOM_120 +NT_187417.1 HSCHRUN_RANDOM_121 +NT_187414.1 HSCHRUN_RANDOM_118 +NT_187418.1 HSCHRUN_RANDOM_122 +NT_187419.1 HSCHRUN_RANDOM_123 +NT_187424.1 HSCHRUN_RANDOM_128 +NT_187425.1 HSCHRUN_RANDOM_129 +NT_187420.1 HSCHRUN_RANDOM_124 +NT_187495.1 HSCHRUN_RANDOM_203 +NT_187422.1 HSCHRUN_RANDOM_126 +NT_187421.1 HSCHRUN_RANDOM_125 +NT_187423.1 HSCHRUN_RANDOM_127 +NT_187426.1 HSCHRUN_RANDOM_130 +NT_187437.1 HSCHRUN_RANDOM_141 +NT_187430.1 HSCHRUN_RANDOM_134 +NT_187428.1 HSCHRUN_RANDOM_132 +NT_187427.1 HSCHRUN_RANDOM_131 +NT_187435.1 HSCHRUN_RANDOM_139 +NT_187432.1 HSCHRUN_RANDOM_136 +NT_187436.1 HSCHRUN_RANDOM_140 +NT_187431.1 HSCHRUN_RANDOM_135 +NT_187438.1 HSCHRUN_RANDOM_142 +NT_187429.1 HSCHRUN_RANDOM_133 +NT_187433.1 HSCHRUN_RANDOM_137 +NT_187496.1 HSCHRUN_RANDOM_204 +NT_187434.1 HSCHRUN_RANDOM_138 +NT_187440.1 HSCHRUN_RANDOM_144 +NT_187439.1 HSCHRUN_RANDOM_143 +NT_187441.1 HSCHRUN_RANDOM_145 +NT_187443.1 HSCHRUN_RANDOM_147 +NT_187442.1 HSCHRUN_RANDOM_146 +NT_187444.1 HSCHRUN_RANDOM_148 +NT_187445.1 HSCHRUN_RANDOM_149 +NT_187450.1 HSCHRUN_RANDOM_158 +NT_187448.1 HSCHRUN_RANDOM_156 +NT_187449.1 HSCHRUN_RANDOM_157 +NT_187454.1 HSCHRUN_RANDOM_162 +NT_187446.1 HSCHRUN_RANDOM_154 +NT_187453.1 HSCHRUN_RANDOM_161 +NT_187447.1 HSCHRUN_RANDOM_155 +NT_187455.1 HSCHRUN_RANDOM_163 +NT_187451.1 HSCHRUN_RANDOM_159 +NT_187452.1 HSCHRUN_RANDOM_160 +NT_187457.1 HSCHRUN_RANDOM_165 +NT_187456.1 HSCHRUN_RANDOM_164 +NT_187497.1 HSCHRUN_RANDOM_CTG17 +NT_187513.1 HSCHRUN_RANDOM_CTG42 +NT_187498.1 HSCHRUN_RANDOM_CTG20 +NT_187499.1 HSCHRUN_RANDOM_CTG21 
+NT_187500.1 HSCHRUN_RANDOM_CTG22 +NT_187501.1 HSCHRUN_RANDOM_CTG23 +NT_187502.1 HSCHRUN_RANDOM_CTG24 +NT_187503.1 HSCHRUN_RANDOM_CTG25 +NT_187504.1 HSCHRUN_RANDOM_CTG26 +NT_187505.1 HSCHRUN_RANDOM_CTG27 +NT_187506.1 HSCHRUN_RANDOM_CTG28 +NT_187508.1 HSCHRUN_RANDOM_CTG30 +NT_187509.1 HSCHRUN_RANDOM_CTG33 +NT_187510.1 HSCHRUN_RANDOM_CTG34 +NT_187511.1 HSCHRUN_RANDOM_CTG35 +NT_187512.1 HSCHRUN_RANDOM_CTG36 +NW_009646194.1 HG986_PATCH +NW_009646195.1 HG2058_PATCH +NW_009646196.1 HG2104_PATCH +NW_011332687.1 HG1832_PATCH +NW_011332688.1 HG2095_PATCH +NW_012132914.1 HG1342_HG2282_PATCH +NW_018654708.1 HG2002_PATCH +NW_019805487.1 HG460_PATCH +NW_025791756.1 HG1343_HG173_HG459_PATCH +NW_025791757.1 HG2571_PATCH +NW_025791758.1 HG2515_PATCH +NW_025791759.1 HG2577_PATCH +NW_014040925.1 HSCHR1_3_CTG3 +NW_014040926.1 HSCHR1_4_CTG3 +NW_014040927.1 HSCHR1_5_CTG32_1 +NW_015495298.1 HSCHR1_5_CTG3 +NW_017852928.1 HSCHR1_6_CTG3 +NW_018654706.1 HSCHR1_8_CTG3 +NW_018654707.1 HSCHR1_9_CTG3 +NW_025791753.1 HSCHR1_12_CTG3 +NW_025791754.1 HSCHR1_5_CTG31 +NW_025791755.1 HSCHR1_6_CTG31 +NW_011332689.1 HG2233_PATCH +NW_011332690.1 HG2232_PATCH +NW_012132915.1 HG2290_PATCH +NW_021159987.1 HG721_PATCH +NW_021159988.1 HG1384_PATCH +NW_025791764.1 HG2494_PATCH +NW_025791765.1 HG2275_PATCH +NW_025791766.1 HG2052_PATCH +NW_025791767.1 HG2231_HG2496_PATCH +NW_025791768.1 HG2140_PATCH +NW_015495299.1 HSCHR2_6_CTG7_2 +NW_018654709.1 HSCHR2_7_CTG7_2 +NW_018654710.1 HSCHR2_8_CTG7_2 +NW_025791760.1 HSCHR2_10_CTG7_2 +NW_025791761.1 HSCHR2_11_CTG7_2 +NW_025791762.1 HSCHR2_12_CTG7_2 +NW_025791763.1 HSCHR2_6_CTG1 +NW_009646197.1 HG2066_PATCH +NW_009646198.1 HG2022_PATCH +NW_011332691.1 HG126_PATCH +NW_012132916.1 HG2235_PATCH +NW_012132917.1 HG2237_PATCH +NW_017363813.1 HG2236_PATCH +NW_019805491.1 HG2133_PATCH +NW_025791769.1 HG2264_PATCH +NW_025791770.1 HG2077_PATCH +NW_025791771.1 HG2069_PATCH +NW_018654711.1 HSCHR3_4_CTG1 +NW_019805488.1 HSCHR3_7_CTG2_1 +NW_019805489.1 HSCHR3_8_CTG2_1 +NW_019805490.1 
HSCHR3_9_CTG2_1 +NW_019805492.1 HSCHR3_6_CTG2_1 +NW_021159989.1 HSCHR3_5_CTG1 +NW_015495300.1 HG2023_PATCH +NW_021159990.1 HG699_PATCH +NW_021159991.1 HG2525_PATCH +NW_021159992.1 HG1299_PATCH +NW_021159993.1 HG1298_PATCH +NW_021159994.1 HG1296_PATCH +NW_021159995.1 HG705_PATCH +NW_025791773.1 HG2155_PATCH +NW_025791774.1 HG287_PATCH +NW_013171799.1 HSCHR4_2_CTG4 +NW_013171800.1 HSCHR4_8_CTG12 +NW_013171801.1 HSCHR4_9_CTG12 +NW_015495301.1 HSCHR4_11_CTG12 +NW_017363814.1 HSCHR4_12_CTG12 +NW_025791772.1 HSCHR4_2_CTG8_1 +NW_016107298.1 HG30_PATCH +NW_021159996.1 HG1395_PATCH +NW_025791775.1 HG1046_PATCH +NW_025791776.1 HG2476_PATCH +NW_025791777.1 HG2405_PATCH +NW_025791778.1 HG2308_PATCH +NW_009646199.1 HSCHR5_7_CTG1 +NW_016107297.1 HSCHR5_8_CTG1 +NW_018654712.1 HSCHR5_9_CTG1 +NW_025791779.1 HSCHR5_10_CTG1 +NW_009646200.1 HG2128_PATCH +NW_012132918.1 HG1651_PATCH +NW_013171802.1 HG2072_PATCH +NW_017363815.1 HG2121_PATCH +NW_018654713.1 HG2057_PATCH +NW_021159997.1 HG563_PATCH +NW_013171803.1 HSCHR6_1_CTG10 +NW_025791780.1 HSCHR6_1_CTG1 +NW_012132919.1 HG2239_PATCH +NW_017852929.1 HG2088_PATCH +NW_017852930.1 HG2266_PATCH +NW_018654714.1 HG708_PATCH +NW_021159998.1 HG1309_PATCH +NW_018654715.1 HSCHR7_3_CTG4_4 +NW_019805493.1 HSCHR7_3_CTG1 +NW_025791781.1 HSCHR7_4_CTG1 +NW_017852931.1 HG2067_PATCH +NW_017852932.1 HG2068_PATCH +NW_018654716.1 HG2419_PATCH +NW_018654717.1 HG76_PATCH +NW_025791782.1 HG2176_PATCH +NW_025791783.1 HG1047_PATCH +NW_025791784.1 HG2408_PATCH +NW_025791785.1 HG2267_PATCH +NW_025791786.1 HG2031_PATCH +NW_019805494.1 HSCHR8_7_CTG7 +NW_009646201.1 HG2030_PATCH +NW_021159999.1 HG613_PATCH +NW_025791787.1 HG2158_PATCH +NW_025791788.1 HG1012_PATCH +NW_025791789.1 HG1206_PATCH +NW_013171804.1 HSCHR9_1_CTG6 +NW_013171805.1 HSCHR9_1_CTG7 +NW_009646202.1 HG2191_PATCH +NW_011332692.1 HG2241_PATCH +NW_011332693.1 HG2242_HG2243_PATCH +NW_011332694.1 HG2244_HG2245_PATCH +NW_013171807.1 HG2334_PATCH +NW_021160000.1 HG545_PATCH +NW_021160001.1 HG1277_PATCH 
+NW_025791790.1 HG2576_PATCH +NW_013171806.1 HSCHR10_1_CTG6 +NW_009646203.1 HG2217_PATCH +NW_013171808.1 HG2116_PATCH +NW_015148966.2 HG107_HG2565_PATCH +NW_017363816.1 HG1708_PATCH +NW_019805495.1 HG2060_PATCH +NW_019805496.1 HG2114_PATCH +NW_021160002.1 HG1521_PATCH +NW_021160003.1 HG1445_PATCH +NW_021160004.1 HG28_PATCH +NW_021160005.1 HG2115_PATCH +NW_021160006.1 HG2111_PATCH +NW_025791792.1 HG152_PATCH +NW_025791793.1 HG2568_PATCH +NW_025791794.1 HG2578_PATCH +NW_011332695.1 HSCHR11_1_CTG1_2 +NW_019805497.1 HSCHR11_2_CTG8 +NW_019805498.1 HSCHR11_1_CTG3_1 +NW_025791791.1 HSCHR11_2_CTG3_1 +NW_009646204.1 HG23_PATCH +NW_011332696.1 HG1362_PATCH +NW_011332697.1 HG2247_PATCH +NW_015148967.1 HG2063_PATCH +NW_018654718.1 HG1815_PATCH +NW_018654719.1 HG2047_PATCH +NW_021160007.1 HG2246_HG2248_HG2276_PATCH +NW_021160008.1 HG1398_PATCH +NW_025791795.1 HG2554_PATCH +NW_013171809.1 HSCHR12_2_CTG1 +NW_018654720.1 HSCHR12_8_CTG2_1 +NW_019805499.1 HSCHR12_9_CTG2_1 +NW_009646205.1 HG2216_PATCH +NW_011332698.1 HG2288_HG2289_PATCH +NW_011332699.1 HG2291_PATCH +NW_011332700.1 HG2249_PATCH +NW_021160009.1 HG1817_1_PATCH +NW_021160010.1 HG1523_PATCH +NW_021160011.1 HG1524_PATCH +NW_021160012.1 HG2509_PATCH +NW_013171810.1 HSCHR13_1_CTG7 +NW_013171811.1 HSCHR13_1_CTG8 +NW_018654722.1 HG1_PATCH +NW_021160013.1 HG2510_PATCH +NW_025791796.1 HG2526_HG2573_PATCH +NW_018654721.1 HSCHR14_8_CTG1 +NW_021160014.1 HSCHR14_9_CTG1 +NW_011332701.1 HG2139_PATCH +NW_021160015.1 HG2499_PATCH +NW_021160016.1 HG2198_PATCH +NW_021160017.1 HG2365_PATCH +NW_021160018.1 HG2511_PATCH +NW_025791797.1 HG2280_PATCH +NW_012132920.1 HSCHR15_6_CTG8 +NW_025791798.1 HSCHR15_9_CTG8 +NW_017852933.1 HG926_PATCH +NW_019805500.1 HG2263_PATCH +NW_021160019.1 HG2471_PATCH +NW_025791799.1 HG401_PATCH +NW_025791800.1 HG405_PATCH +NW_012132921.1 HSCHR16_3_CTG3_1 +NW_013171812.1 HSCHR16_5_CTG1 +NW_013171813.1 HSCHR16_4_CTG3_1 +NW_018654723.1 HSCHR16_5_CTG3_1 +NW_016107299.1 HG2046_PATCH +NW_017363817.1 
HG2285_HG106_HG2252_PATCH +NW_021160020.1 HG2087_PATCH +NW_021160021.1 HG1320_PATCH +NW_025791802.1 HG2118_PATCH +NW_025791803.1 HG2407_PATCH +NW_025791804.1 HG2251_PATCH +NW_025791805.1 HG1369_PATCH +NW_025791806.1 HG2580_PATCH +NW_017363818.1 HSCHR17_11_CTG4 +NW_017363819.1 HSCHR17_3_CTG1 +NW_019805501.1 HSCHR17_12_CTG4 +NW_025791801.1 HSCHR17_13_CTG4 +NW_013171814.1 HG2213_PATCH +NW_018654724.1 HG2442_PATCH +NW_019805502.1 HG2412_PATCH +NW_014040928.1 HSCHR18_5_CTG1_1 +NW_019805503.1 HSCHR18_1_CTG1 +NW_009646206.1 HG2021_PATCH +NW_014040929.1 HG26_PATCH +NW_021160022.1 HG109_PATCH +NW_025791807.1 HG2461_PATCH +NW_025791808.1 HG2569_PATCH +NW_025791809.1 HG2469_PATCH +NW_016107300.1 HSCHR19KIR_0019-4656-A_CTG3_1 +NW_016107301.1 HSCHR19KIR_CA01-TA01_1_CTG3_1 +NW_016107302.1 HSCHR19KIR_CA01-TA01_2_CTG3_1 +NW_016107303.1 HSCHR19KIR_CA01-TB04_CTG3_1 +NW_016107304.1 HSCHR19KIR_CA01-TB01_CTG3_1 +NW_016107305.1 HSCHR19KIR_HG2394_CTG3_1 +NW_016107306.1 HSCHR19KIR_502960008-2_CTG3_1 +NW_016107307.1 HSCHR19KIR_502960008-1_CTG3_1 +NW_016107308.1 HSCHR19KIR_0010-5217-AB_CTG3_1 +NW_016107309.1 HSCHR19KIR_7191059-1_CTG3_1 +NW_016107310.1 HSCHR19KIR_0019-4656-B_CTG3_1 +NW_016107311.1 HSCHR19KIR_CA04_CTG3_1 +NW_016107312.1 HSCHR19KIR_HG2393_CTG3_1 +NW_016107313.1 HSCHR19KIR_7191059-2_CTG3_1 +NW_016107314.1 HSCHR19KIR_HG2396_CTG3_1 +NW_025791810.1 HSCHR19_6_CTG2 +NW_025791811.1 HG2225_PATCH +NW_025791812.1 HG410_PATCH +NW_021160023.1 HG2513_PATCH +NW_025791813.1 HG2219_PATCH +NW_025791814.1 HG2265_PATCH +NW_025791815.1 HG2521_PATCH +NW_015148969.2 HG1311_HG2539_PATCH +NW_021160024.1 HG1485_PATCH +NW_021160025.1 HG494_PATCH +NW_021160026.1 HG2512_PATCH +NW_009646207.1 HSCHR22_4_CTG1 +NW_009646208.1 HSCHR22_5_CTG1 +NW_014040930.1 HSCHR22_6_CTG1 +NW_014040931.1 HSCHR22_7_CTG1 +NW_015148968.1 HSCHR22_8_CTG1 +NW_021160027.1 HG439_PATCH +NW_021160028.1 HG1506_PATCH +NW_021160029.1 HG1507_PATCH +NW_021160030.1 HG1509_PATCH +NW_021160031.1 HG1466_PATCH +NW_025791816.1 HG2527_PATCH 
+NW_025791817.1 HG2541_PATCH +NW_017363820.1 HSCHRX_3_CTG7 +NW_025791818.1 HSCHRX_1_CTG14 +NW_025791819.1 HSCHRX_2_CTG14 +NW_025791820.1 HSCHRX_3_CTG3 +NW_009646209.1 HG2062_PATCH +NW_018654725.1 HG1531_PATCH +NW_018654726.1 HG1535_PATCH +NW_025791821.1 HG1532_PATCH +NW_003315905.1 HSCHR1_1_CTG31 +NW_003315906.1 HSCHR1_2_CTG31 +NW_003315907.2 HSCHR1_3_CTG31 +NT_187516.1 HSCHR1_1_CTG32_1 +NT_187514.1 HSCHR1_1_CTG11 +NT_187518.1 HSCHR1_2_CTG32_1 +NT_187515.1 HSCHR1_1_CTG3 +NT_187519.1 HSCHR1_3_CTG32_1 +NT_187521.1 HSCHR1_4_CTG32_1 +NT_187520.1 HSCHR1_4_CTG31 +NT_187517.1 HSCHR1_2_CTG3 +NW_003315908.1 HSCHR2_1_CTG5 +NW_003315909.1 HSCHR2_1_CTG7_2 +NW_003571033.2 HSCHR2_2_CTG7_2 +NT_187523.1 HSCHR2_1_CTG15 +NT_187528.1 HSCHR2_3_CTG7_2 +NT_187522.1 HSCHR2_1_CTG1 +NT_187525.1 HSCHR2_2_CTG1 +NT_187530.1 HSCHR2_4_CTG7_2 +NT_187524.1 HSCHR2_1_CTG7 +NT_187526.1 HSCHR2_3_CTG1 +NT_187529.1 HSCHR2_4_CTG1 +NT_187531.1 HSCHR2_5_CTG7_2 +NT_187527.1 HSCHR2_3_CTG15 +NW_003315913.1 HSCHR3_1_CTG2_1 +NW_003871060.2 HSCHR3_1_CTG1 +NT_187533.1 HSCHR3_2_CTG2_1 +NT_187536.1 HSCHR3_3_CTG2_1 +NT_187532.1 HSCHR3_1_CTG3 +NT_187537.1 HSCHR3_4_CTG2_1 +NT_187538.1 HSCHR3_5_CTG2_1 +NT_187534.1 HSCHR3_2_CTG3 +NT_187535.1 HSCHR3_3_CTG1 +NT_187539.1 HSCHR3_9_CTG3 +NT_167250.2 HSCHR4_1_CTG9 +NW_003315914.1 HSCHR4_1_CTG12 +NW_003315915.1 HSCHR4_1_CTG6 +NT_187542.1 HSCHR4_2_CTG12 +NT_187543.1 HSCHR4_3_CTG12 +NT_187541.1 HSCHR4_1_CTG8_1 +NT_187544.1 HSCHR4_4_CTG12 +NT_187545.1 HSCHR4_5_CTG12 +NT_187540.1 HSCHR4_1_CTG4 +NW_003315917.2 HSCHR5_2_CTG1_1 +NW_003315918.1 HSCHR5_3_CTG1_1 +NW_003315919.1 HSCHR5_1_CTG5 +NW_003315920.1 HSCHR5_1_CTG1 +NW_003571036.1 HSCHR5_2_CTG1 +NT_187547.1 HSCHR5_3_CTG1 +NT_187548.1 HSCHR5_4_CTG1 +NT_187550.1 HSCHR5_5_CTG1 +NT_187551.1 HSCHR5_6_CTG1 +NT_187546.1 HSCHR5_2_CTG5 +NT_187549.1 HSCHR5_4_CTG1_1 +NT_167244.2 HSCHR6_MHC_APD_CTG1 +NW_003315921.1 HSCHR6_1_CTG2 +NW_004166862.2 HSCHR6_1_CTG3 +NT_187552.1 HSCHR6_1_CTG4 +NT_187553.1 HSCHR6_1_CTG5 +NT_187554.1 HSCHR6_1_CTG6 
+NT_187555.1 HSCHR6_1_CTG7 +NT_187556.1 HSCHR6_1_CTG8 +NT_187557.1 HSCHR6_1_CTG9 +NW_003315922.2 HSCHR7_1_CTG6 +NT_187562.1 HSCHR7_2_CTG6 +NT_187558.1 HSCHR7_1_CTG1 +NT_187560.1 HSCHR7_1_CTG7 +NT_187559.1 HSCHR7_1_CTG4_4 +NT_187563.1 HSCHR7_2_CTG7 +NT_187564.1 HSCHR7_3_CTG6 +NT_187561.1 HSCHR7_2_CTG4_4 +NT_187567.1 HSCHR8_1_CTG7 +NT_187565.1 HSCHR8_1_CTG1 +NT_187568.1 HSCHR8_2_CTG1 +NT_187570.1 HSCHR8_3_CTG1 +NT_187566.1 HSCHR8_1_CTG6 +NT_187569.1 HSCHR8_2_CTG7 +NT_187571.1 HSCHR8_3_CTG7 +NT_187573.1 HSCHR8_4_CTG7 +NT_187572.1 HSCHR8_4_CTG1 +NT_187574.1 HSCHR8_5_CTG7 +NT_187575.1 HSCHR8_6_CTG7 +NT_187576.1 HSCHR8_8_CTG1 +NT_187577.1 HSCHR8_9_CTG1 +NW_003315928.1 HSCHR9_1_CTG1 +NW_003315929.1 HSCHR9_1_CTG2 +NW_003315930.1 HSCHR9_1_CTG3 +NW_003315931.1 HSCHR9_1_CTG4 +NT_187578.1 HSCHR9_1_CTG5 +NW_003315934.1 HSCHR10_1_CTG1 +NW_003315935.1 HSCHR10_1_CTG2 +NT_187579.1 HSCHR10_1_CTG3 +NW_003315936.1 HSCHR11_1_CTG1_1 +NW_003871073.1 HG142_HG150_NOVEL_TEST +NW_003871074.1 HG151_NOVEL_TEST +NT_187581.1 HSCHR11_1_CTG2 +NT_187582.1 HSCHR11_1_CTG3 +NT_187583.1 HSCHR11_1_CTG5 +NT_187584.1 HSCHR11_1_CTG6 +NT_187585.1 HSCHR11_1_CTG7 +NT_187586.1 HSCHR11_1_CTG8 +NW_003315938.1 HSCHR12_1_CTG2 +NW_003315939.2 HSCHR12_1_CTG2_1 +NW_003315940.1 HSCHR12_4_CTG2_1 +NW_003315941.1 HSCHR12_2_CTG2_1 +NW_003315942.2 HSCHR12_3_CTG2_1 +NW_003571049.1 HSCHR12_1_CTG1 +NW_003571050.1 HSCHR12_2_CTG2 +NT_187589.1 HSCHR12_5_CTG2_1 +NT_187590.1 HSCHR12_6_CTG2_1 +NT_187587.1 HSCHR12_4_CTG2 +NT_187591.1 HSCHR12_7_CTG2_1 +NT_187588.1 HSCHR12_5_CTG2 +NT_187592.1 HSCHR13_1_CTG1 +NT_187593.1 HSCHR13_1_CTG2 +NT_187594.1 HSCHR13_1_CTG3 +NT_187595.1 HSCHR13_1_CTG4 +NT_187596.1 HSCHR13_1_CTG5 +NT_187597.1 HSCHR13_1_CTG6 +NT_187598.1 HSCHR14_1_CTG1 +NT_187599.1 HSCHR14_2_CTG1 +NT_187600.1 HSCHR14_3_CTG1 +NT_187601.1 HSCHR14_7_CTG1 +NW_003315943.1 HSCHR15_1_CTG8 +NW_003315944.2 HSCHR15_2_CTG8 +NT_187603.1 HSCHR15_1_CTG3 +NT_187605.1 HSCHR15_3_CTG8 +NT_187606.1 HSCHR15_5_CTG8 +NT_187604.1 HSCHR15_3_CTG3 
+NT_187602.1 HSCHR15_1_CTG1 +NW_003315945.1 HSCHR16_1_CTG3_1 +NW_003315946.1 HSCHR16_2_CTG3_1 +NT_187607.1 HSCHR16_1_CTG1 +NT_187610.1 HSCHR16_CTG2 +NT_187608.1 HSCHR16_3_CTG1 +NT_187609.1 HSCHR16_4_CTG1 +NT_167251.2 HSCHR17_1_CTG5 +NW_003315952.3 HSCHR17_1_CTG1 +NW_003315953.2 HSCHR17_1_CTG4 +NW_003315954.1 HSCHR17_2_CTG4 +NW_003315955.1 HSCHR17_3_CTG4 +NW_003871091.1 HSCHR17_4_CTG4 +NW_003871092.1 HSCHR17_5_CTG4 +NT_187614.1 HSCHR17_7_CTG4 +NT_187615.1 HSCHR17_8_CTG4 +NT_187616.1 HSCHR17_9_CTG4 +NT_187612.1 HSCHR17_1_CTG9 +NT_187611.1 HSCHR17_1_CTG2 +NT_187613.1 HSCHR17_2_CTG2 +NW_003315956.1 HSCHR18_1_CTG1_1 +NW_003315957.1 HSCHR18_1_CTG2 +NW_003315958.1 HSCHR18_1_CTG2_1 +NW_003315959.1 HSCHR18_2_CTG1_1 +NW_003315960.1 HSCHR18_2_CTG2 +NW_003315961.1 HSCHR18_2_CTG2_1 +NT_187617.1 HSCHR18_3_CTG2_1 +NT_187618.1 HSCHR18_4_CTG1_1 +NW_003315962.1 HSCHR19_1_CTG2 +NW_003315963.1 HSCHR19_1_CTG3_1 +NW_003315964.2 HSCHR19_2_CTG2 +NW_003315965.1 HSCHR19_3_CTG2 +NW_003571054.1 HSCHR19LRC_COX1_CTG3_1 +NT_187621.1 HSCHR19_4_CTG2 +NT_187619.1 HSCHR19_2_CTG3_1 +NT_187620.1 HSCHR19_3_CTG3_1 +NT_187622.1 HSCHR19_5_CTG2 +NW_003315966.2 HSCHR20_1_CTG1 +NT_187623.1 HSCHR20_1_CTG2 +NT_187624.1 HSCHR20_1_CTG3 +NT_187625.1 HSCHR20_1_CTG4 +NW_003315967.2 HSCHR21_1_CTG1_1 +NW_003315968.2 HSCHR21_2_CTG1_1 +NW_003315969.2 HSCHR21_3_CTG1_1 +NW_003315970.2 HSCHR21_4_CTG1_1 +NT_187626.1 HSCHR21_5_CTG2 +NT_187627.1 HSCHR21_6_CTG1_1 +NT_187628.1 HSCHR21_8_CTG1_1 +NW_003315971.2 HSCHR22_1_CTG1 +NW_003315972.2 HSCHR22_1_CTG2 +NT_187629.1 HSCHR22_1_CTG3 +NT_187630.1 HSCHR22_1_CTG4 +NT_187631.1 HSCHR22_1_CTG5 +NT_187632.1 HSCHR22_1_CTG6 +NT_187633.1 HSCHR22_1_CTG7 +NT_187634.1 HSCHRX_1_CTG3 +NT_187635.1 HSCHRX_2_CTG12 +NT_187646.1 HSCHR1_ALT2_1_CTG32_1 +NT_187647.1 HSCHR2_2_CTG15 +NT_187648.1 HSCHR2_2_CTG7 +NT_187649.1 HSCHR3_3_CTG3 +NT_187650.1 HSCHR4_6_CTG12 +NT_187651.1 HSCHR5_1_CTG1_1 +NT_187652.1 HSCHR5_3_CTG5 +NT_113891.3 HSCHR6_MHC_COX_CTG1 +NT_187653.1 HSCHR7_2_CTG1 +NT_187654.1 
HSCHR8_5_CTG1 +NT_187655.1 HSCHR8_6_CTG1 +NT_187656.1 HSCHR11_2_CTG1 +NT_187657.1 HSCHR11_2_CTG1_1 +NT_187658.1 HSCHR12_3_CTG2 +NT_187660.1 HSCHR15_4_CTG8 +NT_187659.1 HSCHR15_2_CTG3 +NW_003871093.1 HSCHR17_6_CTG4 +NT_187662.1 HSCHR17_2_CTG1 +NT_187663.1 HSCHR17_2_CTG5 +NT_187661.1 HSCHR17_10_CTG4 +NT_187664.1 HSCHR17_3_CTG2 +NT_187666.1 HSCHR18_ALT2_CTG2_1 +NT_187665.1 HSCHR18_ALT21_CTG2_1 +NW_003571055.2 HSCHR19LRC_COX2_CTG3_1 +NW_004504305.1 HSCHR22_2_CTG1 +NT_187667.1 HSCHRX_2_CTG3 +NT_187678.1 HSCHR3_4_CTG3 +NT_187679.1 HSCHR4_7_CTG12 +NT_167245.2 HSCHR6_MHC_DBB_CTG1 +NT_187680.1 HSCHR8_7_CTG1 +NT_187681.1 HSCHR11_3_CTG1 +NW_003571056.2 HSCHR19LRC_LRC_I_CTG3_1 +NT_187682.1 HSCHR22_3_CTG1 +NT_187688.1 HSCHR3_5_CTG3 +NT_167246.2 HSCHR6_MHC_MANN_CTG1 +NW_003571057.2 HSCHR19LRC_LRC_J_CTG3_1 +NT_187689.1 HSCHR3_6_CTG3 +NT_167247.2 HSCHR6_MHC_MCF_CTG1 +NW_003571058.2 HSCHR19LRC_LRC_S_CTG3_1 +NT_187690.1 HSCHR3_7_CTG3 +NT_167248.2 HSCHR6_MHC_QBL_CTG1 +NW_003571059.2 HSCHR19LRC_LRC_T_CTG3_1 +NT_187691.1 HSCHR3_8_CTG3 +NT_167249.2 HSCHR6_MHC_SSTO_CTG1 +NW_003571060.1 HSCHR19LRC_PGF1_CTG3_1 +NT_187692.1 HSCHR6_8_CTG1 +NW_003571061.2 HSCHR19LRC_PGF2_CTG3_1 +NT_187693.1 HSCHR19_4_CTG3_1 +NT_187636.1 HSCHR19KIR_FH15_B_HAP_CTG3_1 +NT_187637.1 HSCHR19KIR_G085_A_HAP_CTG3_1 +NT_187638.1 HSCHR19KIR_G085_BA1_HAP_CTG3_1 +NT_187639.1 HSCHR19KIR_G248_A_HAP_CTG3_1 +NT_187640.1 HSCHR19KIR_G248_BA2_HAP_CTG3_1 +NT_187641.1 HSCHR19KIR_GRC212_AB_HAP_CTG3_1 +NT_187642.1 HSCHR19KIR_GRC212_BA1_HAP_CTG3_1 +NT_187643.1 HSCHR19KIR_LUCE_A_HAP_CTG3_1 +NT_187644.1 HSCHR19KIR_LUCE_BDEL_HAP_CTG3_1 +NT_187645.1 HSCHR19KIR_RSH_A_HAP_CTG3_1 +NT_187668.1 HSCHR19KIR_RSH_BA2_HAP_CTG3_1 +NT_187669.1 HSCHR19KIR_T7526_A_HAP_CTG3_1 +NT_187670.1 HSCHR19KIR_T7526_BDEL_HAP_CTG3_1 +NT_187671.1 HSCHR19KIR_ABC08_A1_HAP_CTG3_1 +NT_187672.1 HSCHR19KIR_ABC08_AB_HAP_C_P_CTG3_1 +NT_187673.1 HSCHR19KIR_ABC08_AB_HAP_T_P_CTG3_1 +NT_187674.1 HSCHR19KIR_FH05_A_HAP_CTG3_1 +NT_187675.1 HSCHR19KIR_FH05_B_HAP_CTG3_1 
+NT_187676.1 HSCHR19KIR_FH06_A_HAP_CTG3_1 +NT_187677.1 HSCHR19KIR_FH06_BA1_HAP_CTG3_1 +NT_187683.1 HSCHR19KIR_FH08_A_HAP_CTG3_1 +NT_187684.1 HSCHR19KIR_FH08_BAX_HAP_CTG3_1 +NT_187685.1 HSCHR19KIR_FH13_A_HAP_CTG3_1 +NT_187686.1 HSCHR19KIR_FH13_BA2_HAP_CTG3_1 +NT_187687.1 HSCHR19KIR_FH15_A_HAP_CTG3_1 +NT_113949.2 HSCHR19KIR_RP5_B_HAP_CTG3_1 +NC_012920.1 M +na HSCHR11_CTG1_UNLOCALIZED +na HSCHR22_UNLOCALIZED_CTG4 +na HSCHRUN_RANDOM_CTG29 +na HSCHR10_1_CTG4 diff --git a/convert/vcf_rename.chrnames b/convert/vcf_rename.chrnames new file mode 100644 index 0000000..b97b080 --- /dev/null +++ b/convert/vcf_rename.chrnames @@ -0,0 +1,25 @@ +chr1 1 +chr2 2 +chr3 3 +chr4 4 +chr5 5 +chr6 6 +chr7 7 +chr8 8 +chr9 9 +chr10 10 +chr11 11 +chr12 12 +chr13 13 +chr14 14 +chr15 15 +chr16 16 +chr17 17 +chr18 18 +chr19 19 +chr20 20 +chr21 21 +chr22 22 +chrX X +chrY Y +chrM M \ No newline at end of file
diff --git a/database.sh b/database.sh
new file mode 100644
index 0000000..2d00242
--- /dev/null
+++ b/database.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+set -euo pipefail
+
+# Download and prepare the reference databases used by the IDeRare pipeline:
+#   01. GRCh38.p14 reference FASTA from UCSC, indexed with bwa-mem2 and samtools
+#   02. dbNSFP, merged into one bgzip-compressed and tabix-indexed table
+#   03. dbSNP b156, with contigs renamed for SnpSift and INFO values escaped
+#   04. ClinVar VCF with its tabix index
+# Run it from the IDeRare repository root, because the chromosome rename map
+# is read from ./convert. Files are written to TARGET_PATH, created if missing.
+CURRENT_PATH=$(pwd)
+TARGET_PATH="$HOME/Downloads/Database"
+CHR_RENAME_MAP="${CURRENT_PATH}/convert/GCF_000001405.40_GRCh38.p14_assembly_report_revised_snpsift.chrnames"
+
+# Fail before any multi-gigabyte download when a prerequisite is missing.
+for tool in wget gzip zcat unzip nproc bwa-mem2 samtools bgzip tabix bcftools SnpSift; do
+  command -v "$tool" > /dev/null || { echo "Missing required tool: $tool" >&2; exit 1; }
+done
+[[ -f "$CHR_RENAME_MAP" ]] || { echo "Cannot find $CHR_RENAME_MAP - run this script from the IDeRare repository root" >&2; exit 1; }
+
+# bgzip and bcftools need a numeric thread count, so resolve nproc once here.
+THREADS=$(nproc)
+
+# Print the INFO field messages of SnpSift vcfCheck for the VCF given as $1,
+# counted per quoted name. grep exits with 1 when nothing matches, which is
+# the expected outcome for a clean file, so that status must not trip pipefail.
+summarize_vcf_check() {
+  SnpSift vcfCheck "$1" 2>&1 \
+    | { grep "INFO field" || [[ $? -eq 1 ]]; } \
+    | cut -f 2 -d "'" | sort | uniq -c
+}
+
+# Ensure the directory exists
+mkdir -p "$TARGET_PATH"
+
+# Change to the target directory
+cd "$TARGET_PATH"
+
+echo "Starting Download All Neccessary Database, ensure stable connection"
+
+echo "01. Downloading Reference Sequence GRCh38.p14 from UCSC"
+## Reference Sequence GRCh38.p14
+wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p14/hg38.p14.fa.gz -O hg38.p14.fa.gz
+# -f replaces a FASTA left over from an earlier run instead of refusing to overwrite it
+gzip -df hg38.p14.fa.gz
+
+echo "Run indexing with bwa-mem2, ensure your Memory + Swap > 76GB for smoother experiences"
+bwa-mem2 index hg38.p14.fa
+
+echo "Create index using samtools"
+samtools faidx hg38.p14.fa
+
+echo "02. Download and process dbNSFP from https://sites.google.com/site/jpopgen/dbNSFP?pli=1"
+## Paper used dbNSFP4.4a, This link download dbNSFP4.5a, see full release update on https://sites.google.com/site/jpopgen/dbNSFP/changelog?authuser=0
+dbNSFP_version=4.5a
+dbNSFP_prefix="dbNSFP${dbNSFP_version}_variant"
+
+echo "02a. Downloading dbNSFP${dbNSFP_version}"
+wget https://usf.box.com/shared/static/2hzcx5s6p1xui7oen16xqzndfrkt8l9l -O "dbNSFP${dbNSFP_version}.zip"
+unzip -o "dbNSFP${dbNSFP_version}.zip"
+
+echo "02c. Processing the file into a single file version"
+
+### Create Header
+# Read only the first line through process substitution. Piping zcat into head
+# instead would let head close the pipe early, killing zcat with SIGPIPE, and
+# pipefail would then abort the whole script.
+IFS= read -r dbNSFP_header < <(zcat "${dbNSFP_prefix}.chr1.gz")
+printf '%s\n' "$dbNSFP_header" | bgzip > header.gz
+
+### Concatenate chr1-22, X, Y and M without their header lines
+chrom_tables=()
+for chrom in {1..22} X Y M; do
+  chrom_tables+=("${dbNSFP_prefix}.chr${chrom}.gz")
+done
+zcat "${chrom_tables[@]}" | grep -v '#chr' | bgzip --threads "$THREADS" > "dbNSFPv${dbNSFP_version}_custom.gz"
+
+### Add header back into file
+cat header.gz "dbNSFPv${dbNSFP_version}_custom.gz" > "dbNSFPv${dbNSFP_version}_custombuild.gz"
+
+### Sanity check: count the distinct values of column 1 on non-comment lines
+zcat "dbNSFPv${dbNSFP_version}_custombuild.gz" | awk '!/^#/{print $0}' | cut -f 1 | sort -u | wc -l
+
+### Create tabix index
+tabix -f -s 1 -b 2 -e 2 "dbNSFPv${dbNSFP_version}_custombuild.gz"
+
+
+echo "03. Download and Processing dbSNP from NCBI FTP"
+
+echo "03a. Downloading dbSNP b156"
+# Take the VCF and its index from the same archived build so they always match
+dbsnp_url=https://ftp.ncbi.nih.gov/snp/archive/b156/VCF
+wget "${dbsnp_url}/GCF_000001405.40.gz" -O GCF_000001405.40.gz
+wget "${dbsnp_url}/GCF_000001405.40.gz.tbi" -O GCF_000001405.40.gz.tbi
+wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt -O GCF_000001405.40_GRCh38.p14_assembly_report.txt
+
+echo "03b. Processing the file into SnpSift ready file"
+bcftools annotate \
+  --rename-chrs "$CHR_RENAME_MAP" \
+  --threads "$THREADS" -Oz \
+  -o Homo_sapiens_assembly38.dbsnp156_snpsift.vcf.gz \
+  GCF_000001405.40.gz
+
+
+echo "03c. Check and filter out forbidden charset annotation"
+## Reference : https://pcingola.github.io/SnpEff/ss_faq/
+summarize_vcf_check Homo_sapiens_assembly38.dbsnp156_snpsift.vcf.gz
+
+# A single sed applies the substitutions in their original order, and the
+# output is compressed on the fly so no uncompressed VCF is written to disk.
+zcat Homo_sapiens_assembly38.dbsnp156_snpsift.vcf.gz \
+  | sed -e 's/\&base_change=/\&base_change%3D/g' \
+    -e 's/A=;/A%3D;/' \
+    -e 's/C=;/C%3D;/' \
+    -e 's/G=;/G%3D;/' \
+    -e 's/T=;/T%3D;/' \
+    -e 's/=,;/%3D,/' \
+  | bgzip --threads "$THREADS" -c > Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf.gz
+
+# Index the escaped file so it can be used as a database
+tabix -f Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf.gz
+
+summarize_vcf_check Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf.gz
+
+
+echo "04. Download and Processing ClinVar from NCBI FTP"
+
+echo "04a. Downloading ClinVar data"
+# Paper used 20230514, while the most recent one is 20240215
+# File version of 20230514 :
+## https://ftp.ncbi.nih.gov/pub/clinvar/vcf_GRCh38/archive_2.0/2023/clinvar_20230514.vcf.gz
+## https://ftp.ncbi.nih.gov/pub/clinvar/vcf_GRCh38/archive_2.0/2023/clinvar_20230514_papu.vcf.gz.tbi
+clinvar_version=20240215
+
+wget "https://ftp.ncbi.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_${clinvar_version}.vcf.gz" -O "clinvar_${clinvar_version}.vcf.gz"
+tabix -f "clinvar_${clinvar_version}.vcf.gz"
+
+echo "04b. SnpSift check"
+summarize_vcf_check "clinvar_${clinvar_version}.vcf.gz"
diff --git a/iderare.sh b/iderare.sh new file mode 100644 index 0000000..32903b1 --- /dev/null +++ b/iderare.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -euo pipefail + +python3 iderare_prep.py +source pipeline.sh \ No newline at end of file diff --git a/iderare.yml b/iderare.yml new file mode 100644 index 0000000..174b63a --- /dev/null +++ b/iderare.yml @@ -0,0 +1,71 @@ +analysis : + # Full Path of trio data + data_dir : /home/ivanwilliamharsono/Downloads/trio_patient + + # Note : + # 1. Ensure all proband, father, mother saved on the same path + # 2. Ensure file name is cleaned .fq.gz, if your file is raw untrimmed fastq, adjust pipeline_template.sh, uncomment Step 0 + # 3. Insert proband, mother, father file name (not full path) without .fq.gz file + # 4. File should be prepared on {data_dir}/input/A_FASTQ + proband : V350145665_L04_B5EHOMdmhwXAAAA-515 + mother : V350145665_L04_B5EHOMdmhwXAABA-517 + father : V350145665_L04_B5EHOMdmhwXAACA-519 + + # Gender setting 1=male, 2=female, 0=unknown + # Proband Gender + proband_gender : 1 + + # Phenotype -9=missing, 0=missing; 1=unaffected; 2=affected + proband_phen : 2 + mother_phen : 1 + father_phen : 1 + + # HPO IDs of patient + hpo_ids : ['HP:0002366', 'HP:0005561', 'HP:0001903', 'HP:0010972', 'HP:0001541', 'HP:0004333', 'HP:0001396', 'HP:0002910', 'HP:0001531', 'HP:0001399', 'HP:0001433', 'HP:0003073', 'HP:0003233', 'HP:0006568', 'HP:0002151', 'HP:0200114', 'HP:0001653', 'HP:0000938', 'HP:0001873'] + + # Library Name - Any kind of sequencing method (e.g. AgilentV6) + library : AgilentV6 + # Method - Any kind of sequencing method (e.g. 
DNBSeq) + method : DNBSeq + + +setup : + # deep variant version #1.5.0 - tested for GPU <8GB | v1.6.0 need minimum GPU memory of 16 GB + dv_version : 1.5.0 + + # glnexus and tiddit docker version + glnexus_version : 1.4.1 + tiddit_version : 3.6.1--py38h24c8ff8_0 + + # deep variant model (WES/WGS) + dv_model : WES + + # max memory allowed (G = Gigabyte) + max_mem : 60G + + # refseq directory (full folder), assuming you save it on Downloads/Database + ref_dir : /home/ivanwilliamharsono/Downloads/Database + # refseq fasta file name (file name) + ref_fasta : hg38.p14.fa + + # SnpEff data directory & version + snpEff_dir : /home/ivanwilliamharsono/Downloads/Sandbox/snpEff/data + snpEff_ver : GRCh38.p14 + + # Exomiser data directory & version + exomiser_dir : /home/ivanwilliamharsono/Downloads/Sandbox/exomiser/data/exomiser-data + exomiser_data_ver : 2309 + + #dbNSFP file + dbNSFP_file : /home/ivanwilliamharsono/Downloads/Database/dbNSFPv4.5a_custombuild.gz + + #dbSNP file + dbSNP_file : /home/ivanwilliamharsono/Downloads/Database/Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf.gz + + #ClinVar file + ClinVar_file : /home/ivanwilliamharsono/Downloads/Database/clinvar_20240215.vcf.gz + + # SnpSift adjuster + chr_rename : /home/ivanwilliamharsono/Downloads/IDeRare/convert/vcf_rename.chrnames + + diff --git a/iderare_prep.py b/iderare_prep.py new file mode 100644 index 0000000..f269803 --- /dev/null +++ b/iderare_prep.py @@ -0,0 +1,151 @@ +#!/usr/bin/python + +import os +import yaml + +with open("iderare.yml") as i: + y = yaml.safe_load(i) + + # Load data for trio analysis + data_dir = y['analysis']['data_dir'] + proband = y['analysis']['proband'] + mother = y['analysis']['mother'] + father = y['analysis']['father'] + + proband_gender = y['analysis']['proband_gender'] + proband_phen = y['analysis']['proband_phen'] + mother_phen = y['analysis']['mother_phen'] + father_phen = y['analysis']['father_phen'] + hpo_ids = y['analysis']['hpo_ids'] + + library = 
y['analysis']['library'] + method = y['analysis']['method'] + + # Load data for setup database path / source + dv_version = y['setup']['dv_version'] + dv_model = y['setup']['dv_model'] + + glnexus_version = y['setup']['glnexus_version'] + tiddit_version = y['setup']['tiddit_version'] + max_mem = y['setup']['max_mem'] + + ref_dir = y['setup']['ref_dir'] + ref_fasta = y['setup']['ref_fasta'] + + snpEff_dir = y['setup']['snpEff_dir'] + snpEff_ver = y['setup']['snpEff_ver'] + + exomiser_dir = y['setup']['exomiser_dir'] + exomiser_data_ver = y['setup']['exomiser_data_ver'] + + dbNSFP_file = y['setup']['dbNSFP_file'] + dbSNP_file = y['setup']['dbSNP_file'] + ClinVar_file = y['setup']['ClinVar_file'] + + chr_rename = y['setup']['chr_rename'] + + +# Open template_pipeline.yml and replace {{variable}} with the value from iderare.yml +with open("templates/template_pipeline.sh") as t: + template = t.read() + template = template.replace("{{data_dir}}", data_dir) + template = template.replace("{{proband}}", proband) + template = template.replace("{{mother}}", mother) + template = template.replace("{{father}}", father) + + template = template.replace("{{library}}", library) + template = template.replace("{{method}}", method) + + template = template.replace("{{dv_version}}", dv_version) + template = template.replace("{{dv_model}}", dv_model) + template = template.replace("{{max_mem}}", max_mem) + + template = template.replace("{{glnexus_version}}", glnexus_version) + template = template.replace("{{tiddit_version}}", tiddit_version) + + template = template.replace("{{ref_dir}}", ref_dir) + template = template.replace("{{ref_fasta}}", ref_fasta) + + template = template.replace("{{snpEff_dir}}", snpEff_dir) + template = template.replace("{{snpEff_ver}}", snpEff_ver) + + template = template.replace("{{dbNSFP_file}}", dbNSFP_file) + template = template.replace("{{dbSNP_file}}", dbSNP_file) + template = template.replace("{{ClinVar_file}}", ClinVar_file) + + template = 
template.replace("{{chr_rename}}", chr_rename) + + # Export the new pipeline.yml + with open("pipeline.sh", "w+") as p: + p.write(template) + print("pipeline.sh created") + + +# Open template_trio.ped and replace with value from iderare.yml +with open("templates/template_trio.ped") as t: + template = t.read() + + template = template.replace("{{proband_gender}}", str(proband_gender)) + template = template.replace("{{proband_phen}}", str(proband_phen)) + template = template.replace("{{mother_phen}}", str(mother_phen)) + template = template.replace("{{father_phen}}", str(father_phen)) + + # Check if directory exist + if not os.path.exists(os.path.join(data_dir, "input")): + os.makedirs(os.path.join(data_dir, "input")) + + # Export the new trio.ped + with open(os.path.join(data_dir, "input", "trio.ped"), "w+") as p: + p.write(template) + print("trio.ped created") + +# Create Trio and Solo Exomiser YAML +## Trio +with open("templates/template_exomiser.yml") as i: + y = yaml.safe_load(i) + y['analysis']['vcf'] = data_dir + '/annotated/' + proband + '-SnpEff-dbSNP-ClinVar-dbNSFP_annotated-deepTrio.vcf.gz' + y['analysis']['ped'] = data_dir + "/input/trio.ped" + y['analysis']['proband'] = 'child' + y['analysis']['hpoIds'] = hpo_ids + y['outputOptions']['outputDirectory'] = data_dir + '/exomiser' + y['outputOptions']['outputFileName'] = proband + "-exomiser-trio" + + +with open(os.path.join(data_dir, proband + "_exomiser_trio.yml"), "w+") as o: + yaml.dump(y, o, default_flow_style=False, sort_keys=False) + print(proband + "_exomiser_trio.yml created") + +## Solo +with open("templates/template_exomiser.yml") as i: + y = yaml.safe_load(i) + y['analysis']['vcf'] = data_dir + '/annotated/' + proband + '-SnpEff-dbSNP-ClinVar-dbNSFP_annotated-deepVariant.vcf.gz' + y['analysis']['hpoIds'] = hpo_ids + y['outputOptions']['outputDirectory'] = data_dir + '/exomiser' + y['outputOptions']['outputFileName'] = proband + "-exomiser-solo" + +with open(os.path.join(data_dir, proband + 
"_exomiser_solo.yml"), "w+") as o: + yaml.dump(y, o, default_flow_style=False, sort_keys=False) + print(proband + "_exomiser_solo.yml created") + +## Tiddit +with open("templates/template_exomiser.yml") as i: + y = yaml.safe_load(i) + y['analysis']['vcf'] = data_dir + '/sv_tiddit/' + 'output.filtered.dbnsfp.vcf' + y['analysis']['hpoIds'] = hpo_ids + y['outputOptions']['outputDirectory'] = data_dir + '/exomiser' + y['outputOptions']['outputFileName'] = proband + "-tiddit-exomiser-solo" + +with open(os.path.join(data_dir, proband + "_tiddit_exomiser_solo.yml"), "w+") as o: + yaml.dump(y, o, default_flow_style=False, sort_keys=False) + print(proband + "_tiddit_exomiser_solo.yml created") + +# Open template_application.properties and replace {{variable}} with the value from iderare.yml +with open("templates/template_application.properties") as t: + template = t.read() + template = template.replace("{{exomiser_dir}}", exomiser_dir) + template = template.replace("{{exomiser_data_ver}}", str(exomiser_data_ver)) + + # Export the new application.properties + with open(os.path.join(data_dir, "application.properties"), "w+") as p: + p.write(template) + print("application.properties created") \ No newline at end of file diff --git a/install_dependencies.sh b/install_dependencies.sh new file mode 100644 index 0000000..cbad987 --- /dev/null +++ b/install_dependencies.sh @@ -0,0 +1,176 @@ +#!/bin/bash +set -euo pipefail + +# This script is used to download all the application dependencies used in the paper to the selected Path, make sure you have already create the path +TARGET_PATH="$HOME/Downloads/Sandbox" + +# Download Exomiser data from https://data.monarchinitiative.org/exomiser/data/index.html & extract +exomiser_data_version=2309 + +# Ensure the directory exists +mkdir -p "$TARGET_PATH" + +# Change to the target directory +cd "$TARGET_PATH" + +echo "Download Neccesary Application" + +echo "01. 
Downloading fastp" +# Download fastp from https://github.com/OpenGene/fastp/archive/refs/tags/v0.23.4.zip & extract +wget http://opengene.org/fastp/fastp.0.23.4 +mv fastp.0.23.4 fastp +chmod a+x ./fastp + +echo "02. Downloading bwa-mem2" +# Download bwa-mem2 from https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.2.1/bwa-mem2-2.2.1_x64-linux.tar.bz2 & extract +wget https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.2.1/bwa-mem2-2.2.1_x64-linux.tar.bz2 -O bwa-mem2-2.2.1_x64-linux.tar.bz2 +tar -xvf bwa-mem2-2.2.1_x64-linux.tar.bz2 +mv bwa-mem2-2.2.1_x64-linux bwa-mem2 +chmod a+x ./bwa-mem2/bwa-mem2 + +echo "03. Downloading sambamba" +# Download executable sambamba from https://github.com/biod/sambamba/releases/download/v1.0.1/sambamba-1.0.1-linux-amd64-static.gz & extract +# Used in paper is v1.0.0 conda distribution, but there is no different with the 1.0.1 version +wget https://github.com/biod/sambamba/releases/download/v1.0.1/sambamba-1.0.1-linux-amd64-static.gz -O sambamba-1.0.1-linux-amd64-static.gz +gunzip sambamba-1.0.1-linux-amd64-static.gz +mv sambamba-1.0.1-linux-amd64-static sambamba +chmod a+x ./sambamba + +echo "04. Downloading bcftools" +# Download bcftools and samtools from https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 , https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 & extract +# Used in paper is v1.17, but there is no different with the core feature of --rename-chrs is still the same +wget https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 -O bcftools-1.19.tar.bz2 +tar -xvf bcftools-1.19.tar.bz2 +# Make the bcftools executable +cd bcftools-1.19 +make +chmod a+x ./bcftools +cd ../ + +wget https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 -O samtools-1.19.2.tar.bz2 +tar -xvf samtools-1.19.2.tar.bz2 +# Make the samtools executable +cd samtools-1.19.2 +make +chmod a+x ./samtools +cd ../ + +echo "05. 
Downloading SnpEff and SnpSift" +# Download SnpEff from https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip & extract +wget https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip -O snpEff_latest_core.zip +unzip snpEff_latest_core.zip +chmod a+x ./snpEff/snpEff.jar ./snpEff/SnpSift.jar + +echo "05a. Downloading SnpEff Database" +java -jar ./snpEff/snpEff.jar download GRCh38.105 + +echo "06. Downloading Exomiser" +# Download Exomiser from https://github.com/exomiser/Exomiser/releases/download/13.3.0/exomiser-cli-13.3.0-distribution.zip & extract +wget https://github.com/exomiser/Exomiser/releases/download/13.3.0/exomiser-cli-13.3.0-distribution.zip -O exomiser-cli-13.3.0-distribution.zip +unzip exomiser-cli-13.3.0-distribution.zip +# The distribution zip unpacks into exomiser-cli-13.3.0 (without the -distribution suffix), so that is the directory to rename +mv exomiser-cli-13.3.0 exomiser +chmod a+x ./exomiser/exomiser-cli-13.3.0.jar + +echo "06b. Downloading Exomiser Data" +# The paper used the 2302 database, but the latest version is 2309; the release is taken from exomiser_data_version so the downloads and application.properties always agree +mkdir -p exomiser/data/exomiser-data +wget "https://data.monarchinitiative.org/exomiser/data/${exomiser_data_version}_hg38.zip" -O "exomiser/data/exomiser-data/${exomiser_data_version}_hg38.zip" +wget "https://data.monarchinitiative.org/exomiser/data/${exomiser_data_version}_phenotype.zip" -O "exomiser/data/exomiser-data/${exomiser_data_version}_phenotype.zip" +unzip "exomiser/data/exomiser-data/${exomiser_data_version}_hg38.zip" -d exomiser/data/exomiser-data +unzip "exomiser/data/exomiser-data/${exomiser_data_version}_phenotype.zip" -d exomiser/data/exomiser-data + +echo "06c. Create application properties" +# Unquoted here-document: $TARGET_PATH and $exomiser_data_version are expanded before the text is written to the file +cat <<EOF > exomiser/application.properties +exomiser.data-directory=$TARGET_PATH/exomiser/data/exomiser-data +exomiser.hg38.data-version=$exomiser_data_version +exomiser.phenotype.data-version=$exomiser_data_version +EOF + +echo "07. 
Downloading Tiddit" +# Paper used Tiddit v3.6.0, but there is no difference in the SV features compared to v3.6.1 (minor bug fix) +# Only the image is needed at install time, so pull it instead of starting a container +docker pull quay.io/biocontainers/tiddit:3.6.1--py38h24c8ff8_0 + +echo "STEP 8: Installing Dependency of NVIDIA Docker 2" + +# Options shared by every apt-get call below; the array must be defined because the script runs under set -u +APT_ARGS=(-y) + +# (1) Install nvidia driver: +# https://linuxhint.com/install-cuda-ubuntu/ +sudo apt-get "${APT_ARGS[@]}" update +sudo apt-get "${APT_ARGS[@]}" install \ + build-essential \ + curl \ + "linux-headers-$(uname -r)" \ + nvidia-cuda-toolkit + +# (2) Install CUDA 11.3 unless the cuda-11-3 package is already known to dpkg +# See https://www.tensorflow.org/install/source#gpu for versions required. +if ! dpkg-query -W cuda-11-3; then + echo "Installing CUDA..." + UBUNTU_VERSION="2004" + curl -O "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-ubuntu${UBUNTU_VERSION}.pin" + sudo mv "cuda-ubuntu${UBUNTU_VERSION}.pin" /etc/apt/preferences.d/cuda-repository-pin-600 + # From https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212772 + sudo -H apt-key adv --fetch-keys "http://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/3bf863cc.pub" + sudo add-apt-repository -y "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/ /" + sudo -H apt-get update "${APT_ARGS[@]}" > /dev/null + sudo -H apt-get full-upgrade "${APT_ARGS[@]}" > /dev/null + sudo -H apt-get install "${APT_ARGS[@]}" cuda-11-3 +fi + +echo "Checking for CUDNN..." +if [[ ! -e /usr/local/cuda-11/include/cudnn.h ]]; then + echo "Installing CUDNN..." 
+ CUDNN_TAR_FILE="cudnn-11.3-linux-x64-v8.2.0.53.tgz" + wget -q "https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.0/${CUDNN_TAR_FILE}" + tar -xzvf "${CUDNN_TAR_FILE}" + sudo cp -P cuda/include/cudnn.h /usr/local/cuda-11/include + sudo cp -P cuda/lib64/libcudnn* /usr/local/cuda-11/lib64/ + sudo chmod a+r /usr/local/cuda-11/lib64/libcudnn* + sudo ldconfig +fi +# (3) Install nvidia docker: +# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian +# Add the package repositories +distribution=$(. /etc/os-release;echo "$ID$VERSION_ID") +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + +sudo apt-get update && sudo apt-get install "${APT_ARGS[@]}" nvidia-docker2 +sudo systemctl restart docker + +echo "STEP 8: Dry Run NVIDIA Docker 2 using GPU" +#### Test nvidia-smi with the latest official CUDA image +docker run --gpus 1 nvidia/cuda:11.3.0-base-ubuntu20.04 nvidia-smi + +echo "09. 
Downloading DeepVariant Docker Image" +## Paper used DeepVariant v1.5.0 & GLNexus 1.2.7 +## but the latest version is DeepVariant v1.6.0 & GLNexus v1.4.1, +## Main difference is that DeepVariant 1.6.0 occupy around 16GB of memory (Keras), +## While DeepVariant 1.5.0 used around 6GB of memory and trained on Slim +## See this link for details : https://github.com/google/deepvariant/blob/r1.6/docs/FAQ.md +## Uncomment the 1.6.0 version if you have more than 16GB of GPU memory +# docker pull google/deepvariant:1.6.0-gpu +# docker pull google/deepvariant:deeptrio-1.6.0-gpu +docker pull google/deepvariant:1.5.0-gpu +docker pull google/deepvariant:deeptrio-1.5.0-gpu +docker pull realtimegenomics/rtg-tools +docker pull ghcr.io/dnanexus-rnd/glnexus:v1.4.1 + +echo "Final : Add all the application to the .bashrc" + +# Add aliases to ~/.bashrc\ +echo "" >> ~/.bashrc +echo "alias fastp=\"$TARGET_PATH/fastp\";" >> ~/.bashrc +echo "alias bwa-mem2=\"$TARGET_PATH/bwa-mem2/bwa-mem2\";" >> ~/.bashrc +echo "alias sambamba=\"$TARGET_PATH/sambamba\";" >> ~/.bashrc +echo "alias bcftools=\"$TARGET_PATH/bcftools-1.19/bcftools\";" >> ~/.bashrc +echo "alias samtools=\"$TARGET_PATH/samtools-1.19.2/samtools\";" >> ~/.bashrc +echo "alias SnpEff=\"java -jar $TARGET_PATH/snpEff/snpEff.jar\";" >> ~/.bashrc +echo "alias SnpSift=\"java -jar $TARGET_PATH/snpEff/SnpSift.jar\";" >> ~/.bashrc +echo "alias exomiser=\"java -jar $TARGET_PATH/exomiser/exomiser-cli-13.3.0.jar\";" >> ~/.bashrc + +# Source ~/.bashrc to apply changes +source ~/.bashrc + +echo "Aliases added to ~/.bashrc" \ No newline at end of file diff --git a/picture/01.png b/picture/01.png new file mode 100644 index 0000000..069f71d Binary files /dev/null and b/picture/01.png differ diff --git a/picture/02.png b/picture/02.png new file mode 100644 index 0000000..dbe12bf Binary files /dev/null and b/picture/02.png differ diff --git a/pipeline.sh b/pipeline.sh new file mode 100644 index 0000000..cf6089e --- /dev/null +++ b/pipeline.sh @@ -0,0 
+1,455 @@ +#!/bin/bash +set -euo pipefail + +echo "STEP 0 : Setup Variable, Directory, and Ensure NVIDIA Docker Supported" +### WARNING : Don't use tilde (~) in the path + +# ------------------- +## Step 0a: Set variable necessary with sample name for RG +# ------------------- +echo "STEP 0a: Set variable necessary" + +BIN_VERSION=1.5.0 +memory=60G +DEEPTRIO_MODEL=WES + +glnexus_version=1.4.1 +tiddit_version=3.6.1--py38h24c8ff8_0 + +# Father +father_name=V350145665_L04_B5EHOMdmhwXAACA-519 + +# Mother +mother_name=V350145665_L04_B5EHOMdmhwXAABA-517 + +# Proband +proband_name=V350145665_L04_B5EHOMdmhwXAAAA-515 + +proband_SM=Proband +proband_PU=AgilentV6 +proband_PL=DNBSeq +proband_LB=WXS + +exomiser_solo=V350145665_L04_B5EHOMdmhwXAAAA-515_exomiser_solo.yml +exomiser_solo_sv=V350145665_L04_B5EHOMdmhwXAAAA-515_tiddit_exomiser_solo.yml +exomiser_trio=V350145665_L04_B5EHOMdmhwXAAAA-515_exomiser_trio.yml + +mother_SM=Mother +mother_PU=AgilentV6 +mother_PL=DNBSeq +mother_LB=WXS + +father_SM=Father +father_PU=AgilentV6 +father_PL=DNBSeq +father_LB=WXS + + +# ------------------- +## Step 0b: Prepare directory and pedigree file +# ------------------- +echo "STEP 0b: Prepare directory and pedigree file" + +## Pedigree file on INPUT_DIR (Just file name only) +INPUT_DIR="/home/ivanwilliamharsono/Downloads/trio_patient/input" +PEDIGREE="trio.ped" + +## Subfolder of INPUT_DIR +FASTQ_DIR=${INPUT_DIR}/A_FASTQ +SAM_DIR=${INPUT_DIR}/B_RAW_SAM_BAM + +## Output Dir +OUTPUT_DIR="/home/ivanwilliamharsono/Downloads/trio_patient/output" +ANNOTATED_DIR="/home/ivanwilliamharsono/Downloads/trio_patient/annotated" +SV_DIR="/home/ivanwilliamharsono/Downloads/trio_patient/sv_tiddit" +EXOMISER_DIR="/home/ivanwilliamharsono/Downloads/trio_patient/exomiser" + +## FASTA file on REFERENCE_DIR (Just file name only) +REFERENCE_DIR=/home/ivanwilliamharsono/Downloads/Database +REFSEQ_FASTA=hg38.p14.fa + +snpEff_dir=/home/ivanwilliamharsono/Downloads/Sandbox/snpEff/data +snpEff_ver=GRCh38.p14 
+dbnsfp=/home/ivanwilliamharsono/Downloads/Database/dbNSFPv4.5a_custombuild.gz +dbSNP=/home/ivanwilliamharsono/Downloads/Database/Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf.gz +ClinVar=/home/ivanwilliamharsono/Downloads/Database/clinvar_20240215.vcf.gz +chr_rename=/home/ivanwilliamharsono/Downloads/IDeRare/convert/vcf_rename.chrnames + +cd /home/ivanwilliamharsono/Downloads/trio_patient + +## Make Directory +mkdir -p ${INPUT_DIR} ${FASTQ_DIR} ${SAM_DIR} ${OUTPUT_DIR} ${OUTPUT_DIR}/intermediate_results_dir_proband ${OUTPUT_DIR}/intermediate_results_dir_trio ${ANNOTATED_DIR} ${SV_DIR} ${EXOMISER_DIR} + +# # ------------------- +# # STEP 1: QC - Run fastqp +# # ------------------- +# echo "STEP 1: QC - Run fastqp - Optional (if the input is raw untrimmed fastq)" +# # echo "STEP 1a : Proband" + +# # fastp -g -x -w $(nproc) \ +# # -D --dup_calc_accuracy 6 \ +# # --in1 ${FASTQ_DIR}/${proband_name}_1.fastq \ +# # --in2 ${FASTQ_DIR}/${proband_name}_2.fastq \ +# # --out1 ${FASTQ_DIR}/${proband_name}_1.fq.gz \ +# # --out2 ${FASTQ_DIR}/${proband_name}_2.fq.gz \ +# # -h ${FASTQ_DIR}/${proband_name}.html \ +# # -j ${FASTQ_DIR}/${proband_name}.json \ +# # -R ${proband_name}-${proband_SM} + +# # echo "STEP 1b : Mother" + +# # fastp -g -x -w $(nproc) \ +# # -D --dup_calc_accuracy 6 \ +# # --in1 ${FASTQ_DIR}/${mother_name}_1.fastq \ +# # --in2 ${FASTQ_DIR}/${mother_name}_2.fastq \ +# # --out1 ${FASTQ_DIR}/${mother_name}_1.fq.gz \ +# # --out2 ${FASTQ_DIR}/${mother_name}_2.fq.gz \ +# # -h ${FASTQ_DIR}/${mother_name}.html \ +# # -j ${FASTQ_DIR}/${mother_name}.json \ +# # -R ${mother_name}-${mother_SM} + +# # echo "STEP 1C : Father" + +# # fastp -g -x -w $(nproc) \ +# # -D --dup_calc_accuracy 6 \ +# # --in1 ${FASTQ_DIR}/${father_name}_1.fastq \ +# # --in2 ${FASTQ_DIR}/${father_name}_2.fastq \ +# # --out1 ${FASTQ_DIR}/${father_name}_1.fq.gz \ +# # --out2 ${FASTQ_DIR}/${father_name}_2.fq.gz \ +# # -h ${FASTQ_DIR}/${father_name}.html \ +# # -j ${FASTQ_DIR}/${father_name}.json \ +# 
# -R ${father_name}-${father_SM} + +# # -------------------------------------- +# # STEP 2: Map to reference using BWA-MEM2 +# # -------------------------------------- +# echo "STEP 2: Map to reference using BWA-MEM2" + +# echo "STEP 2a : Proband" + +# # BWA MEM2 Alignment +# bwa-mem2 mem -R "@RG\tID:${proband_name}\tSM:${proband_SM}\tPU:${proband_PU}\tPL:${proband_PL}\tLB:${proband_LB}" \ +# -t $(nproc) ${REFERENCE_DIR}/${REFSEQ_FASTA} \ +# ${FASTQ_DIR}/${proband_name}_1.fq.gz \ +# ${FASTQ_DIR}/${proband_name}_2.fq.gz \ +# > ${SAM_DIR}/${proband_name}_raw.sam + +# echo "STEP 2b : Mother" + +# bwa-mem2 mem -R "@RG\tID:${mother_name}\tSM:${mother_SM}\tPU:${mother_PU}\tPL:${mother_PL}\tLB:${mother_LB}" \ +# -t $(nproc) ${REFERENCE_DIR}/${REFSEQ_FASTA} \ +# ${FASTQ_DIR}/${mother_name}_1.fq.gz \ +# ${FASTQ_DIR}/${mother_name}_2.fq.gz \ +# > ${SAM_DIR}/${mother_name}_raw.sam + +# echo "STEP 2c : Father" + +# bwa-mem2 mem -R "@RG\tID:${father_name}\tSM:${father_SM}\tPU:${father_PU}\tPL:${father_PL}\tLB:${father_LB}" \ +# -t $(nproc) ${REFERENCE_DIR}/${REFSEQ_FASTA} \ +# ${FASTQ_DIR}/${father_name}_1.fq.gz \ +# ${FASTQ_DIR}/${father_name}_2.fq.gz \ +# > ${SAM_DIR}/${father_name}_raw.sam + +# # ----------------------------------------- +# # STEP 3: Mark Duplicates and Sort - sambamba +# # ----------------------------------------- +# echo "STEP 3: Mark Duplicates and Sort - Sambamba" + +# # Conversion of SAM to BAM & Markdup +# echo "STEP 3a : Proband" + +# sambamba view -p -t=$(nproc) -l=9 \ +# -S ${SAM_DIR}/${proband_name}_raw.sam \ +# -f=bam -o=${SAM_DIR}/${proband_name}_raw.bam + +# sambamba markdup -r -p -t=$(nproc) -l=9 \ +# ${SAM_DIR}/${proband_name}_raw.bam \ +# ${SAM_DIR}/${proband_name}_dedup.bam + +# sambamba sort -m=${memory} -p -t=$(nproc) -l=9 \ +# ${SAM_DIR}/${proband_name}_dedup.bam \ +# -o=${INPUT_DIR}/${proband_name}.bam + +# echo "STEP 3b : Mother" + +# sambamba view -p -t=$(nproc) -l=9 \ +# -S ${SAM_DIR}/${mother_name}_raw.sam \ +# -f=bam 
-o=${SAM_DIR}/${mother_name}_raw.bam + +# sambamba markdup -r -p -t=$(nproc) -l=9 \ +# ${SAM_DIR}/${mother_name}_raw.bam \ +# ${SAM_DIR}/${mother_name}_dedup.bam + +# sambamba sort -m=${memory} -p -t=$(nproc) -l=9 \ +# ${SAM_DIR}/${mother_name}_dedup.bam \ +# -o=${INPUT_DIR}/${mother_name}.bam + +# echo "STEP 3c : Father" + +# sambamba view -p -t=$(nproc) -l=9 \ +# -S ${SAM_DIR}/${father_name}_raw.sam \ +# -f=bam -o=${SAM_DIR}/${father_name}_raw.bam + +# sambamba markdup -r -p -t=$(nproc) -l=9 \ +# ${SAM_DIR}/${father_name}_raw.bam \ +# ${SAM_DIR}/${father_name}_dedup.bam + +# sambamba sort -m=${memory} -p -t=$(nproc) -l=9 \ +# ${SAM_DIR}/${father_name}_dedup.bam \ +# -o=${INPUT_DIR}/${father_name}.bam + +# ### Remove Intermediate SAM BAM file as it consumes too much spaces + +# #### Remove all in one folder +# rm ${SAM_DIR}/*.sam ${SAM_DIR}/*.bam + +# #### Remove one by one +# rm ${SAM_DIR}/${proband_name}_raw.sam ${SAM_DIR}/${mother_name}_raw.sam ${SAM_DIR}/${father_name}_raw.sam +# rm ${SAM_DIR}/${proband_name}_raw.bam ${SAM_DIR}/${mother_name}_raw.bam ${SAM_DIR}/${father_name}_raw.bam +# rm ${SAM_DIR}/${proband_name}_dedup.bam ${SAM_DIR}/${mother_name}_dedup.bam ${SAM_DIR}/${father_name}_dedup.bam + +# # ---------------------------------------------- +# # STEP 4: Variant Calling +# # ---------------------------------------------- +# echo "STEP 4a: Variant Calling Proband DeepVariant" + +# docker run --gpus 1 \ +# -v "${INPUT_DIR}":"/input" \ +# -v "${OUTPUT_DIR}":"/output" \ +# -v "${REFERENCE_DIR}":"/reference" \ +# google/deepvariant:"${BIN_VERSION}-gpu" \ +# /opt/deepvariant/bin/run_deepvariant \ +# --model_type ${DEEPTRIO_MODEL} \ +# --ref /reference/${REFSEQ_FASTA} \ +# --reads /input/${proband_name}.bam \ +# --num_shards $(nproc) \ +# --intermediate_results_dir /output/intermediate_results_dir_proband \ +# --output_gvcf /output/${proband_name}_proband.g.vcf.gz \ +# --output_vcf /output/${proband_name}_proband.vcf.gz + +# ## Remove Intermediate_results_dir 
to save spaces +# rm -r -f ${OUTPUT_DIR}/intermediate_results_dir_proband + +# ## Stop Docker to save spaces and memory +# docker stop $(docker ps -aq) +# docker rm $(docker ps -aq) + +# echo "STEP 4b: Variant Calling DeepTrio" + +# docker run --gpus 1 \ +# -v "${INPUT_DIR}":"/input" \ +# -v "${OUTPUT_DIR}":"/output" \ +# -v "${REFERENCE_DIR}":"/reference" \ +# google/deepvariant:deeptrio-"${BIN_VERSION}-gpu" \ +# /opt/deepvariant/bin/deeptrio/run_deeptrio \ +# --model_type ${DEEPTRIO_MODEL} \ +# --ref /reference/${REFSEQ_FASTA} \ +# --reads_child /input/${proband_name}.bam \ +# --reads_parent1 /input/${father_name}.bam \ +# --reads_parent2 /input/${mother_name}.bam \ +# --output_vcf_child /output/${proband_name}.output.vcf.gz \ +# --output_vcf_parent1 /output/${father_name}.output.vcf.gz \ +# --output_vcf_parent2 /output/${mother_name}.output.vcf.gz \ +# --sample_name_child 'child' \ +# --sample_name_parent1 'father' \ +# --sample_name_parent2 'mother' \ +# --num_shards $(nproc) \ +# --intermediate_results_dir /output/intermediate_results_dir_trio \ +# --output_gvcf_child /output/${proband_name}.g.vcf.gz \ +# --output_gvcf_parent1 /output/${father_name}.g.vcf.gz \ +# --output_gvcf_parent2 /output/${mother_name}.g.vcf.gz + +# ## Remove Intermediate_results_dir to save spaces +# rm -r -f ${OUTPUT_DIR}/intermediate_results_dir_trio + +# ## Stop Docker to save spaces and memory +# docker stop $(docker ps -aq) +# docker rm $(docker ps -aq) + +# # ---------------------------------------------- +# # STEP 5: Merge gVCF files with GLnexus +# # ---------------------------------------------- +# echo "STEP 5: Merge gVCF files with GLnexus" + +# docker run \ +# -v "${OUTPUT_DIR}":"/output" \ +# ghcr.io/dnanexus-rnd/glnexus:v${glnexus_version} \ +# /usr/local/bin/glnexus_cli \ +# --config DeepVariant_unfiltered \ +# /output/${proband_name}.g.vcf.gz \ +# /output/${father_name}.g.vcf.gz \ +# /output/${mother_name}.g.vcf.gz \ +# | bcftools view -Oz -o 
${OUTPUT_DIR}/${proband_name}_trio_merged.vcf.gz + +# ## Stop Docker to save spaces and memory +# docker stop $(docker ps -aq) +# docker rm $(docker ps -aq) + +# # ---------------------------------------------- +# # STEP 6: Calculate Mendelian Violation Rate using RTG Tools +# # ---------------------------------------------- +# echo "STEP 6: Calculate Mendelian Violation Rate using RTG Tools" +# #### Reference : https://www.animalgenome.org/bioinfo/resources/manuals/RTGOperationsManual.pdf + +# if [ -d ${REFERENCE_DIR}/${REFSEQ_FASTA}.sdf ]; +# then +# echo "${REFERENCE_DIR}/${REFSEQ_FASTA}.sdf exists. Removing directory." +# rm -r -f ${REFERENCE_DIR}/${REFSEQ_FASTA}.sdf +# else +# echo "${REFERENCE_DIR}/${REFSEQ_FASTA}.sdf directory does not exist. Continue" +# fi + +# docker run \ +# -v "${INPUT_DIR}":"/input" \ +# -v "${REFERENCE_DIR}":"/reference" \ +# realtimegenomics/rtg-tools format \ +# -o /reference/${REFSEQ_FASTA}.sdf "/reference/${REFSEQ_FASTA}" + +# docker run \ +# -v "${INPUT_DIR}":"/input" \ +# -v "${REFERENCE_DIR}":"/reference" \ +# -v "${OUTPUT_DIR}":"/output" \ +# realtimegenomics/rtg-tools vcfstats \ +# "/output/${proband_name}_trio_merged.vcf.gz" \ +# | tee output/deepvariant.${proband_name}_trio.vcfstats.txt + +# docker run \ +# -v "${INPUT_DIR}":"/input" \ +# -v "${REFERENCE_DIR}":"/reference" \ +# -v "${OUTPUT_DIR}":"/output" \ +# realtimegenomics/rtg-tools mendelian \ +# -i "/output/${proband_name}_trio_merged.vcf.gz" \ +# -o "/output/${proband_name}_trio_annotated.output.vcf.gz" \ +# --pedigree=/input/trio.ped \ +# -t /reference/${REFSEQ_FASTA}.sdf \ +# | tee output/deepvariant.${proband_name}_trio.mendelian.txt + +# # ------------------- +# # STEP 6b: Docker Stop and Kill All docker Container Process +# # ------------------- +# docker stop $(docker ps -aq) +# docker rm $(docker ps -aq) + +# # ## Jump to direct to annotation session + +# # ------------------- +# # STEP 7: Annotate Variants - SnpEff SnpSift +# # ------------------- +# echo 
"STEP 7a: Annotate Variants Proband - SNPEff with latest database, SnpSift ClinVar, SnpSift dbNSFP" + +# # Create GATK > dbnsfp Chromosome +# bcftools annotate \ +# --rename-chrs ${chr_rename} \ +# --threads nproc -Oz \ +# -o ${OUTPUT_DIR}/${proband_name}-converted-deepVariant.vcf \ +# ${OUTPUT_DIR}/${proband_name}_proband.vcf.gz + +# # SnpEff with recent GRCh38.p14 database +# SnpEff -v ${snpEff_ver} -dataDir ${snpEff_dir} \ +# -s ${ANNOTATED_DIR}/${proband_name}-SnpEff-deepVariant.html \ +# ${OUTPUT_DIR}/${proband_name}-converted-deepVariant.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-SnpEff-deepVariant.vcf + +# # SnpSift Annotate ClinVar +# SnpSift annotate -v ${dbSNP} \ +# ${ANNOTATED_DIR}/${proband_name}-SnpEff-deepVariant.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-deepVariant.vcf + +# SnpSift annotate -v ${ClinVar} \ +# ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-deepVariant.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-deepVariant.vcf + +# # Annotate using dbNSFP for SNP Only (Indel Give 0 Annotation Result) +# SnpSift dbnsfp -v -db ${dbnsfp} \ +# ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-deepVariant.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-dbNSFP_annotated-deepVariant.vcf + +# # Final SnpEff for known vs unknown dbSNP +# SnpEff -v ${snpEff_ver} -dataDir ${snpEff_dir} \ +# -s ${ANNOTATED_DIR}/${proband_name}-2ndSnpEff-dbSNP-ClinVar-deepVariant.html \ +# ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-deepVariant.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-2ndSnpEff-dbSNP-ClinVar-deepVariant.vcf + +# # Compress file size using bgzip +# bgzip --threads nproc ${OUTPUT_DIR}/${proband_name}-converted-deepVariant.vcf +# bgzip --threads nproc ${ANNOTATED_DIR}/${proband_name}-SnpEff-deepVariant.vcf +# bgzip --threads nproc ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-deepVariant.vcf +# bgzip --threads nproc ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-deepVariant.vcf +# bgzip 
--threads nproc ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-dbNSFP_annotated-deepVariant.vcf + + +# echo "STEP 7b: Annotate Variants Trio - SNP Sift" + +# # Create GATK > dbnsfp Chromosome +# bcftools annotate \ +# --rename-chrs ${chr_rename} \ +# --threads nproc -Oz \ +# -o ${OUTPUT_DIR}/${proband_name}-converted-deepTrio.vcf \ +# ${OUTPUT_DIR}/${proband_name}_trio_annotated.output.vcf.gz + +# # SnpEff with recent GRCh38.p14 database +# SnpEff -v ${snpEff_ver} -dataDir ${snpEff_dir} \ +# -s ${ANNOTATED_DIR}/${proband_name}-SnpEff-deepTrio.html \ +# ${OUTPUT_DIR}/${proband_name}-converted-deepTrio.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-SnpEff-deepTrio.vcf + +# # SnpSift Annotate dbSNP & ClinVar +# SnpSift annotate -v ${dbSNP} \ +# ${ANNOTATED_DIR}/${proband_name}-SnpEff-deepTrio.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-deepTrio.vcf + +# SnpSift annotate -v ${ClinVar} \ +# ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-deepTrio.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-deepTrio.vcf + +# # Annotate using dbNSFP for SNP Only (Indel Give 0 Annotation Result) +# SnpSift dbnsfp -v -db ${dbnsfp} \ +# ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-deepTrio.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-dbNSFP_annotated-deepTrio.vcf + +# # Final SnpEff for known vs unknown dbSNP +# SnpEff -v ${snpEff_ver} -dataDir ${snpEff_dir} \ +# -s ${ANNOTATED_DIR}/${proband_name}-2ndSnpEff-dbSNP-ClinVar-deepTrio.html \ +# ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-deepTrio.vcf \ +# > ${ANNOTATED_DIR}/${proband_name}-2ndSnpEff-dbSNP-ClinVar-deepTrio.vcf + +# # Compress file size using bgzip +# bgzip --threads nproc ${OUTPUT_DIR}/${proband_name}-converted-deepTrio.vcf +# bgzip --threads nproc ${ANNOTATED_DIR}/${proband_name}-SnpEff-deepTrio.vcf +# bgzip --threads nproc ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-deepTrio.vcf +# bgzip --threads nproc 
${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-deepTrio.vcf +# bgzip --threads nproc ${ANNOTATED_DIR}/${proband_name}-SnpEff-dbSNP-ClinVar-dbNSFP_annotated-deepTrio.vcf + + +# echo "STEP 8: SV calling - Tiddit" +# # Additional SV Step using Tiddit @ SV_DIR + +# docker run \ +# -v "${INPUT_DIR}":"/input" \ +# -v "${SV_DIR}":"/output" \ +# -v "${REFERENCE_DIR}":"/reference" \ +# quay.io/biocontainers/tiddit:${tiddit_version} tiddit \ +# --sv \ +# --ref /reference/${REFSEQ_FASTA} \ +# --bam /input/${proband_name}.bam \ +# --skip_assembly \ +# --threads $(nproc) \ +# -o /output/output + +# # Stop Docker +# docker stop $(docker ps -aq) +# docker rm $(docker ps -aq) + +# grep -E "#|PASS" ${SV_DIR}/output.vcf > ${SV_DIR}/output.filtered.vcf + +# SnpEff -v ${snpEff_ver} -dataDir ${snpEff_dir} \ +# ${SV_DIR}/output.filtered.vcf > ${SV_DIR}/output.filtered.snpeff.vcf + +# SnpSift dbnsfp -v -db ${dbnsfp} \ +# ${SV_DIR}/output.filtered.snpeff.vcf > ${SV_DIR}/output.filtered.dbnsfp.vcf + +echo "STEP 9: Exomiser Analysis" +# Step using Exomiser @ Exomiser root folder +exomiser --analysis ${exomiser_solo} +exomiser --analysis ${exomiser_trio} +exomiser --analysis ${exomiser_solo_sv} \ No newline at end of file diff --git a/templates/template_application.properties b/templates/template_application.properties new file mode 100644 index 0000000..02755bd --- /dev/null +++ b/templates/template_application.properties @@ -0,0 +1,3 @@ +exomiser.data-directory={{exomiser_dir}} +exomiser.hg38.data-version={{exomiser_data_ver}} +exomiser.phenotype.data-version={{exomiser_data_ver}} diff --git a/templates/template_exomiser.yml b/templates/template_exomiser.yml new file mode 100644 index 0000000..e5a8a90 --- /dev/null +++ b/templates/template_exomiser.yml @@ -0,0 +1,127 @@ +## Exomiser Analysis Template for multi-sample VCF files +# These are all the possible options for running exomiser. Use this as a template for +# your own set-up. 
+analysis: + # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt. + genomeAssembly: hg38 + vcf: + ped: + proband: + hpoIds: + # These are the default settings, with values representing the maximum minor allele frequency in percent (%) permitted for an + # allele to be considered as a causative candidate under that mode of inheritance. + # If you just want to analyse a sample under a single inheritance mode, delete/comment-out the others. For AUTOSOMAL_RECESSIVE + # or X_RECESSIVE ensure *both* relevant HOM_ALT and COMP_HET modes are present. + # In cases where you do not want any cut-offs applied an empty map should be used e.g. inheritanceModes: {} + inheritanceModes: { + AUTOSOMAL_DOMINANT: 0.1, + AUTOSOMAL_RECESSIVE_HOM_ALT: 0.1, + AUTOSOMAL_RECESSIVE_COMP_HET: 2.0, + X_DOMINANT: 0.1, + X_RECESSIVE_HOM_ALT: 0.1, + X_RECESSIVE_COMP_HET: 2.0, + MITOCHONDRIAL: 0.2 + } + #FULL or PASS_ONLY + analysisMode: PASS_ONLY + #Possible frequencySources: + #Thousand Genomes project - http://www.1000genomes.org/ (THOUSAND_GENOMES) + #TOPMed - https://www.nhlbi.nih.gov/science/precision-medicine-activities (TOPMED) + #UK10K - http://www.uk10k.org/ (UK10K) + #ESP project - http://evs.gs.washington.edu/EVS/ (ESP_) + # ESP_AFRICAN_AMERICAN, ESP_EUROPEAN_AMERICAN, ESP_ALL, + #ExAC project http://exac.broadinstitute.org/about (EXAC_) + # EXAC_AFRICAN_INC_AFRICAN_AMERICAN, EXAC_AMERICAN, + # EXAC_SOUTH_ASIAN, EXAC_EAST_ASIAN, + # EXAC_FINNISH, EXAC_NON_FINNISH_EUROPEAN, + # EXAC_OTHER + #gnomAD - http://gnomad.broadinstitute.org/ (GNOMAD_E, GNOMAD_G) + frequencySources: [ + THOUSAND_GENOMES, + TOPMED, + UK10K, + + ESP_AFRICAN_AMERICAN, ESP_EUROPEAN_AMERICAN, ESP_ALL, + + EXAC_AFRICAN_INC_AFRICAN_AMERICAN, EXAC_AMERICAN, + EXAC_SOUTH_ASIAN, EXAC_EAST_ASIAN, + EXAC_FINNISH, EXAC_NON_FINNISH_EUROPEAN, + EXAC_OTHER, + + GNOMAD_E_AFR, + GNOMAD_E_AMR, +# GNOMAD_E_ASJ, + GNOMAD_E_EAS, + GNOMAD_E_FIN, + GNOMAD_E_NFE, + 
GNOMAD_E_OTH, + GNOMAD_E_SAS, + + GNOMAD_G_AFR, + GNOMAD_G_AMR, + # GNOMAD_G_ASJ, + GNOMAD_G_EAS, + GNOMAD_G_FIN, + GNOMAD_G_NFE, + GNOMAD_G_OTH, + GNOMAD_G_SAS + ] + # Possible pathogenicitySources: (POLYPHEN, MUTATION_TASTER, SIFT), (REVEL, MVP), CADD, REMM + # REMM is trained on non-coding regulatory regions + # *WARNING* if you enable CADD or REMM ensure that you have downloaded and installed the CADD/REMM tabix files + # and updated their location in the application.properties. Exomiser will not run without this. + pathogenicitySources: [ REVEL, MVP ] + # this is the standard exomiser order. + # all steps are optional + steps: [ + #intervalFilter: {interval: 'chr10:123256200-123256300'}, + # or for multiple intervals: + #intervalFilter: {intervals: ['chr10:123256200-123256300', 'chr10:123256290-123256350']}, + # or using a BED file - NOTE this should be 0-based, Exomiser otherwise uses 1-based coordinates in line with VCF + #intervalFilter: {bed: /full/path/to/bed_file.bed}, + #genePanelFilter: {geneSymbols: ['FGFR1','FGFR2']}, + failedVariantFilter: { }, + #qualityFilter: {minQuality: 50.0}, + variantEffectFilter: { + remove: [ + FIVE_PRIME_UTR_EXON_VARIANT, + FIVE_PRIME_UTR_INTRON_VARIANT, + THREE_PRIME_UTR_EXON_VARIANT, + THREE_PRIME_UTR_INTRON_VARIANT, + NON_CODING_TRANSCRIPT_EXON_VARIANT, + UPSTREAM_GENE_VARIANT, + INTERGENIC_VARIANT, + REGULATORY_REGION_VARIANT, + CODING_TRANSCRIPT_INTRON_VARIANT, + NON_CODING_TRANSCRIPT_INTRON_VARIANT, + DOWNSTREAM_GENE_VARIANT + ] + }, + #knownVariantFilter: {}, #removes variants represented in the database + frequencyFilter: {maxFrequency: 2.0}, + pathogenicityFilter: {keepNonPathogenic: true}, + #inheritanceFilter and omimPrioritiser should always run AFTER all other filters have completed + #they will analyse genes according to the specified modeOfInheritance above- UNDEFINED will not be analysed. + inheritanceFilter: {}, + #omimPrioritiser isn't mandatory. 
+ omimPrioritiser: {}, + #priorityScoreFilter: {minPriorityScore: 0.4}, + #Other prioritisers: Only combine omimPrioritiser with one of these. + #Don't include any if you only want to filter the variants. + hiPhivePrioritiser: {}, + # or run hiPhive in benchmarking mode: + #hiPhivePrioritiser: {runParams: 'mouse'}, + # phivePrioritiser: {} + phenixPrioritiser: {} + #exomeWalkerPrioritiser: {seedGeneIds: [11111, 22222, 33333]} + ] +outputOptions: + outputContributingVariantsOnly: true + #numGenes options: 0 = all or specify a limit e.g. 500 for the first 500 results + numGenes: 50 + # Path to the desired output directory. Will default to the 'results' subdirectory of the exomiser install directory + outputDirectory: + # Filename for the output files. Will default to {input-vcf-filename}-exomiser + outputFileName: + #out-format options: HTML, JSON, TSV_GENE, TSV_VARIANT, VCF (default: HTML) + outputFormats: [HTML, JSON, TSV_GENE, TSV_VARIANT, VCF] \ No newline at end of file diff --git a/templates/template_pipeline.sh b/templates/template_pipeline.sh new file mode 100644 index 0000000..d6153b8 --- /dev/null +++ b/templates/template_pipeline.sh @@ -0,0 +1,455 @@ +#!/bin/bash +set -euo pipefail + +echo "STEP 0 : Setup Variable, Directory, and Ensure NVIDIA Docker Supported" +### WARNING : Don't use tilde (~) in the path + +# ------------------- +## Step 0a: Set variable necessary with sample name for RG +# ------------------- +echo "STEP 0a: Set variable necessary" + +BIN_VERSION={{dv_version}} +memory={{max_mem}} +DEEPTRIO_MODEL={{dv_model}} + +glnexus_version={{glnexus_version}} +tiddit_version={{tiddit_version}} + +# Father +father_name={{father}} + +# Mother +mother_name={{mother}} + +# Proband +proband_name={{proband}} + +proband_SM=Proband +proband_PU={{library}} +proband_PL={{method}} +proband_LB=WXS + +exomiser_solo={{proband}}_exomiser_solo.yml +exomiser_solo_sv={{proband}}_tiddit_exomiser_solo.yml +exomiser_trio={{proband}}_exomiser_trio.yml + 
mother_SM=Mother
mother_PU={{library}}
mother_PL={{method}}
mother_LB=WXS

father_SM=Father
father_PU={{library}}
father_PL={{method}}
father_LB=WXS

# Thread count handed to every multi-threaded tool below; computed once.
THREADS=$(nproc)

# -------------------
## Step 0b: Prepare directory and pedigree file
# -------------------
echo "STEP 0b: Prepare directory and pedigree file"

## Pedigree file on INPUT_DIR (Just file name only)
INPUT_DIR="{{data_dir}}/input"
PEDIGREE="trio.ped"

## Subfolder of INPUT_DIR
FASTQ_DIR=${INPUT_DIR}/A_FASTQ
SAM_DIR=${INPUT_DIR}/B_RAW_SAM_BAM

## Output Dir
OUTPUT_DIR="{{data_dir}}/output"
ANNOTATED_DIR="{{data_dir}}/annotated"
SV_DIR="{{data_dir}}/sv_tiddit"
EXOMISER_DIR="{{data_dir}}/exomiser"

## FASTA file on REFERENCE_DIR (Just file name only)
REFERENCE_DIR={{ref_dir}}
REFSEQ_FASTA={{ref_fasta}}

snpEff_dir={{snpEff_dir}}
snpEff_ver={{snpEff_ver}}
dbnsfp={{dbNSFP_file}}
dbSNP={{dbSNP_file}}
ClinVar={{ClinVar_file}}
chr_rename={{chr_rename}}

cd "{{data_dir}}"

## Make Directory
mkdir -p -- \
  "${INPUT_DIR}" "${FASTQ_DIR}" "${SAM_DIR}" \
  "${OUTPUT_DIR}" \
  "${OUTPUT_DIR}/intermediate_results_dir_proband" \
  "${OUTPUT_DIR}/intermediate_results_dir_trio" \
  "${ANNOTATED_DIR}" "${SV_DIR}" "${EXOMISER_DIR}"

# -------------------
# STEP 1: QC - Run fastp
# -------------------
echo "STEP 1: QC - Run fastqp - Optional (if the input is raw untrimmed fastq)"
# echo "STEP 1a : Proband"

# fastp -g -x -w $(nproc) \
#   -D --dup_calc_accuracy 6 \
#   --in1 ${FASTQ_DIR}/${proband_name}_1.fastq \
#   --in2 ${FASTQ_DIR}/${proband_name}_2.fastq \
#   --out1 ${FASTQ_DIR}/${proband_name}_1.fq.gz \
#   --out2 ${FASTQ_DIR}/${proband_name}_2.fq.gz \
#   -h ${FASTQ_DIR}/${proband_name}.html \
#   -j ${FASTQ_DIR}/${proband_name}.json \
#   -R ${proband_name}-${proband_SM}

# echo "STEP 1b : Mother"

# fastp -g -x -w $(nproc) \
#   -D --dup_calc_accuracy 6 \
#   --in1 ${FASTQ_DIR}/${mother_name}_1.fastq \
#   --in2 ${FASTQ_DIR}/${mother_name}_2.fastq \
#   --out1 ${FASTQ_DIR}/${mother_name}_1.fq.gz \
#   --out2 ${FASTQ_DIR}/${mother_name}_2.fq.gz \
#   -h ${FASTQ_DIR}/${mother_name}.html \
#   -j ${FASTQ_DIR}/${mother_name}.json \
#   -R ${mother_name}-${mother_SM}

# echo "STEP 1C : Father"

# fastp -g -x -w $(nproc) \
#   -D --dup_calc_accuracy 6 \
#   --in1 ${FASTQ_DIR}/${father_name}_1.fastq \
#   --in2 ${FASTQ_DIR}/${father_name}_2.fastq \
#   --out1 ${FASTQ_DIR}/${father_name}_1.fq.gz \
#   --out2 ${FASTQ_DIR}/${father_name}_2.fq.gz \
#   -h ${FASTQ_DIR}/${father_name}.html \
#   -j ${FASTQ_DIR}/${father_name}.json \
#   -R ${father_name}-${father_SM}

# --------------------------------------
# STEP 2: Map to reference using BWA-MEM2
# --------------------------------------
echo "STEP 2: Map to reference using BWA-MEM2"

#######################################
# Align one sample's paired FASTQ files with bwa-mem2.
# Globals:   THREADS, REFERENCE_DIR, REFSEQ_FASTA, FASTQ_DIR, SAM_DIR (read)
# Arguments: $1 sample name (also the read-group ID), $2 SM, $3 PU,
#            $4 PL, $5 LB read-group fields
# Outputs:   writes ${SAM_DIR}/<name>_raw.sam
#######################################
align_sample() {
  local name=$1 sm=$2 pu=$3 pl=$4 lb=$5

  # The \t sequences stay literal here; they are passed through to bwa-mem2.
  bwa-mem2 mem \
    -R "@RG\tID:${name}\tSM:${sm}\tPU:${pu}\tPL:${pl}\tLB:${lb}" \
    -t "${THREADS}" "${REFERENCE_DIR}/${REFSEQ_FASTA}" \
    "${FASTQ_DIR}/${name}_1.fq.gz" \
    "${FASTQ_DIR}/${name}_2.fq.gz" \
    > "${SAM_DIR}/${name}_raw.sam"
}

echo "STEP 2a : Proband"
align_sample "${proband_name}" "${proband_SM}" "${proband_PU}" \
  "${proband_PL}" "${proband_LB}"

echo "STEP 2b : Mother"
align_sample "${mother_name}" "${mother_SM}" "${mother_PU}" \
  "${mother_PL}" "${mother_LB}"

echo "STEP 2c : Father"
align_sample "${father_name}" "${father_SM}" "${father_PU}" \
  "${father_PL}" "${father_LB}"

# -----------------------------------------
# STEP 3: Mark Duplicates and Sort - sambamba
# -----------------------------------------
echo "STEP 3: Mark Duplicates and Sort - Sambamba"

#######################################
# Convert a sample's raw SAM to BAM, remove duplicates, then sort.
# Globals:   THREADS, memory, SAM_DIR, INPUT_DIR (read)
# Arguments: $1 sample name
# Outputs:   writes ${INPUT_DIR}/<name>.bam; leaves <name>_raw.bam and
#            <name>_dedup.bam in SAM_DIR until the cleanup below
#######################################
dedup_and_sort_sample() {
  local name=$1

  sambamba view -p -t="${THREADS}" -l=9 \
    -S "${SAM_DIR}/${name}_raw.sam" \
    -f=bam -o="${SAM_DIR}/${name}_raw.bam"

  sambamba markdup -r -p -t="${THREADS}" -l=9 \
    "${SAM_DIR}/${name}_raw.bam" \
    "${SAM_DIR}/${name}_dedup.bam"

  sambamba sort -m="${memory}" -p -t="${THREADS}" -l=9 \
    "${SAM_DIR}/${name}_dedup.bam" \
    -o="${INPUT_DIR}/${name}.bam"
}

echo "STEP 3a : Proband"
dedup_and_sort_sample "${proband_name}"

echo "STEP 3b : Mother"
dedup_and_sort_sample "${mother_name}"

echo "STEP 3c : Father"
dedup_and_sort_sample "${father_name}"

### Remove intermediate SAM/BAM files as they consume too much space.
# Only the files created above are removed, and -f keeps an already-missing
# file from aborting the run under `set -e`.
for sample in "${proband_name}" "${mother_name}" "${father_name}"; do
  rm -f -- \
    "${SAM_DIR:?}/${sample}_raw.sam" \
    "${SAM_DIR:?}/${sample}_raw.bam" \
    "${SAM_DIR:?}/${sample}_dedup.bam"
done

# ----------------------------------------------
# STEP 4: Variant Calling
# ----------------------------------------------
echo "STEP 4a: Variant Calling Proband DeepVariant"

# --rm removes the container as soon as it exits, so no host-wide
# `docker stop` / `docker rm` sweep is needed afterwards.
docker run --rm --gpus 1 \
  -v "${INPUT_DIR}":"/input" \
  -v "${OUTPUT_DIR}":"/output" \
  -v "${REFERENCE_DIR}":"/reference" \
  google/deepvariant:"${BIN_VERSION}-gpu" \
  /opt/deepvariant/bin/run_deepvariant \
  --model_type "${DEEPTRIO_MODEL}" \
  --ref "/reference/${REFSEQ_FASTA}" \
  --reads "/input/${proband_name}.bam" \
  --num_shards "${THREADS}" \
  --intermediate_results_dir /output/intermediate_results_dir_proband \
  --output_gvcf "/output/${proband_name}_proband.g.vcf.gz" \
  --output_vcf "/output/${proband_name}_proband.vcf.gz"

## Remove intermediate_results_dir to save space
rm -rf -- "${OUTPUT_DIR:?}/intermediate_results_dir_proband"

echo "STEP 4b: Variant Calling DeepTrio"

# Sample names child/father/mother are the IDs the pedigree template uses.
docker run --rm --gpus 1 \
  -v "${INPUT_DIR}":"/input" \
  -v "${OUTPUT_DIR}":"/output" \
  -v "${REFERENCE_DIR}":"/reference" \
  google/deepvariant:deeptrio-"${BIN_VERSION}-gpu" \
  /opt/deepvariant/bin/deeptrio/run_deeptrio \
  --model_type "${DEEPTRIO_MODEL}" \
  --ref "/reference/${REFSEQ_FASTA}" \
  --reads_child "/input/${proband_name}.bam" \
  --reads_parent1 "/input/${father_name}.bam" \
  --reads_parent2 "/input/${mother_name}.bam" \
  --output_vcf_child "/output/${proband_name}.output.vcf.gz" \
  --output_vcf_parent1 "/output/${father_name}.output.vcf.gz" \
  --output_vcf_parent2 "/output/${mother_name}.output.vcf.gz" \
  --sample_name_child 'child' \
  --sample_name_parent1 'father' \
  --sample_name_parent2 'mother' \
  --num_shards "${THREADS}" \
  --intermediate_results_dir /output/intermediate_results_dir_trio \
  --output_gvcf_child "/output/${proband_name}.g.vcf.gz" \
  --output_gvcf_parent1 "/output/${father_name}.g.vcf.gz" \
  --output_gvcf_parent2 "/output/${mother_name}.g.vcf.gz"

## Remove intermediate_results_dir to save space
rm -rf -- "${OUTPUT_DIR:?}/intermediate_results_dir_trio"

# ----------------------------------------------
# STEP 5: Merge gVCF files with GLnexus
# ----------------------------------------------
echo "STEP 5: Merge gVCF files with GLnexus"

# glnexus_cli writes the merged calls to stdout; bcftools on the host
# re-encodes that stream as a bgzipped VCF.
docker run --rm \
  -v "${OUTPUT_DIR}":"/output" \
  "ghcr.io/dnanexus-rnd/glnexus:v${glnexus_version}" \
  /usr/local/bin/glnexus_cli \
  --config DeepVariant_unfiltered \
  "/output/${proband_name}.g.vcf.gz" \
  "/output/${father_name}.g.vcf.gz" \
  "/output/${mother_name}.g.vcf.gz" \
  | bcftools view -Oz -o "${OUTPUT_DIR}/${proband_name}_trio_merged.vcf.gz"

# ----------------------------------------------
# STEP 6: Calculate Mendelian Violation Rate using RTG Tools
# ----------------------------------------------
echo "STEP 6: Calculate Mendelian Violation Rate using RTG Tools"
#### Reference : https://www.animalgenome.org/bioinfo/resources/manuals/RTGOperationsManual.pdf

# The SDF is rebuilt on every run, so any previous copy is removed first.
# NOTE(review): presumably `rtg format` refuses an existing output dir - confirm.
sdf_dir="${REFERENCE_DIR}/${REFSEQ_FASTA}.sdf"
if [[ -d "${sdf_dir}" ]]; then
  echo "${sdf_dir} exists. Removing directory."
  rm -rf -- "${sdf_dir:?}"
else
  echo "${sdf_dir} directory does not exist. Continue"
fi

docker run --rm \
  -v "${INPUT_DIR}":"/input" \
  -v "${REFERENCE_DIR}":"/reference" \
  realtimegenomics/rtg-tools format \
  -o "/reference/${REFSEQ_FASTA}.sdf" "/reference/${REFSEQ_FASTA}"

docker run --rm \
  -v "${INPUT_DIR}":"/input" \
  -v "${REFERENCE_DIR}":"/reference" \
  -v "${OUTPUT_DIR}":"/output" \
  realtimegenomics/rtg-tools vcfstats \
  "/output/${proband_name}_trio_merged.vcf.gz" \
  | tee "${OUTPUT_DIR}/deepvariant.${proband_name}_trio.vcfstats.txt"

# The pedigree file is read from INPUT_DIR under the name held in PEDIGREE.
docker run --rm \
  -v "${INPUT_DIR}":"/input" \
  -v "${REFERENCE_DIR}":"/reference" \
  -v "${OUTPUT_DIR}":"/output" \
  realtimegenomics/rtg-tools mendelian \
  -i "/output/${proband_name}_trio_merged.vcf.gz" \
  -o "/output/${proband_name}_trio_annotated.output.vcf.gz" \
  --pedigree="/input/${PEDIGREE}" \
  -t "/reference/${REFSEQ_FASTA}.sdf" \
  | tee "${OUTPUT_DIR}/deepvariant.${proband_name}_trio.mendelian.txt"

# ## Jump to direct to annotation session

# -------------------
# STEP 7: Annotate Variants - SnpEff SnpSift
# -------------------

#######################################
# Rename chromosomes, then annotate a VCF with SnpEff, dbSNP, ClinVar and
# dbNSFP, and bgzip the intermediate VCFs.
# Globals:   THREADS, OUTPUT_DIR, ANNOTATED_DIR, proband_name, chr_rename,
#            snpEff_ver, snpEff_dir, dbSNP, ClinVar, dbnsfp (read)
# Arguments: $1 input VCF (bgzipped), $2 label used in output file names
#            (deepVariant or deepTrio)
# Outputs:   ${OUTPUT_DIR}/<proband>-converted-<label>.vcf.gz and the
#            ${ANNOTATED_DIR}/<proband>-*-<label> VCF/HTML files; the
#            2ndSnpEff VCF is left uncompressed, as before
#######################################
annotate_vcf() {
  local input_vcf=$1 label=$2
  local converted="${OUTPUT_DIR}/${proband_name}-converted-${label}.vcf"
  local prefix="${ANNOTATED_DIR}/${proband_name}"
  local vcf

  # Create GATK > dbnsfp Chromosome.
  # -Ov writes plain VCF: the file is named .vcf, is read by SnpEff next and
  # is only compressed by bgzip at the end of this function.
  bcftools annotate \
    --rename-chrs "${chr_rename}" \
    --threads "${THREADS}" -Ov \
    -o "${converted}" \
    "${input_vcf}"

  # SnpEff with recent GRCh38.p14 database
  SnpEff -v "${snpEff_ver}" -dataDir "${snpEff_dir}" \
    -s "${prefix}-SnpEff-${label}.html" \
    "${converted}" \
    > "${prefix}-SnpEff-${label}.vcf"

  # SnpSift annotate with dbSNP, then ClinVar
  SnpSift annotate -v "${dbSNP}" \
    "${prefix}-SnpEff-${label}.vcf" \
    > "${prefix}-SnpEff-dbSNP-${label}.vcf"

  SnpSift annotate -v "${ClinVar}" \
    "${prefix}-SnpEff-dbSNP-${label}.vcf" \
    > "${prefix}-SnpEff-dbSNP-ClinVar-${label}.vcf"

  # Annotate using dbNSFP for SNP Only (Indel Give 0 Annotation Result)
  SnpSift dbnsfp -v -db "${dbnsfp}" \
    "${prefix}-SnpEff-dbSNP-ClinVar-${label}.vcf" \
    > "${prefix}-SnpEff-dbSNP-ClinVar-dbNSFP_annotated-${label}.vcf"

  # Final SnpEff for known vs unknown dbSNP
  SnpEff -v "${snpEff_ver}" -dataDir "${snpEff_dir}" \
    -s "${prefix}-2ndSnpEff-dbSNP-ClinVar-${label}.html" \
    "${prefix}-SnpEff-dbSNP-ClinVar-${label}.vcf" \
    > "${prefix}-2ndSnpEff-dbSNP-ClinVar-${label}.vcf"

  # Compress file size using bgzip (-f so a re-run can overwrite old .gz files)
  for vcf in \
    "${converted}" \
    "${prefix}-SnpEff-${label}.vcf" \
    "${prefix}-SnpEff-dbSNP-${label}.vcf" \
    "${prefix}-SnpEff-dbSNP-ClinVar-${label}.vcf" \
    "${prefix}-SnpEff-dbSNP-ClinVar-dbNSFP_annotated-${label}.vcf"; do
    bgzip -f --threads "${THREADS}" "${vcf}"
  done
}

echo "STEP 7a: Annotate Variants Proband - SNPEff with latest database, SnpSift ClinVar, SnpSift dbNSFP"
annotate_vcf "${OUTPUT_DIR}/${proband_name}_proband.vcf.gz" deepVariant

echo "STEP 7b: Annotate Variants Trio - SNP Sift"
annotate_vcf "${OUTPUT_DIR}/${proband_name}_trio_annotated.output.vcf.gz" deepTrio


# echo "STEP 8: SV calling - Tiddit"
# # Additional SV Step using Tiddit @ SV_DIR

# docker run \
# -v "${INPUT_DIR}":"/input" \
# -v "${SV_DIR}":"/output" \
# -v "${REFERENCE_DIR}":"/reference" \
# quay.io/biocontainers/tiddit:${tiddit_version} tiddit \
# --sv \
# --ref /reference/${REFSEQ_FASTA} \
# --bam /input/${proband_name}.bam \
# --skip_assembly \
# --threads $(nproc) \
# -o /output/output

# # Stop Docker
# docker stop $(docker ps -aq)
# docker rm $(docker ps -aq)

# grep -E "#|PASS" ${SV_DIR}/output.vcf > ${SV_DIR}/output.filtered.vcf

# SnpEff -v ${snpEff_ver} -dataDir ${snpEff_dir} \
# ${SV_DIR}/output.filtered.vcf > ${SV_DIR}/output.filtered.snpeff.vcf

# SnpSift dbnsfp -v -db ${dbnsfp} \
# ${SV_DIR}/output.filtered.snpeff.vcf > 
${SV_DIR}/output.filtered.dbnsfp.vcf + +echo "STEP 9: Exomiser Analysis" +# Step using Exomiser @ Exomiser root folder +exomiser --analysis ${exomiser_solo} +exomiser --analysis ${exomiser_trio} +exomiser --analysis ${exomiser_solo_sv} \ No newline at end of file diff --git a/templates/template_trio.ped b/templates/template_trio.ped new file mode 100644 index 0000000..7b162bd --- /dev/null +++ b/templates/template_trio.ped @@ -0,0 +1,10 @@ +#PED format pedigree +# +#fam-id/ind-id/pat-id/mat-id: 0=unknown +#sex: 1=male; 2=female; 0=unknown +#phenotype: -9=missing, 0=missing; 1=unaffected; 2=affected +# +#fam-id ind-id pat-id mat-id sex phen +1 child father mother {{proband_gender}} {{proband_phen}} +1 father 0 0 1 {{father_phen}} +1 mother 0 0 2 {{mother_phen}}